[PATCH 3/7] clang-format: Apply clang-format

author Johannes Demel <demel@uni-bremen.de>

Sun, 23 Feb 2020 14:03:47 +0000 (15:03 +0100)

committer A. Maitland Bottoms <bottoms@debian.org>

Sat, 28 Mar 2020 01:48:10 +0000 (01:48 +0000)
author Johannes Demel <demel@uni-bremen.de>
Sun, 23 Feb 2020 14:03:47 +0000 (15:03 +0100)
committer A. Maitland Bottoms <bottoms@debian.org>
Sat, 28 Mar 2020 01:48:10 +0000 (01:48 +0000)
diff --git a/.clang-format b/.clang-format

new file mode 100644 (file)

index 0000000..285b68d
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,106 @@
+---
+Language: Cpp
+# BasedOnStyle: LLVM
+AccessModifierOffset: -4
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlinesLeft: true
+AlignOperands:   true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: true
+BinPackArguments: false
+BinPackParameters: false
+BreakBeforeBraces: Custom
+BraceWrapping:
+  AfterClass:      true
+  AfterControlStatement: false
+  AfterEnum:       false
+  AfterFunction:   true
+  AfterNamespace:  false
+  AfterObjCDeclaration: false
+  AfterStruct:     false
+  AfterUnion:      false
+  BeforeCatch:     false
+  BeforeElse:      false
+  IndentBraces:    false
+BreakBeforeBinaryOperators: None
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: true
+ColumnLimit:     90
+CommentPragmas:  '^ IWYU pragma:'
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: false
+DerivePointerAlignment: false
+DisableFormat:   false
+ExperimentalAutoDetectBinPacking: false
+ForEachMacros:
+  - foreach
+  - Q_FOREACH
+  - BOOST_FOREACH
+IncludeCategories:
+  - Regex:           '^"(gnuradio)/'
+    Priority:        1
+  - Regex:           '^<(gnuradio)/'
+    Priority:        2
+  - Regex:           '^<(boost)/'
+    Priority:        98
+  - Regex:           '^<[a-z]*>$'
+    Priority:        99
+  - Regex:           '^".*"$'
+    Priority:        0
+  - Regex:           '.*'
+    Priority:        10
+
+IncludeIsMainRegex: '(Test)?$'
+IndentCaseLabels: false
+IndentWidth: 4
+IndentWrappedFunctionNames: false
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: true
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 2
+NamespaceIndentation: None
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PenaltyBreakBeforeFirstCallParameter: 19
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 60
+PointerAlignment: Left
+ReflowComments:  true
+SortIncludes:    true
+SpaceAfterCStyleCast: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeParens: ControlStatements
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles:  false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard:        Cpp11
+TabWidth:        8
+UseTab:          Never
+
+
diff --git a/apps/volk-config-info.cc b/apps/volk-config-info.cc

index 4eedcb7b6cdb190c7d3610e37285f2eac80d047d..25219939f6dcc70056407dfc35eb457b2c090878 100644 (file)
--- a/apps/volk-config-info.cc
+++ b/apps/volk-config-info.cc
@@ -24,52 +24,63 @@
  #include <config.h>
  #endif
  
-#include <volk/constants.h>       // for volk_available_machines, volk_c_com...
-#include <iostream>               // for operator<<, endl, cout, ostream
-#include <string>                 // for string
+#include <volk/constants.h> // for volk_available_machines, volk_c_com...
+#include <iostream>         // for operator<<, endl, cout, ostream
+#include <string>           // for string
  
-#include "volk/volk.h"            // for volk_get_alignment, volk_get_machine
-#include "volk_option_helpers.h"  // for option_list, option_t
+#include "volk/volk.h"           // for volk_get_alignment, volk_get_machine
+#include "volk_option_helpers.h" // for option_list, option_t
  
  void print_alignment()
  {
-  std::cout << "Alignment in bytes: " << volk_get_alignment() << std::endl;
+    std::cout << "Alignment in bytes: " << volk_get_alignment() << std::endl;
  }
  
  void print_malloc()
  {
-  // You don't want to change the volk_malloc code, so just copy the if/else
-  // structure from there and give an explanation for the implementations
-  std::cout << "Used malloc implementation: ";
-  #if HAVE_POSIX_MEMALIGN
-  std::cout << "posix_memalign" << std::endl;
-  #elif defined(_MSC_VER)
-  std::cout << "_aligned_malloc" << std::endl;
-  #else
-  std::cout << "C11 aligned_alloc" << std::endl;
-  #endif
+    // You don't want to change the volk_malloc code, so just copy the if/else
+    // structure from there and give an explanation for the implementations
+    std::cout << "Used malloc implementation: ";
+#if HAVE_POSIX_MEMALIGN
+    std::cout << "posix_memalign" << std::endl;
+#elif defined(_MSC_VER)
+    std::cout << "_aligned_malloc" << std::endl;
+#else
+    std::cout << "C11 aligned_alloc" << std::endl;
+#endif
  }
  
  
-int
-main(int argc, char **argv)
+int main(int argc, char** argv)
  {
  
-  option_list our_options("volk-config-info");
-  our_options.add(option_t("prefix", "", "print the VOLK installation prefix", volk_prefix()));
-  our_options.add(option_t("cc", "", "print the VOLK C compiler version", volk_c_compiler()));
-  our_options.add(option_t("cflags", "", "print the VOLK CFLAGS", volk_compiler_flags()));
-  our_options.add(option_t("all-machines", "", "print VOLK machines built", volk_available_machines()));
-  our_options.add(option_t("avail-machines", "", "print VOLK machines on the current "
-      "platform", volk_list_machines));
-  our_options.add(option_t("machine", "", "print the current VOLK machine that will be used",
-                           volk_get_machine()));
-  our_options.add(option_t("alignment", "", "print the memory alignment", print_alignment));
-  our_options.add(option_t("malloc", "", "print the malloc implementation used in volk_malloc",
-                           print_malloc));
-  our_options.add(option_t("version", "v", "print the VOLK version", volk_version()));
+    option_list our_options("volk-config-info");
+    our_options.add(
+        option_t("prefix", "", "print the VOLK installation prefix", volk_prefix()));
+    our_options.add(
+        option_t("cc", "", "print the VOLK C compiler version", volk_c_compiler()));
+    our_options.add(
+        option_t("cflags", "", "print the VOLK CFLAGS", volk_compiler_flags()));
+    our_options.add(option_t(
+        "all-machines", "", "print VOLK machines built", volk_available_machines()));
+    our_options.add(option_t("avail-machines",
+                             "",
+                             "print VOLK machines on the current "
+                             "platform",
+                             volk_list_machines));
+    our_options.add(option_t("machine",
+                             "",
+                             "print the current VOLK machine that will be used",
+                             volk_get_machine()));
+    our_options.add(
+        option_t("alignment", "", "print the memory alignment", print_alignment));
+    our_options.add(option_t("malloc",
+                             "",
+                             "print the malloc implementation used in volk_malloc",
+                             print_malloc));
+    our_options.add(option_t("version", "v", "print the VOLK version", volk_version()));
  
-  our_options.parse(argc, argv);
+    our_options.parse(argc, argv);
  
-  return 0;
+    return 0;
  }
diff --git a/apps/volk_option_helpers.cc b/apps/volk_option_helpers.cc

index 4299709dbda76f762a566ec6a84281010a868545..73d51da207698efb28b01fe952d98177db5ec40a 100644 (file)
--- a/apps/volk_option_helpers.cc
+++ b/apps/volk_option_helpers.cc
@@ -4,66 +4,97 @@
  
  #include "volk_option_helpers.h"
  
-#include <exception>  // for exception
-#include <iostream>   // for operator<<, endl, basic_ostream, cout, ostream
-#include <utility>    // for pair
-#include <limits.h>   // IWYU pragma: keep
-#include <cstring>    // IWYU pragma: keep
-#include <cstdlib>      // IWYU pragma: keep
+#include <limits.h>  // IWYU pragma: keep
+#include <cstdlib>   // IWYU pragma: keep
+#include <cstring>   // IWYU pragma: keep
+#include <exception> // for exception
+#include <iostream>  // for operator<<, endl, basic_ostream, cout, ostream
+#include <utility>   // for pair
  
  /*
   * Option type
   */
-option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)())
-        : longform("--" + longform),
-          shortform("-" + shortform),
-          msg(msg),
-          callback(callback) { option_type = VOID_CALLBACK; }
-
-option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(int))
-        : longform("--" + longform),
-          shortform("-" + shortform),
-          msg(msg),
-          callback((void (*)()) callback) { option_type = INT_CALLBACK; }
-
-option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(float))
-        : longform("--" + longform),
-          shortform("-" + shortform),
-          msg(msg),
-          callback((void (*)()) callback) { option_type = FLOAT_CALLBACK; }
-
-option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(bool))
-        : longform("--" + longform),
-          shortform("-" + shortform),
-          msg(msg),
-          callback((void (*)()) callback) { option_type = BOOL_CALLBACK; }
-
-option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(std::string))
-        : longform("--" + longform),
-          shortform("-" + shortform),
-          msg(msg),
-          callback((void (*)()) callback) { option_type = STRING_CALLBACK; }
-
-option_t::option_t(std::string longform, std::string shortform, std::string msg, std::string printval)
-        : longform("--" + longform),
-          shortform("-" + shortform),
-          msg(msg),
-          printval(printval) { option_type = STRING; }
+option_t::option_t(std::string longform,
+                   std::string shortform,
+                   std::string msg,
+                   void (*callback)())
+    : longform("--" + longform), shortform("-" + shortform), msg(msg), callback(callback)
+{
+    option_type = VOID_CALLBACK;
+}
+
+option_t::option_t(std::string longform,
+                   std::string shortform,
+                   std::string msg,
+                   void (*callback)(int))
+    : longform("--" + longform),
+      shortform("-" + shortform),
+      msg(msg),
+      callback((void (*)())callback)
+{
+    option_type = INT_CALLBACK;
+}
+
+option_t::option_t(std::string longform,
+                   std::string shortform,
+                   std::string msg,
+                   void (*callback)(float))
+    : longform("--" + longform),
+      shortform("-" + shortform),
+      msg(msg),
+      callback((void (*)())callback)
+{
+    option_type = FLOAT_CALLBACK;
+}
+
+option_t::option_t(std::string longform,
+                   std::string shortform,
+                   std::string msg,
+                   void (*callback)(bool))
+    : longform("--" + longform),
+      shortform("-" + shortform),
+      msg(msg),
+      callback((void (*)())callback)
+{
+    option_type = BOOL_CALLBACK;
+}
+
+option_t::option_t(std::string longform,
+                   std::string shortform,
+                   std::string msg,
+                   void (*callback)(std::string))
+    : longform("--" + longform),
+      shortform("-" + shortform),
+      msg(msg),
+      callback((void (*)())callback)
+{
+    option_type = STRING_CALLBACK;
+}
+
+option_t::option_t(std::string longform,
+                   std::string shortform,
+                   std::string msg,
+                   std::string printval)
+    : longform("--" + longform), shortform("-" + shortform), msg(msg), printval(printval)
+{
+    option_type = STRING;
+}
  
  
  /*
   * Option List
   */
  
-option_list::option_list(std::string program_name) :
-        program_name(program_name) {
+option_list::option_list(std::string program_name) : program_name(program_name)
+{
      internal_list = std::vector<option_t>();
  }
  
  
  void option_list::add(option_t opt) { internal_list.push_back(opt); }
  
-void option_list::parse(int argc, char **argv) {
+void option_list::parse(int argc, char** argv)
+{
      for (int arg_number = 0; arg_number < argc; ++arg_number) {
          for (std::vector<option_t>::iterator this_option = internal_list.begin();
               this_option != internal_list.end();
@@ -73,74 +104,83 @@ void option_list::parse(int argc, char **argv) {
                  this_option->shortform == std::string(argv[arg_number])) {
  
                  if (present_options.count(this_option->longform) == 0) {
-                    present_options.insert(std::pair<std::string, int>(this_option->longform, 1));
+                    present_options.insert(
+                        std::pair<std::string, int>(this_option->longform, 1));
                  } else {
                      present_options[this_option->longform] += 1;
                  }
                  switch (this_option->option_type) {
-                    case VOID_CALLBACK:
-                        this_option->callback();
-                        break;
-                    case INT_CALLBACK:
-                        try {
-                            int_val = atoi(argv[++arg_number]);
-                            ((void (*)(int)) this_option->callback)(int_val);
-                        } catch (std::exception &exc) {
-                            std::cout << "An int option can only receive a number" << std::endl;
-                            throw std::exception();
-                        };
-                        break;
-                    case FLOAT_CALLBACK:
-                        try {
-                            double double_val = atof(argv[++arg_number]);
-                            ((void (*)(float)) this_option->callback)(double_val);
-                        } catch (std::exception &exc) {
-                            std::cout << "A float option can only receive a number" << std::endl;
-                            throw std::exception();
-                        };
-                        break;
-                    case BOOL_CALLBACK:
-                        try {
-                            if (arg_number == (argc - 1)) { // this is the last arg
+                case VOID_CALLBACK:
+                    this_option->callback();
+                    break;
+                case INT_CALLBACK:
+                    try {
+                        int_val = atoi(argv[++arg_number]);
+                        ((void (*)(int))this_option->callback)(int_val);
+                    } catch (std::exception& exc) {
+                        std::cout << "An int option can only receive a number"
+                                  << std::endl;
+                        throw std::exception();
+                    };
+                    break;
+                case FLOAT_CALLBACK:
+                    try {
+                        double double_val = atof(argv[++arg_number]);
+                        ((void (*)(float))this_option->callback)(double_val);
+                    } catch (std::exception& exc) {
+                        std::cout << "A float option can only receive a number"
+                                  << std::endl;
+                        throw std::exception();
+                    };
+                    break;
+                case BOOL_CALLBACK:
+                    try {
+                        if (arg_number == (argc - 1)) { // this is the last arg
+                            int_val = 1;
+                        } else { // sneak a look at the next arg since it's present
+                            char* next_arg = argv[arg_number + 1];
+                            if ((strncmp(next_arg, "-", 1) == 0) ||
+                                (strncmp(next_arg, "--", 2) == 0)) {
+                                // the next arg is actually an arg, the bool is just
+                                // present, set to true
+                                int_val = 1;
+                            } else if (strncmp(next_arg, "true", 4) == 0) {
                                  int_val = 1;
-                            } else { // sneak a look at the next arg since it's present
-                                char *next_arg = argv[arg_number + 1];
-                                if ((strncmp(next_arg, "-", 1) == 0) || (strncmp(next_arg, "--", 2) == 0)) {
-                                    // the next arg is actually an arg, the bool is just present, set to true
-                                    int_val = 1;
-                                } else if (strncmp(next_arg, "true", 4) == 0) {
-                                    int_val = 1;
-                                } else if (strncmp(next_arg, "false", 5) == 0) {
-                                    int_val = 0;
-                                } else {
-                                    // we got a number or a string.
-                                    // convert it to a number and depend on the catch to report an error condition
-                                    int_val = (bool) atoi(argv[++arg_number]);
-                                }
+                            } else if (strncmp(next_arg, "false", 5) == 0) {
+                                int_val = 0;
+                            } else {
+                                // we got a number or a string.
+                                // convert it to a number and depend on the catch to
+                                // report an error condition
+                                int_val = (bool)atoi(argv[++arg_number]);
                              }
-                        } catch (std::exception &e) {
-                            int_val = INT_MIN;
-                        };
-                        if (int_val == INT_MIN) {
-                            std::cout << "option: '" << argv[arg_number - 1] << "' -> received an unknown value. Boolean "
-                                    "options should receive one of '0', '1', 'true', 'false'." << std::endl;
-                            throw std::exception();
-                        } else if (int_val) {
-                            ((void (*)(bool)) this_option->callback)(int_val);
                          }
-                        break;
-                    case STRING_CALLBACK:
-                        try {
-                            ((void (*)(std::string)) this_option->callback)(argv[++arg_number]);
-                        } catch (std::exception &exc) {
-                            throw std::exception();
-                        };
-                    case STRING:
-                        std::cout << this_option->printval << std::endl;
-                        break;
+                    } catch (std::exception& e) {
+                        int_val = INT_MIN;
+                    };
+                    if (int_val == INT_MIN) {
+                        std::cout
+                            << "option: '" << argv[arg_number - 1]
+                            << "' -> received an unknown value. Boolean "
+                               "options should receive one of '0', '1', 'true', 'false'."
+                            << std::endl;
+                        throw std::exception();
+                    } else if (int_val) {
+                        ((void (*)(bool))this_option->callback)(int_val);
+                    }
+                    break;
+                case STRING_CALLBACK:
+                    try {
+                        ((void (*)(std::string))this_option->callback)(
+                            argv[++arg_number]);
+                    } catch (std::exception& exc) {
+                        throw std::exception();
+                    };
+                case STRING:
+                    std::cout << this_option->printval << std::endl;
+                    break;
                  }
              }
-
          }
          if (std::string("--help") == std::string(argv[arg_number]) ||
              std::string("-h") == std::string(argv[arg_number])) {
@@ -150,7 +190,8 @@ void option_list::parse(int argc, char **argv) {
      }
  }
  
-bool option_list::present(std::string option_name) {
+bool option_list::present(std::string option_name)
+{
      if (present_options.count("--" + option_name)) {
          return true;
      } else {
@@ -158,7 +199,8 @@ bool option_list::present(std::string option_name) {
      }
  }
  
-void option_list::help() {
+void option_list::help()
+{
      std::cout << program_name << std::endl;
      std::cout << "  -h [ --help ] \t\tdisplay this help message" << std::endl;
      for (std::vector<option_t>::iterator this_option = internal_list.begin();
@@ -172,14 +214,14 @@ void option_list::help() {
          }
  
          switch (help_line.size() / 8) {
-            case 0:
-                help_line += "\t";
-            case 1:
-                help_line += "\t";
-            case 2:
-                help_line += "\t";
-            case 3:
-                help_line += "\t";
+        case 0:
+            help_line += "\t";
+        case 1:
+            help_line += "\t";
+        case 2:
+            help_line += "\t";
+        case 3:
+            help_line += "\t";
          }
          help_line += this_option->msg;
          std::cout << help_line << std::endl;
diff --git a/apps/volk_option_helpers.h b/apps/volk_option_helpers.h

index 8a715476bb22acc8aa32d99980aae80cd5836d09..0756cafdcfd457c8f5db0a5a6b7e0cf866d688e1 100644 (file)
--- a/apps/volk_option_helpers.h
+++ b/apps/volk_option_helpers.h
@@ -5,56 +5,74 @@
  #ifndef VOLK_VOLK_OPTION_HELPERS_H
  #define VOLK_VOLK_OPTION_HELPERS_H
  
-#include <string>
-#include <cstring>
  #include <limits.h>
-#include <vector>
+#include <cstring>
  #include <map>
+#include <string>
+#include <vector>
  
-typedef enum
-{
-  VOID_CALLBACK,
+typedef enum {
+    VOID_CALLBACK,
      INT_CALLBACK,
      BOOL_CALLBACK,
      STRING_CALLBACK,
      FLOAT_CALLBACK,
-  STRING,
+    STRING,
  } VOLK_OPTYPE;
  
-class option_t {
-  public:
-  option_t(std::string longform, std::string shortform, std::string msg, void (*callback)());
-    option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(int));
-    option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(float));
-    option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(bool));
-    option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(std::string));
-  option_t(std::string longform, std::string shortform, std::string msg, std::string printval);
-
-  std::string longform;
-  std::string shortform;
-  std::string msg;
-  VOLK_OPTYPE option_type;
-  std::string printval;
-  void (*callback)();
+class option_t
+{
+public:
+    option_t(std::string longform,
+             std::string shortform,
+             std::string msg,
+             void (*callback)());
+    option_t(std::string longform,
+             std::string shortform,
+             std::string msg,
+             void (*callback)(int));
+    option_t(std::string longform,
+             std::string shortform,
+             std::string msg,
+             void (*callback)(float));
+    option_t(std::string longform,
+             std::string shortform,
+             std::string msg,
+             void (*callback)(bool));
+    option_t(std::string longform,
+             std::string shortform,
+             std::string msg,
+             void (*callback)(std::string));
+    option_t(std::string longform,
+             std::string shortform,
+             std::string msg,
+             std::string printval);
  
+    std::string longform;
+    std::string shortform;
+    std::string msg;
+    VOLK_OPTYPE option_type;
+    std::string printval;
+    void (*callback)();
  };
  
  class option_list
  {
-  public:
-  option_list(std::string program_name);
-  bool present(std::string option_name);
+public:
+    option_list(std::string program_name);
+    bool present(std::string option_name);
+
+    void add(option_t opt);
  
-  void add(option_t opt);
+    void parse(int argc, char** argv);
  
-  void parse(int argc, char **argv);
+    void help();
  
-  void help();
-  private:
-  std::string program_name;
-  std::vector<option_t> internal_list;
-  std::map<std::string, int> present_options;
+private:
+    std::string program_name;
+    std::vector<option_t> internal_list;
+    std::map<std::string, int> present_options;
  };
  
  
-#endif //VOLK_VOLK_OPTION_HELPERS_H
+#endif // VOLK_VOLK_OPTION_HELPERS_H
diff --git a/apps/volk_profile.cc b/apps/volk_profile.cc

index 4ef5aeb71ce2b50c383211663bfe6c5a112650ea..3c2e324859995ff0525ff1f4b05d22b153ae0305 100644 (file)
--- a/apps/volk_profile.cc
+++ b/apps/volk_profile.cc
@@ -27,23 +27,23 @@
  #include <filesystem>
  #endif
  #else
-#include <boost/filesystem/operations.hpp>   // for create_directories, exists
-#include <boost/filesystem/path.hpp>         // for path, operator<<
-#include <boost/filesystem/path_traits.hpp>  // for filesystem
+#include <boost/filesystem/operations.hpp>  // for create_directories, exists
+#include <boost/filesystem/path.hpp>        // for path, operator<<
+#include <boost/filesystem/path_traits.hpp> // for filesystem
  #endif
-#include <stddef.h>                          // for size_t
-#include <sys/stat.h>                        // for stat
-#include <volk/volk_prefs.h>                 // for volk_get_config_path
-#include <iostream>                          // for operator<<, basic_ostream
-#include <fstream>                           // IWYU pragma: keep
-#include <map>                               // for map, map<>::iterator
-#include <utility>                           // for pair
-#include <vector>                            // for vector, vector<>::const_...
-
-#include "kernel_tests.h"                    // for init_test_list
-#include "qa_utils.h"                        // for volk_test_results_t, vol...
-#include "volk/volk_complex.h"               // for lv_32fc_t
-#include "volk_option_helpers.h"             // for option_list, option_t
+#include <stddef.h>          // for size_t
+#include <sys/stat.h>        // for stat
+#include <volk/volk_prefs.h> // for volk_get_config_path
+#include <fstream>           // IWYU pragma: keep
+#include <iostream>          // for operator<<, basic_ostream
+#include <map>               // for map, map<>::iterator
+#include <utility>           // for pair
+#include <vector>            // for vector, vector<>::const_...
+
+#include "kernel_tests.h"        // for init_test_list
+#include "qa_utils.h"            // for volk_test_results_t, vol...
+#include "volk/volk_complex.h"   // for lv_32fc_t
+#include "volk_option_helpers.h" // for option_list, option_t
  #include "volk_profile.h"
  
  #if HAS_STD_FILESYSTEM
@@ -72,45 +72,61 @@ void set_json(std::string val) { json_filename = val; }
  std::string volk_config_path("");
  void set_volk_config(std::string val) { volk_config_path = val; }
  
-int main(int argc, char *argv[]) {
+int main(int argc, char* argv[])
+{
  
      option_list profile_options("volk_profile");
-    profile_options.add(option_t("benchmark", "b", "Run all kernels (benchmark mode)", set_benchmark));
-    profile_options.add(option_t("tol", "t", "Set the default tolerance for all tests", set_tolerance));
-    profile_options.add(option_t("vlen", "v", "Set the default vector length for tests", set_vlen));
-    profile_options.add((option_t("iter", "i", "Set the default number of test iterations per kernel", set_iter)));
-    profile_options.add((option_t("tests-substr", "R", "Run tests matching substring", set_substr)));
-    profile_options.add((option_t("update", "u", "Run only kernels missing from config", set_update)));
-    profile_options.add((option_t("dry-run", "n", "Dry run. Respect other options, but don't write to file", set_dryrun)));
-    profile_options.add((option_t("json", "j", "Write results to JSON file named as argument value", set_json)));
-    profile_options.add((option_t("path", "p", "Specify the volk_config path", set_volk_config)));
+    profile_options.add(
+        option_t("benchmark", "b", "Run all kernels (benchmark mode)", set_benchmark));
+    profile_options.add(
+        option_t("tol", "t", "Set the default tolerance for all tests", set_tolerance));
+    profile_options.add(
+        option_t("vlen", "v", "Set the default vector length for tests", set_vlen));
+    profile_options.add((option_t(
+        "iter", "i", "Set the default number of test iterations per kernel", set_iter)));
+    profile_options.add(
+        (option_t("tests-substr", "R", "Run tests matching substring", set_substr)));
+    profile_options.add(
+        (option_t("update", "u", "Run only kernels missing from config", set_update)));
+    profile_options.add(
+        (option_t("dry-run",
+                  "n",
+                  "Dry run. Respect other options, but don't write to file",
+                  set_dryrun)));
+    profile_options.add((option_t(
+        "json", "j", "Write results to JSON file named as argument value", set_json)));
+    profile_options.add(
+        (option_t("path", "p", "Specify the volk_config path", set_volk_config)));
      profile_options.parse(argc, argv);
  
      if (profile_options.present("help")) {
          return 0;
      }
  
-    if(dry_run) {
-        std::cout << "Warning: this IS a dry-run. Config will not be written!" << std::endl;
+    if (dry_run) {
+        std::cout << "Warning: this IS a dry-run. Config will not be written!"
+                  << std::endl;
      }
  
      // Adding program options
      std::ofstream json_file;
      std::string config_file;
  
-    if ( json_filename != "" ) {
-        json_file.open( json_filename.c_str() );
+    if (json_filename != "") {
+        json_file.open(json_filename.c_str());
      }
  
-    if ( volk_config_path != "" ) {
+    if (volk_config_path != "") {
          config_file = volk_config_path + "/volk_config";
      }
  
      // Run tests
      std::vector<volk_test_results_t> results;
-    if(update_mode) {
-        if( config_file != "" ) read_results(&results, config_file);
-        else read_results(&results);
+    if (update_mode) {
+        if (config_file != "")
+            read_results(&results, config_file);
+        else
+            read_results(&results);
      }
  
      // Initialize the list of tests
@@ -118,22 +134,22 @@ int main(int argc, char *argv[]) {
  
      // Iterate through list of tests running each one
      std::string substr_to_match(test_params.kernel_regex());
-    for(unsigned int ii = 0; ii < test_cases.size(); ++ii) {
+    for (unsigned int ii = 0; ii < test_cases.size(); ++ii) {
          bool regex_match = true;
  
          volk_test_case_t test_case = test_cases[ii];
          // if the kernel name matches regex then do the test
          std::string test_case_name = test_case.name();
-        if(test_case_name.find(substr_to_match) == std::string::npos) {
+        if (test_case_name.find(substr_to_match) == std::string::npos) {
              regex_match = false;
          }
  
          // if we are in update mode check if we've already got results
          // if we have any, then no need to test that kernel
          bool update = true;
-        if(update_mode) {
-            for(unsigned int jj=0; jj < results.size(); ++jj) {
-                if(results[jj].name == test_case.name() ||
+        if (update_mode) {
+            for (unsigned int jj = 0; jj < results.size(); ++jj) {
+                if (results[jj].name == test_case.name() ||
                      results[jj].name == test_case.puppet_master_name()) {
                      update = false;
                      break;
@@ -141,39 +157,44 @@ int main(int argc, char *argv[]) {
              }
          }
  
-        if( regex_match && update ) {
+        if (regex_match && update) {
              try {
-            run_volk_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(),
-                test_case.test_parameters(), &results, test_case.puppet_master_name());
-            }
-            catch (std::string &error) {
-                std::cerr << "Caught Exception in 'run_volk_tests': " << error << std::endl;
+                run_volk_tests(test_case.desc(),
+                               test_case.kernel_ptr(),
+                               test_case.name(),
+                               test_case.test_parameters(),
+                               &results,
+                               test_case.puppet_master_name());
+            } catch (std::string& error) {
+                std::cerr << "Caught Exception in 'run_volk_tests': " << error
+                          << std::endl;
              }
          }
      }
  
  
      // Output results according to provided options
-    if(json_filename != "") {
+    if (json_filename != "") {
          write_json(json_file, results);
          json_file.close();
      }
  
-    if(!dry_run) {
-        if(config_file != "") write_results(&results, false, config_file);
-        else write_results(&results, false);
-    }
-    else {
+    if (!dry_run) {
+        if (config_file != "")
+            write_results(&results, false, config_file);
+        else
+            write_results(&results, false);
+    } else {
          std::cout << "Warning: this was a dry-run. Config not generated" << std::endl;
      }
      return 0;
  }
  
-void read_results(std::vector<volk_test_results_t> *results)
+void read_results(std::vector<volk_test_results_t>* results)
  {
      char path[1024];
      volk_get_config_path(path, true);
-    if(path[0] == 0){
+    if (path[0] == 0) {
          std::cout << "No prior test results found ..." << std::endl;
          return;
      }
@@ -181,16 +202,16 @@ void read_results(std::vector<volk_test_results_t> *results)
      read_results(results, std::string(path));
  }
  
-void read_results(std::vector<volk_test_results_t> *results, std::string path)
+void read_results(std::vector<volk_test_results_t>* results, std::string path)
  {
      struct stat buffer;
-    bool config_status = (stat (path.c_str(), &buffer) == 0);
+    bool config_status = (stat(path.c_str(), &buffer) == 0);
  
-    if( config_status ) {
+    if (config_status) {
          // a config exists and we are reading results from it
          std::ifstream config(path.c_str());
          char config_line[256];
-        while(config.getline(config_line, 255)) {
+        while (config.getline(config_line, 255)) {
              // tokenize the input line by kernel_name unaligned aligned
              // then push back in the results vector with fields filled in
  
@@ -198,26 +219,26 @@ void read_results(std::vector<volk_test_results_t> *results, std::string path)
              std::string config_str(config_line);
              std::size_t str_size = config_str.size();
              std::size_t found = config_str.find(' ');
-            
+
              // Split line by spaces
-            while(found && found < str_size) {
+            while (found && found < str_size) {
                  found = config_str.find(' ');
                  // kernel names MUST be less than 128 chars, which is
                  // a length restricted by volk/volk_prefs.c
                  // on the last token in the parsed string we won't find a space
                  // so make sure we copy at most 128 chars.
-                if(found > 127) {
+                if (found > 127) {
                      found = 127;
                  }
                  str_size = config_str.size();
-                char buffer[128] = {'\0'};
+                char buffer[128] = { '\0' };
                  config_str.copy(buffer, found + 1, 0);
                  buffer[found] = '\0';
                  single_kernel_result.push_back(std::string(buffer));
-                config_str.erase(0, found+1);
+                config_str.erase(0, found + 1);
              }
  
-            if(single_kernel_result.size() == 3) {
+            if (single_kernel_result.size() == 3) {
                  volk_test_results_t kernel_result;
                  kernel_result.name = std::string(single_kernel_result[0]);
                  kernel_result.config_name = std::string(single_kernel_result[0]);
@@ -229,45 +250,47 @@ void read_results(std::vector<volk_test_results_t> *results, std::string path)
      }
  }
  
-void write_results(const std::vector<volk_test_results_t> *results, bool update_result)
+void write_results(const std::vector<volk_test_results_t>* results, bool update_result)
  {
      char path[1024];
      volk_get_config_path(path, false);
-    if(path[0] == 0){
+    if (path[0] == 0) {
          std::cout << "Aborting 'No config save path found' ..." << std::endl;
          return;
      }
  
-    write_results( results, update_result, std::string(path));
+    write_results(results, update_result, std::string(path));
  }
  
-void write_results(const std::vector<volk_test_results_t> *results, bool update_result, const std::string path)
+void write_results(const std::vector<volk_test_results_t>* results,
+                   bool update_result,
+                   const std::string path)
  {
-//    struct stat buffer;
-//    bool config_status = (stat (path.c_str(), &buffer) == 0);
+    //    struct stat buffer;
+    //    bool config_status = (stat (path.c_str(), &buffer) == 0);
  
      /*
       * These
       */
      const fs::path config_path(path);
-    if (! fs::exists(config_path.parent_path()))
-    {
+    if (!fs::exists(config_path.parent_path())) {
          std::cout << "Creating " << config_path.parent_path() << "..." << std::endl;
          fs::create_directories(config_path.parent_path());
      }
  
      std::ofstream config;
-    if(update_result) {
+    if (update_result) {
          std::cout << "Updating " << path << "..." << std::endl;
          config.open(path.c_str(), std::ofstream::app);
-        if (!config.is_open()) { //either we don't have write access or we don't have the dir yet
+        if (!config.is_open()) { // either we don't have write access or we don't have the
+                                 // dir yet
              std::cout << "Error opening file " << path << std::endl;
          }
-    }
-    else {
+    } else {
          std::cout << "Writing " << path << "..." << std::endl;
          config.open(path.c_str());
-        if (!config.is_open()) { //either we don't have write access or we don't have the dir yet
+        if (!config.is_open()) { // either we don't have write access or we don't have the
+                                 // dir yet
              std::cout << "Error opening file " << path << std::endl;
          }
  
@@ -278,43 +301,45 @@ void write_results(const std::vector<volk_test_results_t> *results, bool update_
      }
  
      std::vector<volk_test_results_t>::const_iterator profile_results;
-    for(profile_results = results->begin(); profile_results != results->end(); ++profile_results) {
-        config << profile_results->config_name << " "
-            << profile_results->best_arch_a << " "
-            << profile_results->best_arch_u << std::endl;
+    for (profile_results = results->begin(); profile_results != results->end();
+         ++profile_results) {
+        config << profile_results->config_name << " " << profile_results->best_arch_a
+               << " " << profile_results->best_arch_u << std::endl;
      }
      config.close();
  }
  
-void write_json(std::ofstream &json_file, std::vector<volk_test_results_t> results)
+void write_json(std::ofstream& json_file, std::vector<volk_test_results_t> results)
  {
      json_file << "{" << std::endl;
      json_file << " \"volk_tests\": [" << std::endl;
      size_t len = results.size();
      size_t i = 0;
      std::vector<volk_test_results_t>::iterator result;
-    for(result = results.begin(); result != results.end(); ++result) {
+    for (result = results.begin(); result != results.end(); ++result) {
          json_file << "  {" << std::endl;
          json_file << "   \"name\": \"" << result->name << "\"," << std::endl;
          json_file << "   \"vlen\": " << (int)(result->vlen) << "," << std::endl;
          json_file << "   \"iter\": " << result->iter << "," << std::endl;
-        json_file << "   \"best_arch_a\": \"" << result->best_arch_a
-            << "\"," << std::endl;
-        json_file << "   \"best_arch_u\": \"" << result->best_arch_u
-            << "\"," << std::endl;
+        json_file << "   \"best_arch_a\": \"" << result->best_arch_a << "\","
+                  << std::endl;
+        json_file << "   \"best_arch_u\": \"" << result->best_arch_u << "\","
+                  << std::endl;
          json_file << "   \"results\": {" << std::endl;
          size_t results_len = result->results.size();
          size_t ri = 0;
  
          std::map<std::string, volk_test_time_t>::iterator kernel_time_pair;
-        for(kernel_time_pair = result->results.begin(); kernel_time_pair != result->results.end(); ++kernel_time_pair) {
+        for (kernel_time_pair = result->results.begin();
+             kernel_time_pair != result->results.end();
+             ++kernel_time_pair) {
              volk_test_time_t time = kernel_time_pair->second;
              json_file << "    \"" << time.name << "\": {" << std::endl;
              json_file << "     \"name\": \"" << time.name << "\"," << std::endl;
              json_file << "     \"time\": " << time.time << "," << std::endl;
              json_file << "     \"units\": \"" << time.units << "\"" << std::endl;
-            json_file << "    }" ;
-            if(ri+1 != results_len) {
+            json_file << "    }";
+            if (ri + 1 != results_len) {
                  json_file << ",";
              }
              json_file << std::endl;
@@ -322,7 +347,7 @@ void write_json(std::ofstream &json_file, std::vector<volk_test_results_t> resul
          }
          json_file << "   }" << std::endl;
          json_file << "  }";
-        if(i+1 != len) {
+        if (i + 1 != len) {
              json_file << ",";
          }
          json_file << std::endl;
diff --git a/apps/volk_profile.h b/apps/volk_profile.h

index 51629ab6022b348b0254f8fbbc8a032ff83eb058..ae3b474166bd48499f80cd9f6fc95d36c5671bbf 100644 (file)
--- a/apps/volk_profile.h
+++ b/apps/volk_profile.h
@@ -1,14 +1,16 @@
  
  
-#include <stdbool.h>  // for bool
-#include <iosfwd>     // for ofstream
-#include <string>     // for string
-#include <vector>     // for vector
+#include <stdbool.h> // for bool
+#include <iosfwd>    // for ofstream
+#include <string>    // for string
+#include <vector>    // for vector
  
  class volk_test_results_t;
  
-void read_results(std::vector<volk_test_results_t> *results);
-void read_results(std::vector<volk_test_results_t> *results, std::string path);
-void write_results(const std::vector<volk_test_results_t> *results, bool update_result);
-void write_results(const std::vector<volk_test_results_t> *results, bool update_result, const std::string path);
-void write_json(std::ofstream &json_file, std::vector<volk_test_results_t> results);
+void read_results(std::vector<volk_test_results_t>* results);
+void read_results(std::vector<volk_test_results_t>* results, std::string path);
+void write_results(const std::vector<volk_test_results_t>* results, bool update_result);
+void write_results(const std::vector<volk_test_results_t>* results,
+                   bool update_result,
+                   const std::string path);
+void write_json(std::ofstream& json_file, std::vector<volk_test_results_t> results);
diff --git a/cmake/msvc/config.h b/cmake/msvc/config.h

index 8b12c2a47be844675afbdf99c55f6fd0ebb7bba9..68f716e8a6b14bd9f8b53b1392dff39ec65ef427 100644 (file)
--- a/cmake/msvc/config.h
+++ b/cmake/msvc/config.h
@@ -9,7 +9,7 @@
  // enable inline functions for C code
  ////////////////////////////////////////////////////////////////////////
  #ifndef __cplusplus
-#  define inline __inline
+#define inline __inline
  #endif
  
  ////////////////////////////////////////////////////////////////////////
@@ -23,12 +23,21 @@ typedef ptrdiff_t ssize_t;
  ////////////////////////////////////////////////////////////////////////
  #if _MSC_VER < 1800
  #include <math.h>
-static inline long lrint(double x){return (long)(x > 0.0 ? x + 0.5 : x - 0.5);}
-static inline long lrintf(float x){return (long)(x > 0.0f ? x + 0.5f : x - 0.5f);}
-static inline long long llrint(double x){return (long long)(x > 0.0 ? x + 0.5 : x - 0.5);}
-static inline long long llrintf(float x){return (long long)(x > 0.0f ? x + 0.5f : x - 0.5f);}
-static inline double rint(double x){return (x > 0.0)? floor(x + 0.5) : ceil(x - 0.5);}
-static inline float rintf(float x){return (x > 0.0f)? floorf(x + 0.5f) : ceilf(x - 0.5f);}
+static inline long lrint(double x) { return (long)(x > 0.0 ? x + 0.5 : x - 0.5); }
+static inline long lrintf(float x) { return (long)(x > 0.0f ? x + 0.5f : x - 0.5f); }
+static inline long long llrint(double x)
+{
+    return (long long)(x > 0.0 ? x + 0.5 : x - 0.5);
+}
+static inline long long llrintf(float x)
+{
+    return (long long)(x > 0.0f ? x + 0.5f : x - 0.5f);
+}
+static inline double rint(double x) { return (x > 0.0) ? floor(x + 0.5) : ceil(x - 0.5); }
+static inline float rintf(float x)
+{
+    return (x > 0.0f) ? floorf(x + 0.5f) : ceilf(x - 0.5f);
+}
  #endif
  
  ////////////////////////////////////////////////////////////////////////
@@ -43,7 +52,7 @@ static inline float rintf(float x){return (x > 0.0f)? floorf(x + 0.5f) : ceilf(x
  // random and srandom
  ////////////////////////////////////////////////////////////////////////
  #include <stdlib.h>
-static inline long int random (void) { return rand(); }
-static inline void srandom (unsigned int seed) { srand(seed); }
+static inline long int random(void) { return rand(); }
+static inline void srandom(unsigned int seed) { srand(seed); }
  
  #endif // _MSC_CONFIG_H_ ]
diff --git a/cmake/msvc/sys/time.h b/cmake/msvc/sys/time.h

index aa0f5dc2fb939ced7c8266e3d73c732bf783145e..4bda1ba3a5a71dd983e14e181cb74434c4e3ec4f 100644 (file)
--- a/cmake/msvc/sys/time.h
+++ b/cmake/msvc/sys/time.h
@@ -10,67 +10,62 @@
  #define NOMINMAX
  #endif
  
-//http://social.msdn.microsoft.com/Forums/en/vcgeneral/thread/430449b3-f6dd-4e18-84de-eebd26a8d668
+// http://social.msdn.microsoft.com/Forums/en/vcgeneral/thread/430449b3-f6dd-4e18-84de-eebd26a8d668
  #include < time.h >
  #include <windows.h> //I've omitted this line.
  #if defined(_MSC_VER) || defined(_MSC_EXTENSIONS)
-  #define DELTA_EPOCH_IN_MICROSECS  11644473600000000Ui64
+#define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64
  #else
-  #define DELTA_EPOCH_IN_MICROSECS  11644473600000000ULL
+#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
  #endif
  
  #if _MSC_VER < 1900
  struct timespec {
  
-time_t tv_sec; /* Seconds since 00:00:00 GMT, */
+    time_t tv_sec; /* Seconds since 00:00:00 GMT, */
  
-/* 1 January 1970 */
+    /* 1 January 1970 */
  
-long tv_nsec; /* Additional nanoseconds since */
-
-/* tv_sec */
+    long tv_nsec; /* Additional nanoseconds since */
  
+    /* tv_sec */
  };
  #endif
  
-struct timezone
-{
-  int  tz_minuteswest; /* minutes W of Greenwich */
-  int  tz_dsttime;     /* type of dst correction */
+struct timezone {
+    int tz_minuteswest; /* minutes W of Greenwich */
+    int tz_dsttime;     /* type of dst correction */
  };
  
-static inline int gettimeofday(struct timeval *tv, struct timezone *tz)
+static inline int gettimeofday(struct timeval* tv, struct timezone* tz)
  {
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-  {
-    GetSystemTimeAsFileTime(&ft);
-
-    tmpres |= ft.dwHighDateTime;
-    tmpres <<= 32;
-    tmpres |= ft.dwLowDateTime;
-
-    /*converting file time to unix epoch*/
-    tmpres -= DELTA_EPOCH_IN_MICROSECS;
-    tv->tv_sec = (long)(tmpres / 1000000UL);
-    tv->tv_usec = (long)(tmpres % 1000000UL);
-  }
-
-  if (NULL != tz)
-  {
-    if (!tzflag)
-    {
-      _tzset();
-      tzflag++;
+    FILETIME ft;
+    unsigned __int64 tmpres = 0;
+    static int tzflag;
+
+    if (NULL != tv) {
+        GetSystemTimeAsFileTime(&ft);
+
+        tmpres |= ft.dwHighDateTime;
+        tmpres <<= 32;
+        tmpres |= ft.dwLowDateTime;
+
+        /*converting file time to unix epoch*/
+        tmpres -= DELTA_EPOCH_IN_MICROSECS;
+        tv->tv_sec = (long)(tmpres / 1000000UL);
+        tv->tv_usec = (long)(tmpres % 1000000UL);
+    }
+
+    if (NULL != tz) {
+        if (!tzflag) {
+            _tzset();
+            tzflag++;
+        }
+        tz->tz_minuteswest = _timezone / 60;
+        tz->tz_dsttime = _daylight;
      }
-    tz->tz_minuteswest = _timezone / 60;
-    tz->tz_dsttime = _daylight;
-  }
  
-  return 0;
+    return 0;
  }
  
  #endif //_MSC_SYS_TIME_H_
diff --git a/include/volk/saturation_arithmetic.h b/include/volk/saturation_arithmetic.h

index 0886844d49a6bb62b435068db2ef6744ea88fa78..7b95ba27322a9916ae8c69d64162350c71240587 100644 (file)
--- a/include/volk/saturation_arithmetic.h
+++ b/include/volk/saturation_arithmetic.h
@@ -28,20 +28,24 @@
  
  static inline int16_t sat_adds16i(int16_t x, int16_t y)
  {
-    int32_t res = (int32_t) x + (int32_t) y;
+    int32_t res = (int32_t)x + (int32_t)y;
  
-    if (res < SHRT_MIN) res = SHRT_MIN;
-    if (res > SHRT_MAX) res = SHRT_MAX;
+    if (res < SHRT_MIN)
+        res = SHRT_MIN;
+    if (res > SHRT_MAX)
+        res = SHRT_MAX;
  
      return res;
  }
  
  static inline int16_t sat_muls16i(int16_t x, int16_t y)
  {
-    int32_t res = (int32_t) x * (int32_t) y;
+    int32_t res = (int32_t)x * (int32_t)y;
  
-    if (res < SHRT_MIN) res = SHRT_MIN;
-    if (res > SHRT_MAX) res = SHRT_MAX;
+    if (res < SHRT_MIN)
+        res = SHRT_MIN;
+    if (res > SHRT_MAX)
+        res = SHRT_MAX;
  
      return res;
  }
diff --git a/include/volk/volk_alloc.hh b/include/volk/volk_alloc.hh

index a2975da782944262aa0bed44be66a50b3e4657a8..44bcfaf14395a1c3798b1291af4625580613e9dd 100644 (file)
--- a/include/volk/volk_alloc.hh
+++ b/include/volk/volk_alloc.hh
@@ -40,30 +40,40 @@ namespace volk {
   */
  template <class T>
  struct alloc {
-  typedef T value_type;
+    typedef T value_type;
  
-  alloc() = default;
+    alloc() = default;
  
-  template <class U> constexpr alloc(alloc<U> const&) noexcept {}
+    template <class U>
+    constexpr alloc(alloc<U> const&) noexcept
+    {
+    }
  
-  T* allocate(std::size_t n) {
-    if (n > std::numeric_limits<std::size_t>::max() / sizeof(T)) throw std::bad_alloc();
+    T* allocate(std::size_t n)
+    {
+        if (n > std::numeric_limits<std::size_t>::max() / sizeof(T))
+            throw std::bad_alloc();
  
-    if (auto p = static_cast<T*>(volk_malloc(n*sizeof(T), volk_get_alignment())))
-      return p;
+        if (auto p = static_cast<T*>(volk_malloc(n * sizeof(T), volk_get_alignment())))
+            return p;
  
-    throw std::bad_alloc();
-  }
+        throw std::bad_alloc();
+    }
  
-  void deallocate(T* p, std::size_t) noexcept { volk_free(p); }
-
-} ;
+    void deallocate(T* p, std::size_t) noexcept { volk_free(p); }
+};
  
  template <class T, class U>
-bool operator==(alloc<T> const&, alloc<U> const&) { return true; }
+bool operator==(alloc<T> const&, alloc<U> const&)
+{
+    return true;
+}
  
  template <class T, class U>
-bool operator!=(alloc<T> const&, alloc<U> const&) { return false; }
+bool operator!=(alloc<T> const&, alloc<U> const&)
+{
+    return false;
+}
  
  
  /*!
@@ -73,8 +83,8 @@ bool operator!=(alloc<T> const&, alloc<U> const&) { return false; }
   * example code:
   *   volk::vector<float> v(100); // vector using volk_malloc, volk_free
   */
-template<class T>
-using vector = std::vector<T, alloc<T> >;
+template <class T>
+using vector = std::vector<T, alloc<T>>;
  
  } // namespace volk
  #endif // INCLUDED_VOLK_ALLOC_H
diff --git a/include/volk/volk_avx2_intrinsics.h b/include/volk/volk_avx2_intrinsics.h

index 17badc40b158c7f62f28db740c1898cecd1c9d68..00f3b522d5281cc7ed694f6a760347685a425ead 100644 (file)
--- a/include/volk/volk_avx2_intrinsics.h
+++ b/include/volk/volk_avx2_intrinsics.h
@@ -1,19 +1,19 @@
  /* -*- c++ -*- */
-/* 
+/*
   * Copyright 2015 Free Software Foundation, Inc.
- * 
+ *
   * This file is part of GNU Radio
- * 
+ *
   * GNU Radio is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
   * the Free Software Foundation; either version 3, or (at your option)
   * any later version.
- * 
+ *
   * GNU Radio is distributed in the hope that it will be useful,
   * but WITHOUT ANY WARRANTY; without even the implied warranty of
   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   * GNU General Public License for more details.
- * 
+ *
   * You should have received a copy of the GNU General Public License
   * along with GNU Radio; see the file COPYING.  If not, write to
   * the Free Software Foundation, Inc., 51 Franklin Street,
@@ -27,28 +27,59 @@
  
  #ifndef INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
  #define INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
-#include <immintrin.h>
  #include "volk/volk_avx_intrinsics.h"
+#include <immintrin.h>
  
-static inline __m256
-_mm256_polar_sign_mask_avx2(__m128i fbits){
-  const __m128i zeros = _mm_set1_epi8(0x00);
-  const __m128i sign_extract = _mm_set1_epi8(0x80);
-  const __m256i shuffle_mask = _mm256_setr_epi8(0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x01, 0xff, 0xff, 0xff, 0x02, 0xff, 0xff, 0xff, 0x03,
-                                                 0xff, 0xff, 0xff, 0x04, 0xff, 0xff, 0xff, 0x05, 0xff, 0xff, 0xff, 0x06, 0xff, 0xff, 0xff, 0x07);
-  __m256i sign_bits = _mm256_setzero_si256();
-  
-  fbits = _mm_cmpgt_epi8(fbits, zeros);
-  fbits = _mm_and_si128(fbits, sign_extract);
-  sign_bits = _mm256_insertf128_si256(sign_bits,fbits,0);
-  sign_bits = _mm256_insertf128_si256(sign_bits,fbits,1);
-  sign_bits = _mm256_shuffle_epi8(sign_bits, shuffle_mask);
+static inline __m256 _mm256_polar_sign_mask_avx2(__m128i fbits)
+{
+    const __m128i zeros = _mm_set1_epi8(0x00);
+    const __m128i sign_extract = _mm_set1_epi8(0x80);
+    const __m256i shuffle_mask = _mm256_setr_epi8(0xff,
+                                                  0xff,
+                                                  0xff,
+                                                  0x00,
+                                                  0xff,
+                                                  0xff,
+                                                  0xff,
+                                                  0x01,
+                                                  0xff,
+                                                  0xff,
+                                                  0xff,
+                                                  0x02,
+                                                  0xff,
+                                                  0xff,
+                                                  0xff,
+                                                  0x03,
+                                                  0xff,
+                                                  0xff,
+                                                  0xff,
+                                                  0x04,
+                                                  0xff,
+                                                  0xff,
+                                                  0xff,
+                                                  0x05,
+                                                  0xff,
+                                                  0xff,
+                                                  0xff,
+                                                  0x06,
+                                                  0xff,
+                                                  0xff,
+                                                  0xff,
+                                                  0x07);
+    __m256i sign_bits = _mm256_setzero_si256();
  
-  return _mm256_castsi256_ps(sign_bits);
+    fbits = _mm_cmpgt_epi8(fbits, zeros);
+    fbits = _mm_and_si128(fbits, sign_extract);
+    sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 0);
+    sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 1);
+    sign_bits = _mm256_shuffle_epi8(sign_bits, shuffle_mask);
+
+    return _mm256_castsi256_ps(sign_bits);
  }
  
  static inline __m256
-_mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits){
+_mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits)
+{
      // prepare sign mask for correct +-
      __m256 sign_mask = _mm256_polar_sign_mask_avx2(fbits);
  
@@ -61,26 +92,31 @@ _mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits){
      return dst;
  }
  
-static inline __m256
-_mm256_magnitudesquared_ps_avx2(const __m256 cplxValue0, const __m256 cplxValue1){
-  const __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
-  const __m256 squared0 = _mm256_mul_ps(cplxValue0, cplxValue0); // Square the values
-  const __m256 squared1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the Values
-  const __m256 complex_result = _mm256_hadd_ps(squared0, squared1);
-  return _mm256_permutevar8x32_ps(complex_result, idx);
+static inline __m256 _mm256_magnitudesquared_ps_avx2(const __m256 cplxValue0,
+                                                     const __m256 cplxValue1)
+{
+    const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+    const __m256 squared0 = _mm256_mul_ps(cplxValue0, cplxValue0); // Square the values
+    const __m256 squared1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the Values
+    const __m256 complex_result = _mm256_hadd_ps(squared0, squared1);
+    return _mm256_permutevar8x32_ps(complex_result, idx);
  }
  
-static inline __m256
-_mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar){
-  /*
-   * Calculate: |y - x|^2 * SNR_lin
-   * Consider 'symbolsX' and 'pointsX' to be complex float
-   * 'symbolsX' are 'y' and 'pointsX' are 'x'
-   */
-  const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
-  const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
-  const __m256 norms = _mm256_magnitudesquared_ps_avx2(diff0, diff1);
-  return _mm256_mul_ps(norms, scalar);
+static inline __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0,
+                                                     const __m256 symbols1,
+                                                     const __m256 points0,
+                                                     const __m256 points1,
+                                                     const __m256 scalar)
+{
+    /*
+     * Calculate: |y - x|^2 * SNR_lin
+     * Consider 'symbolsX' and 'pointsX' to be complex float
+     * 'symbolsX' are 'y' and 'pointsX' are 'x'
+     */
+    const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
+    const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
+    const __m256 norms = _mm256_magnitudesquared_ps_avx2(diff0, diff1);
+    return _mm256_mul_ps(norms, scalar);
  }
  
  #endif /* INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ */
diff --git a/include/volk/volk_avx_intrinsics.h b/include/volk/volk_avx_intrinsics.h

index 808799f9795a25b35deeb6e52bd1fdde6f6f56e5..bec846d562168c6159f0223cb2d558a7798a3645 100644 (file)
--- a/include/volk/volk_avx_intrinsics.h
+++ b/include/volk/volk_avx_intrinsics.h
@@ -1,19 +1,19 @@
  /* -*- c++ -*- */
-/* 
+/*
   * Copyright 2015 Free Software Foundation, Inc.
- * 
+ *
   * This file is part of GNU Radio
- * 
+ *
   * GNU Radio is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
   * the Free Software Foundation; either version 3, or (at your option)
   * any later version.
- * 
+ *
   * GNU Radio is distributed in the hope that it will be useful,
   * but WITHOUT ANY WARRANTY; without even the implied warranty of
   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   * GNU General Public License for more details.
- * 
+ *
   * You should have received a copy of the GNU General Public License
   * along with GNU Radio; see the file COPYING.  If not, write to
   * the Free Software Foundation, Inc., 51 Franklin Street,
@@ -29,90 +29,126 @@
  #define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
  #include <immintrin.h>
  
-static inline __m256
-_mm256_complexmul_ps(__m256 x, __m256 y)
+static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
  {
-  __m256 yl, yh, tmp1, tmp2;
-  yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ...
-  yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ...
-  tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
-  x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ...
-  tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-  return _mm256_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+    __m256 yl, yh, tmp1, tmp2;
+    yl = _mm256_moveldup_ps(y);        // Load yl with cr,cr,dr,dr ...
+    yh = _mm256_movehdup_ps(y);        // Load yh with ci,ci,di,di ...
+    tmp1 = _mm256_mul_ps(x, yl);       // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
+    x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ...
+    tmp2 = _mm256_mul_ps(x, yh);       // tmp2 = ai*ci,ar*ci,bi*di,br*di
+    return _mm256_addsub_ps(tmp1,
+                            tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
  }
  
-static inline __m256
-_mm256_conjugate_ps(__m256 x){
-  const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
-  return _mm256_xor_ps(x, conjugator); // conjugate y
+static inline __m256 _mm256_conjugate_ps(__m256 x)
+{
+    const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
+    return _mm256_xor_ps(x, conjugator); // conjugate y
  }
  
-static inline __m256
-_mm256_complexconjugatemul_ps(__m256 x, __m256 y){
-  y = _mm256_conjugate_ps(y);
-  return _mm256_complexmul_ps(x, y);
+static inline __m256 _mm256_complexconjugatemul_ps(__m256 x, __m256 y)
+{
+    y = _mm256_conjugate_ps(y);
+    return _mm256_complexmul_ps(x, y);
  }
  
-static inline __m256
-_mm256_normalize_ps(__m256 val)
+static inline __m256 _mm256_normalize_ps(__m256 val)
  {
-  __m256 tmp1 = _mm256_mul_ps(val, val);
-  tmp1 = _mm256_hadd_ps(tmp1, tmp1);
-  tmp1 = _mm256_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(3, 1, 2, 0)); // equals 0xD8
-  tmp1 = _mm256_sqrt_ps(tmp1);
-  return _mm256_div_ps(val, tmp1);
+    __m256 tmp1 = _mm256_mul_ps(val, val);
+    tmp1 = _mm256_hadd_ps(tmp1, tmp1);
+    tmp1 = _mm256_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(3, 1, 2, 0)); // equals 0xD8
+    tmp1 = _mm256_sqrt_ps(tmp1);
+    return _mm256_div_ps(val, tmp1);
  }
  
-static inline __m256
-_mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2){
-  __m256 complex1, complex2;
-  cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
-  cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
-  complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
-  complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
-  return _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values
+static inline __m256 _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2)
+{
+    __m256 complex1, complex2;
+    cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+    cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+    complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+    complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+    return _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values
  }
  
-static inline __m256
-_mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2){
-  return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2));
+static inline __m256 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2)
+{
+    return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2));
  }
  
-static inline __m256
-_mm256_scaled_norm_dist_ps(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar){
-  /*
-   * Calculate: |y - x|^2 * SNR_lin
-   * Consider 'symbolsX' and 'pointsX' to be complex float
-   * 'symbolsX' are 'y' and 'pointsX' are 'x'
-   */
-  const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
-  const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
-  const __m256 norms = _mm256_magnitudesquared_ps(diff0, diff1);
-  return _mm256_mul_ps(norms, scalar);
+static inline __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0,
+                                                const __m256 symbols1,
+                                                const __m256 points0,
+                                                const __m256 points1,
+                                                const __m256 scalar)
+{
+    /*
+     * Calculate: |y - x|^2 * SNR_lin
+     * Consider 'symbolsX' and 'pointsX' to be complex float
+     * 'symbolsX' are 'y' and 'pointsX' are 'x'
+     */
+    const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
+    const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
+    const __m256 norms = _mm256_magnitudesquared_ps(diff0, diff1);
+    return _mm256_mul_ps(norms, scalar);
  }
  
-static inline __m256
-_mm256_polar_sign_mask(__m128i fbits){
-  __m256 sign_mask_dummy = _mm256_setzero_ps();
-  const __m128i zeros = _mm_set1_epi8(0x00);
-  const __m128i sign_extract = _mm_set1_epi8(0x80);
-  const __m128i shuffle_mask0 = _mm_setr_epi8(0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x01, 0xff, 0xff, 0xff, 0x02, 0xff, 0xff, 0xff, 0x03);
-  const __m128i shuffle_mask1 = _mm_setr_epi8(0xff, 0xff, 0xff, 0x04, 0xff, 0xff, 0xff, 0x05, 0xff, 0xff, 0xff, 0x06, 0xff, 0xff, 0xff, 0x07);
-
-  fbits = _mm_cmpgt_epi8(fbits, zeros);
-  fbits = _mm_and_si128(fbits, sign_extract);
-  __m128i sign_bits0 = _mm_shuffle_epi8(fbits, shuffle_mask0);
-  __m128i sign_bits1 = _mm_shuffle_epi8(fbits, shuffle_mask1);
-
-  __m256 sign_mask = _mm256_insertf128_ps(sign_mask_dummy, _mm_castsi128_ps(sign_bits0), 0x0);
-  return _mm256_insertf128_ps(sign_mask, _mm_castsi128_ps(sign_bits1), 0x1);
-//  // This is the desired function call. Though it seems to be missing in GCC.
-//  // Compare: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#
-//  return _mm256_set_m128(_mm_castsi128_ps(sign_bits1), _mm_castsi128_ps(sign_bits0));
+static inline __m256 _mm256_polar_sign_mask(__m128i fbits)
+{
+    __m256 sign_mask_dummy = _mm256_setzero_ps();
+    const __m128i zeros = _mm_set1_epi8(0x00);
+    const __m128i sign_extract = _mm_set1_epi8(0x80);
+    const __m128i shuffle_mask0 = _mm_setr_epi8(0xff,
+                                                0xff,
+                                                0xff,
+                                                0x00,
+                                                0xff,
+                                                0xff,
+                                                0xff,
+                                                0x01,
+                                                0xff,
+                                                0xff,
+                                                0xff,
+                                                0x02,
+                                                0xff,
+                                                0xff,
+                                                0xff,
+                                                0x03);
+    const __m128i shuffle_mask1 = _mm_setr_epi8(0xff,
+                                                0xff,
+                                                0xff,
+                                                0x04,
+                                                0xff,
+                                                0xff,
+                                                0xff,
+                                                0x05,
+                                                0xff,
+                                                0xff,
+                                                0xff,
+                                                0x06,
+                                                0xff,
+                                                0xff,
+                                                0xff,
+                                                0x07);
+
+    fbits = _mm_cmpgt_epi8(fbits, zeros);
+    fbits = _mm_and_si128(fbits, sign_extract);
+    __m128i sign_bits0 = _mm_shuffle_epi8(fbits, shuffle_mask0);
+    __m128i sign_bits1 = _mm_shuffle_epi8(fbits, shuffle_mask1);
+
+    __m256 sign_mask =
+        _mm256_insertf128_ps(sign_mask_dummy, _mm_castsi128_ps(sign_bits0), 0x0);
+    return _mm256_insertf128_ps(sign_mask, _mm_castsi128_ps(sign_bits1), 0x1);
+    //  // This is the desired function call. Though it seems to be missing in GCC.
+    //  // Compare: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#
+    //  return _mm256_set_m128(_mm_castsi128_ps(sign_bits1),
+    //  _mm_castsi128_ps(sign_bits0));
  }
  
  static inline void
-_mm256_polar_deinterleave(__m256 *llr0, __m256 *llr1, __m256 src0, __m256 src1){
+_mm256_polar_deinterleave(__m256* llr0, __m256* llr1, __m256 src0, __m256 src1)
+{
      // deinterleave values
      __m256 part0 = _mm256_permute2f128_ps(src0, src1, 0x20);
      __m256 part1 = _mm256_permute2f128_ps(src0, src1, 0x31);
@@ -120,22 +156,25 @@ _mm256_polar_deinterleave(__m256 *llr0, __m256 *llr1, __m256 src0, __m256 src1){
      *llr1 = _mm256_shuffle_ps(part0, part1, 0xdd);
  }
  
-static inline __m256
-_mm256_polar_minsum_llrs(__m256 src0, __m256 src1){
+static inline __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1)
+{
      const __m256 sign_mask = _mm256_set1_ps(-0.0f);
-    const __m256 abs_mask = _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff)));
+    const __m256 abs_mask =
+        _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff)));
  
      __m256 llr0, llr1;
      _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);
  
      // calculate result
-    __m256 sign = _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask));
-    __m256 dst = _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask));
+    __m256 sign =
+        _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask));
+    __m256 dst =
+        _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask));
      return _mm256_or_ps(dst, sign);
  }
  
-static inline __m256
-_mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits){
+static inline __m256 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits)
+{
      // prepare sign mask for correct +-
      __m256 sign_mask = _mm256_polar_sign_mask(fbits);
  
diff --git a/include/volk/volk_common.h b/include/volk/volk_common.h

index 50ea07b4ac4c0d299b63c40fcf9ff4c5013694a5..8167d23cf0a9500fc21787aed0909518b0d2653f 100644 (file)
--- a/include/volk/volk_common.h
+++ b/include/volk/volk_common.h
@@ -18,61 +18,71 @@
  // AppleClang also defines __GNUC__, so do this check first.  These
  // will probably be the same as for __GNUC__, but let's keep them
  // separate just to be safe.
-#  define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
-#  define __VOLK_ATTR_UNUSED     __attribute__((unused))
-#  define __VOLK_ATTR_INLINE     __attribute__((always_inline))
-#  define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
-#  define __VOLK_ASM             __asm__
-#  define __VOLK_VOLATILE        __volatile__
-#  define __VOLK_ATTR_EXPORT     __attribute__((visibility("default")))
-#  define __VOLK_ATTR_IMPORT     __attribute__((visibility("default")))
-#  define __VOLK_PREFETCH(addr)  __builtin_prefetch(addr)
-#elif defined(__GNUC__)
-#  define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
-#  define __VOLK_ATTR_UNUSED     __attribute__((unused))
-#  define __VOLK_ATTR_INLINE     __attribute__((always_inline))
-#  define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
-#  define __VOLK_ASM __asm__
-#  define __VOLK_VOLATILE __volatile__
-#  if __GNUC__ >= 4
-#    define __VOLK_ATTR_EXPORT   __attribute__((visibility("default")))
-#    define __VOLK_ATTR_IMPORT   __attribute__((visibility("default")))
-#  else
-#    define __VOLK_ATTR_EXPORT
-#    define __VOLK_ATTR_IMPORT
-#  endif
-#  define __VOLK_PREFETCH(addr)  __builtin_prefetch(addr)
+#define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
+#define __VOLK_ATTR_UNUSED __attribute__((unused))
+#define __VOLK_ATTR_INLINE __attribute__((always_inline))
+#define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
+#define __VOLK_ASM __asm__
+#define __VOLK_VOLATILE __volatile__
+#define __VOLK_ATTR_EXPORT __attribute__((visibility("default")))
+#define __VOLK_ATTR_IMPORT __attribute__((visibility("default")))
+#define __VOLK_PREFETCH(addr) __builtin_prefetch(addr)
+#elif defined __GNUC__
+#define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
+#define __VOLK_ATTR_UNUSED __attribute__((unused))
+#define __VOLK_ATTR_INLINE __attribute__((always_inline))
+#define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
+#define __VOLK_ASM __asm__
+#define __VOLK_VOLATILE __volatile__
+#if __GNUC__ >= 4
+#define __VOLK_ATTR_EXPORT __attribute__((visibility("default")))
+#define __VOLK_ATTR_IMPORT __attribute__((visibility("default")))
  #else
-#  warning "Unknown compiler. Using default VOLK macros, which may or not work."
-#  define __VOLK_ATTR_ALIGNED(x)
-#  define __VOLK_ATTR_UNUSED
-#  define __VOLK_ATTR_INLINE
-#  define __VOLK_ATTR_DEPRECATED
-#  define __VOLK_ATTR_EXPORT
-#  define __VOLK_ATTR_IMPORT
-#  define __VOLK_PREFETCH(addr)
-#  define __VOLK_ASM __asm__
-#  define __VOLK_VOLATILE __volatile__
+#define __VOLK_ATTR_EXPORT
+#define __VOLK_ATTR_IMPORT
+#endif
+#define __VOLK_PREFETCH(addr) __builtin_prefetch(addr)
+#elif _MSC_VER
+#define __VOLK_ATTR_ALIGNED(x) __declspec(align(x))
+#define __VOLK_ATTR_UNUSED
+#define __VOLK_ATTR_INLINE __forceinline
+#define __VOLK_ATTR_DEPRECATED __declspec(deprecated)
+#define __VOLK_ATTR_EXPORT __declspec(dllexport)
+#define __VOLK_ATTR_IMPORT __declspec(dllimport)
+#define __VOLK_PREFETCH(addr)
+#define __VOLK_ASM __asm
+#define __VOLK_VOLATILE
+#else
+#define __VOLK_ATTR_ALIGNED(x)
+#define __VOLK_ATTR_UNUSED
+#define __VOLK_ATTR_INLINE
+#define __VOLK_ATTR_DEPRECATED
+#define __VOLK_ATTR_EXPORT
+#define __VOLK_ATTR_IMPORT
+#define __VOLK_PREFETCH(addr)
+#define __VOLK_ASM __asm__
+#define __VOLK_VOLATILE __volatile__
  #endif
  
  ////////////////////////////////////////////////////////////////////////
  // Ignore annoying warnings in MSVC
  ////////////////////////////////////////////////////////////////////////
  #if defined(_MSC_VER)
-#  pragma warning(disable: 4244) //'conversion' conversion from 'type1' to 'type2', possible loss of data
-#  pragma warning(disable: 4305) //'identifier' : truncation from 'type1' to 'type2'
+#pragma warning(disable : 4244) //'conversion' conversion from 'type1' to 'type2',
+                                //possible loss of data
+#pragma warning(disable : 4305) //'identifier' : truncation from 'type1' to 'type2'
  #endif
  
  ////////////////////////////////////////////////////////////////////////
  // C-linkage declaration macros
  // FIXME: due to the usage of complex.h, require gcc for c-linkage
  ////////////////////////////////////////////////////////////////////////
-#if defined(__cplusplus) && (defined(__GNUC__) || defined(__clang__))
-#  define __VOLK_DECL_BEGIN extern "C" {
-#  define __VOLK_DECL_END }
+#if defined(__cplusplus) && (__GNUC__)
+#define __VOLK_DECL_BEGIN extern "C" {
+#define __VOLK_DECL_END }
  #else
-#  define __VOLK_DECL_BEGIN
-#  define __VOLK_DECL_END
+#define __VOLK_DECL_BEGIN
+#define __VOLK_DECL_END
  #endif
  
  ////////////////////////////////////////////////////////////////////////
@@ -80,9 +90,9 @@
  // http://gcc.gnu.org/wiki/Visibility
  ////////////////////////////////////////////////////////////////////////
  #ifdef volk_EXPORTS
-#  define VOLK_API __VOLK_ATTR_EXPORT
+#define VOLK_API __VOLK_ATTR_EXPORT
  #else
-#  define VOLK_API __VOLK_ATTR_IMPORT
+#define VOLK_API __VOLK_ATTR_IMPORT
  #endif
  
  ////////////////////////////////////////////////////////////////////////
@@ -98,38 +108,38 @@
  #endif
  #endif
  
-union bit128{
-  uint8_t i8[16];
-  uint16_t i16[8];
-  uint32_t i[4];
-  float f[4];
-  double d[2];
+union bit128 {
+    uint8_t i8[16];
+    uint16_t i16[8];
+    uint32_t i[4];
+    float f[4];
+    double d[2];
  
-  #ifdef LV_HAVE_SSE
-  __m128 float_vec;
-  #endif
+#ifdef LV_HAVE_SSE
+    __m128 float_vec;
+#endif
  
-  #ifdef LV_HAVE_SSE2
-  __m128i int_vec;
-  __m128d double_vec;
-  #endif
+#ifdef LV_HAVE_SSE2
+    __m128i int_vec;
+    __m128d double_vec;
+#endif
  };
  
-union bit256{
-  uint8_t i8[32];
-  uint16_t i16[16];
-  uint32_t i[8];
-  float f[8];
-  double d[4];
+union bit256 {
+    uint8_t i8[32];
+    uint16_t i16[16];
+    uint32_t i[8];
+    float f[8];
+    double d[4];
  
-  #ifdef LV_HAVE_AVX
-  __m256 float_vec;
-  __m256i int_vec;
-  __m256d double_vec;
-  #endif
+#ifdef LV_HAVE_AVX
+    __m256 float_vec;
+    __m256i int_vec;
+    __m256d double_vec;
+#endif
  };
  
-#define bit128_p(x) ((union bit128 *)(x))
-#define bit256_p(x) ((union bit256 *)(x))
+#define bit128_p(x) ((union bit128*)(x))
+#define bit256_p(x) ((union bit256*)(x))
  
  #endif /*INCLUDED_LIBVOLK_COMMON_H*/
diff --git a/include/volk/volk_complex.h b/include/volk/volk_complex.h

index 1d61d789330c2ee9d091ce25d1ff1ed58b792ccd..ae788736d583a9abccb8bb57bfdabdf2aa3d503b 100644 (file)
--- a/include/volk/volk_complex.h
+++ b/include/volk/volk_complex.h
@@ -19,49 +19,58 @@
  
  #ifdef __cplusplus
  
-#include <complex>
  #include <stdint.h>
+#include <complex>
  
-typedef std::complex<int8_t>  lv_8sc_t;
+typedef std::complex<int8_t> lv_8sc_t;
  typedef std::complex<int16_t> lv_16sc_t;
  typedef std::complex<int32_t> lv_32sc_t;
  typedef std::complex<int64_t> lv_64sc_t;
-typedef std::complex<float>   lv_32fc_t;
-typedef std::complex<double>  lv_64fc_t;
+typedef std::complex<float> lv_32fc_t;
+typedef std::complex<double> lv_64fc_t;
  
-template <typename T> inline std::complex<T> lv_cmake(const T &r, const T &i){
+template <typename T>
+inline std::complex<T> lv_cmake(const T& r, const T& i)
+{
      return std::complex<T>(r, i);
  }
  
-template <typename T> inline typename T::value_type lv_creal(const T &x){
+template <typename T>
+inline typename T::value_type lv_creal(const T& x)
+{
      return x.real();
  }
  
-template <typename T> inline typename T::value_type lv_cimag(const T &x){
+template <typename T>
+inline typename T::value_type lv_cimag(const T& x)
+{
      return x.imag();
  }
  
-template <typename T> inline T lv_conj(const T &x){
+template <typename T>
+inline T lv_conj(const T& x)
+{
      return std::conj(x);
  }
  
  #else /* __cplusplus */
  
  #if __STDC_VERSION__ >= 199901L /* C99 check */
-/* this allows us to conj in lv_conj without the double detour for single-precision floats */
+/* this allows us to conj in lv_conj without the double detour for single-precision floats
+ */
  #include <tgmath.h>
  #endif /* C99 check */
  
  #include <complex.h>
  
-typedef char complex         lv_8sc_t;
-typedef short complex        lv_16sc_t;
-typedef long complex         lv_32sc_t;
-typedef long long complex    lv_64sc_t;
-typedef float complex        lv_32fc_t;
-typedef double complex       lv_64fc_t;
+typedef char complex lv_8sc_t;
+typedef short complex lv_16sc_t;
+typedef long complex lv_32sc_t;
+typedef long long complex lv_64sc_t;
+typedef float complex lv_32fc_t;
+typedef double complex lv_64fc_t;
  
-#define lv_cmake(r, i) ((r) + _Complex_I*(i))
+#define lv_cmake(r, i) ((r) + _Complex_I * (i))
  
  // When GNUC is available, use the complex extensions.
  // The extensions always return the correct value type.
diff --git a/include/volk/volk_malloc.h b/include/volk/volk_malloc.h

index 3477b2778c99a32230873fec962f10e26d3846ae..42ca2b000c08c6f04d7e3435ab1ffae131d28968 100644 (file)
--- a/include/volk/volk_malloc.h
+++ b/include/volk/volk_malloc.h
@@ -23,8 +23,8 @@
  #ifndef INCLUDED_VOLK_MALLOC_H
  #define INCLUDED_VOLK_MALLOC_H
  
-#include <volk/volk_common.h>
  #include <stdlib.h>
+#include <volk/volk_common.h>
  
  __VOLK_DECL_BEGIN
  
@@ -40,7 +40,8 @@ __VOLK_DECL_BEGIN
   * For Apple Clang, we fall back to `posix_memalign`.
   * see: https://linux.die.net/man/3/aligned_alloc
   * For MSVC, we fall back to `_aligned_malloc`.
- * see: https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-malloc?view=vs-2019
+ * see:
+ * https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-malloc?view=vs-2019
   *
   * Because of the ways in which volk_malloc may allocate memory, it is
   * important to always free volk_malloc pointers using volk_free.
@@ -51,7 +52,7 @@ __VOLK_DECL_BEGIN
   * \param alignment The byte alignment of the allocated memory.
   * \return pointer to aligned memory.
   */
-VOLK_API void *volk_malloc(size_t size, size_t alignment);
+VOLK_API void* volk_malloc(size_t size, size_t alignment);
  
  /*!
   * \brief Free's memory allocated by volk_malloc.
@@ -62,11 +63,12 @@ VOLK_API void *volk_malloc(size_t size, size_t alignment);
   * Thus, in this case `volk_free` inherits the same behavior `free` exhibits.
   * see: https://en.cppreference.com/w/c/memory/free
   * In case `_aligned_malloc` was used, we call `_aligned_free`.
- * see: https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-free?view=vs-2019
+ * see:
+ * https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-free?view=vs-2019
   *
   * \param aptr The aligned pointer allocated by volk_malloc.
   */
-VOLK_API void volk_free(void *aptr);
+VOLK_API void volk_free(void* aptr);
  
  __VOLK_DECL_END
  
diff --git a/include/volk/volk_neon_intrinsics.h b/include/volk/volk_neon_intrinsics.h

index 90e7b5402dca14e7fffe3ead845034a83c6c03f4..302bd30be8161081fb3d358f4e4e94ffff306e09 100644 (file)
--- a/include/volk/volk_neon_intrinsics.h
+++ b/include/volk/volk_neon_intrinsics.h
@@ -67,9 +67,9 @@
    3. This notice may not be removed or altered from any source distribution.
  
    (this is the zlib license)
- 
+
    _vsincosq_f32
- 
+
  */
  
  /*
@@ -83,13 +83,12 @@
  
  
  /* Magnitude squared for float32x4x2_t */
-static inline float32x4_t
-_vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
+static inline float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
  {
      float32x4_t iValue, qValue, result;
      iValue = vmulq_f32(cmplxValue.val[0], cmplxValue.val[0]); // Square the values
      qValue = vmulq_f32(cmplxValue.val[1], cmplxValue.val[1]); // Square the values
-    result = vaddq_f32(iValue, qValue); // Add the I2 and Q2 values
+    result = vaddq_f32(iValue, qValue);                       // Add the I2 and Q2 values
      return result;
  }
  
@@ -97,9 +96,11 @@ _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
  static inline float32x4_t _vinvsqrtq_f32(float32x4_t x)
  {
      float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
-    sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
-    sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
-    
+    sqrt_reciprocal = vmulq_f32(
+        vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+    sqrt_reciprocal = vmulq_f32(
+        vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+
      return sqrt_reciprocal;
  }
  
@@ -108,19 +109,19 @@ static inline float32x4_t _vinvq_f32(float32x4_t x)
  {
      // Newton's method
      float32x4_t recip = vrecpeq_f32(x);
-    recip             = vmulq_f32(vrecpsq_f32(x, recip), recip);
-    recip             = vmulq_f32(vrecpsq_f32(x, recip), recip);
+    recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
+    recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
      return recip;
  }
  
  /* Complex multiplication for float32x4x2_t */
-static inline float32x4x2_t
-_vmultiply_complexq_f32(float32x4x2_t a_val, float32x4x2_t b_val)
+static inline float32x4x2_t _vmultiply_complexq_f32(float32x4x2_t a_val,
+                                                    float32x4x2_t b_val)
  {
      float32x4x2_t tmp_real;
      float32x4x2_t tmp_imag;
      float32x4x2_t c_val;
-    
+
      // multiply the real*real and imag*imag to get real result
      // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
      tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
@@ -140,12 +141,12 @@ _vmultiply_complexq_f32(float32x4x2_t a_val, float32x4x2_t b_val)
  /* From ARM Compute Library, MIT license */
  static inline float32x4_t _vtaylor_polyq_f32(float32x4_t x, const float32x4_t coeffs[8])
  {
-    float32x4_t cA   = vmlaq_f32(coeffs[0], coeffs[4], x);
-    float32x4_t cB   = vmlaq_f32(coeffs[2], coeffs[6], x);
-    float32x4_t cC   = vmlaq_f32(coeffs[1], coeffs[5], x);
-    float32x4_t cD   = vmlaq_f32(coeffs[3], coeffs[7], x);
-    float32x4_t x2  = vmulq_f32(x, x);
-    float32x4_t x4  = vmulq_f32(x2, x2);
+    float32x4_t cA = vmlaq_f32(coeffs[0], coeffs[4], x);
+    float32x4_t cB = vmlaq_f32(coeffs[2], coeffs[6], x);
+    float32x4_t cC = vmlaq_f32(coeffs[1], coeffs[5], x);
+    float32x4_t cD = vmlaq_f32(coeffs[3], coeffs[7], x);
+    float32x4_t x2 = vmulq_f32(x, x);
+    float32x4_t x4 = vmulq_f32(x2, x2);
      float32x4_t res = vmlaq_f32(vmlaq_f32(cA, cB, x2), vmlaq_f32(cC, cD, x2), x4);
      return res;
  }
@@ -155,121 +156,123 @@ static inline float32x4_t _vtaylor_polyq_f32(float32x4_t x, const float32x4_t co
  static inline float32x4_t _vlogq_f32(float32x4_t x)
  {
      const float32x4_t log_tab[8] = {
-        vdupq_n_f32(-2.29561495781f),
-        vdupq_n_f32(-2.47071170807f),
-        vdupq_n_f32(-5.68692588806f),
-        vdupq_n_f32(-0.165253549814f),
-        vdupq_n_f32(5.17591238022f),
-        vdupq_n_f32(0.844007015228f),
-        vdupq_n_f32(4.58445882797f),
-        vdupq_n_f32(0.0141278216615f),
+        vdupq_n_f32(-2.29561495781f), vdupq_n_f32(-2.47071170807f),
+        vdupq_n_f32(-5.68692588806f), vdupq_n_f32(-0.165253549814f),
+        vdupq_n_f32(5.17591238022f),  vdupq_n_f32(0.844007015228f),
+        vdupq_n_f32(4.58445882797f),  vdupq_n_f32(0.0141278216615f),
      };
-    
-    const int32x4_t   CONST_127 = vdupq_n_s32(127);           // 127
+
+    const int32x4_t CONST_127 = vdupq_n_s32(127);             // 127
      const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)
-    
+
      // Extract exponent
-    int32x4_t m = vsubq_s32(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127);
-    float32x4_t val = vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));
-    
+    int32x4_t m = vsubq_s32(
+        vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127);
+    float32x4_t val =
+        vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));
+
      // Polynomial Approximation
      float32x4_t poly = _vtaylor_polyq_f32(val, log_tab);
-    
+
      // Reconstruct
      poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);
-    
+
      return poly;
  }
  
  /* Evaluation of 4 sines & cosines at once.
   * Optimized from here (zlib license)
   * http://gruntthepeon.free.fr/ssemath/ */
-static inline float32x4x2_t _vsincosq_f32(float32x4_t x) {
+static inline float32x4x2_t _vsincosq_f32(float32x4_t x)
+{
      const float32x4_t c_minus_cephes_DP1 = vdupq_n_f32(-0.78515625);
      const float32x4_t c_minus_cephes_DP2 = vdupq_n_f32(-2.4187564849853515625e-4);
      const float32x4_t c_minus_cephes_DP3 = vdupq_n_f32(-3.77489497744594108e-8);
      const float32x4_t c_sincof_p0 = vdupq_n_f32(-1.9515295891e-4);
-    const float32x4_t c_sincof_p1  = vdupq_n_f32(8.3321608736e-3);
+    const float32x4_t c_sincof_p1 = vdupq_n_f32(8.3321608736e-3);
      const float32x4_t c_sincof_p2 = vdupq_n_f32(-1.6666654611e-1);
      const float32x4_t c_coscof_p0 = vdupq_n_f32(2.443315711809948e-005);
      const float32x4_t c_coscof_p1 = vdupq_n_f32(-1.388731625493765e-003);
      const float32x4_t c_coscof_p2 = vdupq_n_f32(4.166664568298827e-002);
      const float32x4_t c_cephes_FOPI = vdupq_n_f32(1.27323954473516); // 4 / M_PI
-    
+
      const float32x4_t CONST_1 = vdupq_n_f32(1.f);
      const float32x4_t CONST_1_2 = vdupq_n_f32(0.5f);
      const float32x4_t CONST_0 = vdupq_n_f32(0.f);
-    const uint32x4_t  CONST_2 = vdupq_n_u32(2);
-    const uint32x4_t  CONST_4 = vdupq_n_u32(4);
-    
+    const uint32x4_t CONST_2 = vdupq_n_u32(2);
+    const uint32x4_t CONST_4 = vdupq_n_u32(4);
+
      uint32x4_t emm2;
-    
+
      uint32x4_t sign_mask_sin, sign_mask_cos;
      sign_mask_sin = vcltq_f32(x, CONST_0);
      x = vabsq_f32(x);
      // scale by 4/pi
      float32x4_t y = vmulq_f32(x, c_cephes_FOPI);
-    
+
      // store the integer part of y in mm0
      emm2 = vcvtq_u32_f32(y);
      /* j=(j+1) & (~1) (see the cephes sources) */
      emm2 = vaddq_u32(emm2, vdupq_n_u32(1));
      emm2 = vandq_u32(emm2, vdupq_n_u32(~1));
      y = vcvtq_f32_u32(emm2);
-    
+
      /* get the polynom selection mask
       there is one polynom for 0 <= x <= Pi/4
       and another one for Pi/4<x<=Pi/2
       Both branches will be computed. */
      const uint32x4_t poly_mask = vtstq_u32(emm2, CONST_2);
-    
+
      // The magic pass: "Extended precision modular arithmetic"
      x = vmlaq_f32(x, y, c_minus_cephes_DP1);
      x = vmlaq_f32(x, y, c_minus_cephes_DP2);
      x = vmlaq_f32(x, y, c_minus_cephes_DP3);
-    
+
      sign_mask_sin = veorq_u32(sign_mask_sin, vtstq_u32(emm2, CONST_4));
      sign_mask_cos = vtstq_u32(vsubq_u32(emm2, CONST_2), CONST_4);
-    
+
      /* Evaluate the first polynom  (0 <= x <= Pi/4) in y1,
       and the second polynom      (Pi/4 <= x <= 0) in y2 */
      float32x4_t y1, y2;
-    float32x4_t z = vmulq_f32(x,x);
-    
+    float32x4_t z = vmulq_f32(x, x);
+
      y1 = vmlaq_f32(c_coscof_p1, z, c_coscof_p0);
      y1 = vmlaq_f32(c_coscof_p2, z, y1);
      y1 = vmulq_f32(y1, z);
      y1 = vmulq_f32(y1, z);
      y1 = vmlsq_f32(y1, z, CONST_1_2);
      y1 = vaddq_f32(y1, CONST_1);
-    
+
      y2 = vmlaq_f32(c_sincof_p1, z, c_sincof_p0);
      y2 = vmlaq_f32(c_sincof_p2, z, y2);
      y2 = vmulq_f32(y2, z);
      y2 = vmlaq_f32(x, x, y2);
-    
+
      /* select the correct result from the two polynoms */
      const float32x4_t ys = vbslq_f32(poly_mask, y1, y2);
      const float32x4_t yc = vbslq_f32(poly_mask, y2, y1);
-    
+
      float32x4x2_t sincos;
      sincos.val[0] = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
      sincos.val[1] = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));
-    
+
      return sincos;
  }
  
-static inline float32x4_t _vsinq_f32(float32x4_t x) {
+static inline float32x4_t _vsinq_f32(float32x4_t x)
+{
      const float32x4x2_t sincos = _vsincosq_f32(x);
      return sincos.val[0];
  }
  
-static inline float32x4_t _vcosq_f32(float32x4_t x) {
+static inline float32x4_t _vcosq_f32(float32x4_t x)
+{
      const float32x4x2_t sincos = _vsincosq_f32(x);
      return sincos.val[1];
  }
  
-static inline float32x4_t _vtanq_f32(float32x4_t x) {
+static inline float32x4_t _vtanq_f32(float32x4_t x)
+{
      const float32x4x2_t sincos = _vsincosq_f32(x);
      return vmulq_f32(sincos.val[0], _vinvq_f32(sincos.val[1]));
  }
diff --git a/include/volk/volk_prefs.h b/include/volk/volk_prefs.h

index cfa3806ee5ee50ed52c8f8fc1474449fb3b4d1e7..96b7f1c116c5f682f66f9b3ca0a03ef0b10f9c6e 100644 (file)
--- a/include/volk/volk_prefs.h
+++ b/include/volk/volk_prefs.h
@@ -1,17 +1,16 @@
  #ifndef INCLUDED_VOLK_PREFS_H
  #define INCLUDED_VOLK_PREFS_H
  
-#include <volk/volk_common.h>
  #include <stdbool.h>
  #include <stdlib.h>
+#include <volk/volk_common.h>
  
  __VOLK_DECL_BEGIN
  
-typedef struct volk_arch_pref
-{
-    char name[128];   //name of the kernel
-    char impl_a[128]; //best aligned impl
-    char impl_u[128]; //best unaligned impl
+typedef struct volk_arch_pref {
+    char name[128];   // name of the kernel
+    char impl_a[128]; // best aligned impl
+    char impl_u[128]; // best unaligned impl
  } volk_arch_pref_t;
  
  ////////////////////////////////////////////////////////////////////////
@@ -19,13 +18,13 @@ typedef struct volk_arch_pref
  // if config file should be tested on existence for reading.
  // returns \0 in the argument on failure.
  ////////////////////////////////////////////////////////////////////////
-VOLK_API void volk_get_config_path(char *, bool);
+VOLK_API void volk_get_config_path(char*, bool);
  
  ////////////////////////////////////////////////////////////////////////
  // load prefs into global prefs struct
  ////////////////////////////////////////////////////////////////////////
-VOLK_API size_t volk_load_preferences(volk_arch_pref_t **);
+VOLK_API size_t volk_load_preferences(volk_arch_pref_t**);
  
  __VOLK_DECL_END
  
-#endif //INCLUDED_VOLK_PREFS_H
+#endif // INCLUDED_VOLK_PREFS_H
diff --git a/include/volk/volk_sse3_intrinsics.h b/include/volk/volk_sse3_intrinsics.h

index 6b53a2a4586ab7604597f8f2c3ad250a75a0fc0d..6bdc8d81039a32f700bdb9902bf82a036e766997 100644 (file)
--- a/include/volk/volk_sse3_intrinsics.h
+++ b/include/volk/volk_sse3_intrinsics.h
@@ -1,19 +1,19 @@
  /* -*- c++ -*- */
-/* 
+/*
   * Copyright 2015 Free Software Foundation, Inc.
- * 
+ *
   * This file is part of GNU Radio
- * 
+ *
   * GNU Radio is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
   * the Free Software Foundation; either version 3, or (at your option)
   * any later version.
- * 
+ *
   * GNU Radio is distributed in the hope that it will be useful,
   * but WITHOUT ANY WARRANTY; without even the implied warranty of
   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   * GNU General Public License for more details.
- * 
+ *
   * You should have received a copy of the GNU General Public License
   * along with GNU Radio; see the file COPYING.  If not, write to
   * the Free Software Foundation, Inc., 51 Franklin Street,
@@ -29,49 +29,52 @@
  #define INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_
  #include <pmmintrin.h>
  
-static inline __m128
-_mm_complexmul_ps(__m128 x, __m128 y)
+static inline __m128 _mm_complexmul_ps(__m128 x, __m128 y)
  {
-  __m128 yl, yh, tmp1, tmp2;
-  yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
-  yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
-  tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
-  x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
-  tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-  return _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+    __m128 yl, yh, tmp1, tmp2;
+    yl = _mm_moveldup_ps(y);        // Load yl with cr,cr,dr,dr
+    yh = _mm_movehdup_ps(y);        // Load yh with ci,ci,di,di
+    tmp1 = _mm_mul_ps(x, yl);       // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+    x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+    tmp2 = _mm_mul_ps(x, yh);       // tmp2 = ai*ci,ar*ci,bi*di,br*di
+    return _mm_addsub_ps(tmp1,
+                         tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
  }
  
-static inline __m128
-_mm_complexconjugatemul_ps(__m128 x, __m128 y)
+static inline __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y)
  {
-  const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
-  y = _mm_xor_ps(y, conjugator); // conjugate y
-  return _mm_complexmul_ps(x, y);
+    const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
+    y = _mm_xor_ps(y, conjugator); // conjugate y
+    return _mm_complexmul_ps(x, y);
  }
  
-static inline __m128
-_mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2){
-  cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
-  cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
-  return _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+static inline __m128 _mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
+{
+    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+    return _mm_hadd_ps(cplxValue1, cplxValue2);      // Add the I2 and Q2 values
  }
  
-static inline __m128
-_mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2){
-  return _mm_sqrt_ps(_mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2));
+static inline __m128 _mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
+{
+    return _mm_sqrt_ps(_mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2));
  }
  
-static inline __m128
-_mm_scaled_norm_dist_ps_sse3(const __m128 symbols0, const __m128 symbols1, const __m128 points0, const __m128 points1, const __m128 scalar){
-  /*
-   * Calculate: |y - x|^2 * SNR_lin
-   * Consider 'symbolsX' and 'pointsX' to be complex float
-   * 'symbolsX' are 'y' and 'pointsX' are 'x'
-   */
-  const __m128 diff0 = _mm_sub_ps(symbols0, points0);
-  const __m128 diff1 = _mm_sub_ps(symbols1, points1);
-  const __m128 norms = _mm_magnitudesquared_ps_sse3(diff0, diff1);
-  return _mm_mul_ps(norms, scalar);
+static inline __m128 _mm_scaled_norm_dist_ps_sse3(const __m128 symbols0,
+                                                  const __m128 symbols1,
+                                                  const __m128 points0,
+                                                  const __m128 points1,
+                                                  const __m128 scalar)
+{
+    /*
+     * Calculate: |y - x|^2 * SNR_lin
+     * Consider 'symbolsX' and 'pointsX' to be complex float
+     * 'symbolsX' are 'y' and 'pointsX' are 'x'
+     */
+    const __m128 diff0 = _mm_sub_ps(symbols0, points0);
+    const __m128 diff1 = _mm_sub_ps(symbols1, points1);
+    const __m128 norms = _mm_magnitudesquared_ps_sse3(diff0, diff1);
+    return _mm_mul_ps(norms, scalar);
  }
  
  #endif /* INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_ */
diff --git a/include/volk/volk_sse_intrinsics.h b/include/volk/volk_sse_intrinsics.h

index 57318e2945f27bc0f7c357222416eb1f9e691809..24fe7c169f18f0c8badae925f4c8bc0dcd8fff67 100644 (file)
--- a/include/volk/volk_sse_intrinsics.h
+++ b/include/volk/volk_sse_intrinsics.h
@@ -1,19 +1,19 @@
  /* -*- c++ -*- */
-/* 
+/*
   * Copyright 2015 Free Software Foundation, Inc.
- * 
+ *
   * This file is part of GNU Radio
- * 
+ *
   * GNU Radio is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
   * the Free Software Foundation; either version 3, or (at your option)
   * any later version.
- * 
+ *
   * GNU Radio is distributed in the hope that it will be useful,
   * but WITHOUT ANY WARRANTY; without even the implied warranty of
   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   * GNU General Public License for more details.
- * 
+ *
   * You should have received a copy of the GNU General Public License
   * along with GNU Radio; see the file COPYING.  If not, write to
   * the Free Software Foundation, Inc., 51 Franklin Street,
@@ -29,31 +29,34 @@
  #define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
  #include <xmmintrin.h>
  
-static inline __m128
-_mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2){
-  __m128 iValue, qValue;
-  // Arrange in i1i2i3i4 format
-  iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-  // Arrange in q1q2q3q4 format
-  qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-  iValue = _mm_mul_ps(iValue, iValue); // Square the I values
-  qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
-  return _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+static inline __m128 _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2)
+{
+    __m128 iValue, qValue;
+    // Arrange in i1i2i3i4 format
+    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+    // Arrange in q1q2q3q4 format
+    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
+    iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+    qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+    return _mm_add_ps(iValue, qValue);   // Add the I2 and Q2 values
  }
  
-static inline __m128
-_mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2){
-  return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2));
+static inline __m128 _mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2)
+{
+    return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2));
  }
  
-static inline __m128
-_mm_scaled_norm_dist_ps_sse(const __m128 symbols0, const __m128 symbols1, const __m128 points0, const __m128 points1, const __m128 scalar)
+static inline __m128 _mm_scaled_norm_dist_ps_sse(const __m128 symbols0,
+                                                 const __m128 symbols1,
+                                                 const __m128 points0,
+                                                 const __m128 points1,
+                                                 const __m128 scalar)
  {
-  // calculate scalar * |x - y|^2
-  const __m128 diff0 = _mm_sub_ps(symbols0, points0);
-  const __m128 diff1 = _mm_sub_ps(symbols1, points1);
-  const __m128 norms = _mm_magnitudesquared_ps(diff0, diff1);
-  return _mm_mul_ps(norms, scalar);
+    // calculate scalar * |x - y|^2
+    const __m128 diff0 = _mm_sub_ps(symbols0, points0);
+    const __m128 diff1 = _mm_sub_ps(symbols1, points1);
+    const __m128 norms = _mm_magnitudesquared_ps(diff0, diff1);
+    return _mm_mul_ps(norms, scalar);
  }
  
  #endif /* INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_ */
diff --git a/kernels/volk/volk_16i_32fc_dot_prod_32fc.h b/kernels/volk/volk_16i_32fc_dot_prod_32fc.h

index f25034030bd16f4b8194af00da073c5dfd282e27..26356499a1659ee08c25cb9ec2d80c5992841650 100644 (file)
--- a/kernels/volk/volk_16i_32fc_dot_prod_32fc.h
+++ b/kernels/volk/volk_16i_32fc_dot_prod_32fc.h
@@ -33,8 +33,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_16i_32fc_dot_prod_32fc(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points)
- * \endcode
+ * void volk_16i_32fc_dot_prod_32fc(lv_32fc_t* result, const short* input, const lv_32fc_t
+ * * taps, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li input: vector of shorts.
@@ -58,165 +58,178 @@
  #ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H
  #define INCLUDED_volk_16i_32fc_dot_prod_32fc_H
  
-#include <volk/volk_common.h>
  #include <stdio.h>
+#include <volk/volk_common.h>
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) {
+static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result,
+                                                       const short* input,
+                                                       const lv_32fc_t* taps,
+                                                       unsigned int num_points)
+{
  
-  static const int N_UNROLL = 4;
+    static const int N_UNROLL = 4;
  
-  lv_32fc_t acc0 = 0;
-  lv_32fc_t acc1 = 0;
-  lv_32fc_t acc2 = 0;
-  lv_32fc_t acc3 = 0;
+    lv_32fc_t acc0 = 0;
+    lv_32fc_t acc1 = 0;
+    lv_32fc_t acc2 = 0;
+    lv_32fc_t acc3 = 0;
  
-  unsigned i = 0;
-  unsigned n = (num_points / N_UNROLL) * N_UNROLL;
+    unsigned i = 0;
+    unsigned n = (num_points / N_UNROLL) * N_UNROLL;
  
-  for(i = 0; i < n; i += N_UNROLL) {
-    acc0 += taps[i + 0] * (float)input[i + 0];
-    acc1 += taps[i + 1] * (float)input[i + 1];
-    acc2 += taps[i + 2] * (float)input[i + 2];
-    acc3 += taps[i + 3] * (float)input[i + 3];
-  }
+    for (i = 0; i < n; i += N_UNROLL) {
+        acc0 += taps[i + 0] * (float)input[i + 0];
+        acc1 += taps[i + 1] * (float)input[i + 1];
+        acc2 += taps[i + 2] * (float)input[i + 2];
+        acc3 += taps[i + 3] * (float)input[i + 3];
+    }
  
-  for(; i < num_points; i++) {
-    acc0 += taps[i] * (float)input[i];
-  }
+    for (; i < num_points; i++) {
+        acc0 += taps[i] * (float)input[i];
+    }
  
-  *result = acc0 + acc1 + acc2 + acc3;
+    *result = acc0 + acc1 + acc2 + acc3;
  }
  
  #endif /*LV_HAVE_GENERIC*/
  
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
-static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) {
-
-  unsigned ii;
-  unsigned quarter_points = num_points / 4;
-  lv_32fc_t* tapsPtr = (lv_32fc_t*) taps;
-  short* inputPtr = (short*) input;
-  lv_32fc_t accumulator_vec[4];
-
-  float32x4x2_t tapsVal, accumulator_val;
-  int16x4_t input16;
-  int32x4_t input32;
-  float32x4_t input_float, prod_re, prod_im;
-
-  accumulator_val.val[0] = vdupq_n_f32(0.0);
-  accumulator_val.val[1] = vdupq_n_f32(0.0);
-
-  for(ii = 0; ii < quarter_points; ++ii) {
-    tapsVal = vld2q_f32((float*)tapsPtr);
-    input16 = vld1_s16(inputPtr);
-    // widen 16-bit int to 32-bit int
-    input32 = vmovl_s16(input16);
-    // convert 32-bit int to float with scale
-    input_float = vcvtq_f32_s32(input32);
-
-    prod_re = vmulq_f32(input_float, tapsVal.val[0]);
-    prod_im = vmulq_f32(input_float, tapsVal.val[1]);
-
-    accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]);
-    accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]);
-
-    tapsPtr += 4;
-    inputPtr += 4;
-  }
-  vst2q_f32((float*)accumulator_vec, accumulator_val);
-  accumulator_vec[0] += accumulator_vec[1];
-  accumulator_vec[2] += accumulator_vec[3];
-  accumulator_vec[0] += accumulator_vec[2];
-
-  for(ii = quarter_points * 4; ii < num_points; ++ii) {
-    accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++));
-  }
-
-  *result = accumulator_vec[0];
+static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result,
+                                                    const short* input,
+                                                    const lv_32fc_t* taps,
+                                                    unsigned int num_points)
+{
+
+    unsigned ii;
+    unsigned quarter_points = num_points / 4;
+    lv_32fc_t* tapsPtr = (lv_32fc_t*)taps;
+    short* inputPtr = (short*)input;
+    lv_32fc_t accumulator_vec[4];
+
+    float32x4x2_t tapsVal, accumulator_val;
+    int16x4_t input16;
+    int32x4_t input32;
+    float32x4_t input_float, prod_re, prod_im;
+
+    accumulator_val.val[0] = vdupq_n_f32(0.0);
+    accumulator_val.val[1] = vdupq_n_f32(0.0);
+
+    for (ii = 0; ii < quarter_points; ++ii) {
+        tapsVal = vld2q_f32((float*)tapsPtr);
+        input16 = vld1_s16(inputPtr);
+        // widen 16-bit int to 32-bit int
+        input32 = vmovl_s16(input16);
+        // convert 32-bit int to float with scale
+        input_float = vcvtq_f32_s32(input32);
+
+        prod_re = vmulq_f32(input_float, tapsVal.val[0]);
+        prod_im = vmulq_f32(input_float, tapsVal.val[1]);
+
+        accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]);
+        accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]);
+
+        tapsPtr += 4;
+        inputPtr += 4;
+    }
+    vst2q_f32((float*)accumulator_vec, accumulator_val);
+    accumulator_vec[0] += accumulator_vec[1];
+    accumulator_vec[2] += accumulator_vec[3];
+    accumulator_vec[0] += accumulator_vec[2];
+
+    for (ii = quarter_points * 4; ii < num_points; ++ii) {
+        accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++));
+    }
+
+    *result = accumulator_vec[0];
  }
  
  #endif /*LV_HAVE_NEON*/
  
  #if LV_HAVE_SSE && LV_HAVE_MMX
  
-static inline void volk_16i_32fc_dot_prod_32fc_u_sse( lv_32fc_t* result, const  short* input, const  lv_32fc_t* taps, unsigned int num_points) {
-
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 8;
-
-  float res[2];
-  float *realpt = &res[0], *imagpt = &res[1];
-  const short* aPtr = input;
-  const float* bPtr = (float*)taps;
-
-  __m64  m0, m1;
-  __m128 f0, f1, f2, f3;
-  __m128 a0Val, a1Val, a2Val, a3Val;
-  __m128 b0Val, b1Val, b2Val, b3Val;
-  __m128 c0Val, c1Val, c2Val, c3Val;
-
-  __m128 dotProdVal0 = _mm_setzero_ps();
-  __m128 dotProdVal1 = _mm_setzero_ps();
-  __m128 dotProdVal2 = _mm_setzero_ps();
-  __m128 dotProdVal3 = _mm_setzero_ps();
-
-  for(;number < sixteenthPoints; number++){
-
-    m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0));
-    m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4));
-    f0 = _mm_cvtpi16_ps(m0);
-    f1 = _mm_cvtpi16_ps(m0);
-    f2 = _mm_cvtpi16_ps(m1);
-    f3 = _mm_cvtpi16_ps(m1);
-
-    a0Val = _mm_unpacklo_ps(f0, f1);
-    a1Val = _mm_unpackhi_ps(f0, f1);
-    a2Val = _mm_unpacklo_ps(f2, f3);
-    a3Val = _mm_unpackhi_ps(f2, f3);
-
-    b0Val = _mm_loadu_ps(bPtr);
-    b1Val = _mm_loadu_ps(bPtr+4);
-    b2Val = _mm_loadu_ps(bPtr+8);
-    b3Val = _mm_loadu_ps(bPtr+12);
-
-    c0Val = _mm_mul_ps(a0Val, b0Val);
-    c1Val = _mm_mul_ps(a1Val, b1Val);
-    c2Val = _mm_mul_ps(a2Val, b2Val);
-    c3Val = _mm_mul_ps(a3Val, b3Val);
-
-    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
-    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
-    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
-    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
-
-    aPtr += 8;
-    bPtr += 16;
-  }
-
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
-
-  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
-
-  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
-  *realpt = dotProductVector[0];
-  *imagpt = dotProductVector[1];
-  *realpt += dotProductVector[2];
-  *imagpt += dotProductVector[3];
-
-  number = sixteenthPoints*8;
-  for(;number < num_points; number++){
-    *realpt += ((*aPtr)   * (*bPtr++));
-    *imagpt += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = *(lv_32fc_t*)(&res[0]);
+static inline void volk_16i_32fc_dot_prod_32fc_u_sse(lv_32fc_t* result,
+                                                     const short* input,
+                                                     const lv_32fc_t* taps,
+                                                     unsigned int num_points)
+{
+
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 8;
+
+    float res[2];
+    float *realpt = &res[0], *imagpt = &res[1];
+    const short* aPtr = input;
+    const float* bPtr = (float*)taps;
+
+    __m64 m0, m1;
+    __m128 f0, f1, f2, f3;
+    __m128 a0Val, a1Val, a2Val, a3Val;
+    __m128 b0Val, b1Val, b2Val, b3Val;
+    __m128 c0Val, c1Val, c2Val, c3Val;
+
+    __m128 dotProdVal0 = _mm_setzero_ps();
+    __m128 dotProdVal1 = _mm_setzero_ps();
+    __m128 dotProdVal2 = _mm_setzero_ps();
+    __m128 dotProdVal3 = _mm_setzero_ps();
+
+    for (; number < sixteenthPoints; number++) {
+
+        m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
+        m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
+        f0 = _mm_cvtpi16_ps(m0);
+        f1 = _mm_cvtpi16_ps(m0);
+        f2 = _mm_cvtpi16_ps(m1);
+        f3 = _mm_cvtpi16_ps(m1);
+
+        a0Val = _mm_unpacklo_ps(f0, f1);
+        a1Val = _mm_unpackhi_ps(f0, f1);
+        a2Val = _mm_unpacklo_ps(f2, f3);
+        a3Val = _mm_unpackhi_ps(f2, f3);
+
+        b0Val = _mm_loadu_ps(bPtr);
+        b1Val = _mm_loadu_ps(bPtr + 4);
+        b2Val = _mm_loadu_ps(bPtr + 8);
+        b3Val = _mm_loadu_ps(bPtr + 12);
+
+        c0Val = _mm_mul_ps(a0Val, b0Val);
+        c1Val = _mm_mul_ps(a1Val, b1Val);
+        c2Val = _mm_mul_ps(a2Val, b2Val);
+        c3Val = _mm_mul_ps(a3Val, b3Val);
+
+        dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+        dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+        dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+        dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+        aPtr += 8;
+        bPtr += 16;
+    }
+
+    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+    _mm_store_ps(dotProductVector,
+                 dotProdVal0); // Store the results back into the dot product vector
+
+    *realpt = dotProductVector[0];
+    *imagpt = dotProductVector[1];
+    *realpt += dotProductVector[2];
+    *imagpt += dotProductVector[3];
+
+    number = sixteenthPoints * 8;
+    for (; number < num_points; number++) {
+        *realpt += ((*aPtr) * (*bPtr++));
+        *imagpt += ((*aPtr++) * (*bPtr++));
+    }
+
+    *result = *(lv_32fc_t*)(&res[0]);
  }
  
  #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
@@ -224,85 +237,90 @@ static inline void volk_16i_32fc_dot_prod_32fc_u_sse( lv_32fc_t* result, const
  
  #if LV_HAVE_AVX2 && LV_HAVE_FMA
  
-static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma( lv_32fc_t* result, const  short* input, const  lv_32fc_t* taps, unsigned int num_points) {
-
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float res[2];
-  float *realpt = &res[0], *imagpt = &res[1];
-  const short* aPtr = input;
-  const float* bPtr = (float*)taps;
-
-  __m128i  m0, m1;
-  __m256i f0, f1;
-  __m256  g0, g1, h0, h1, h2, h3;
-  __m256 a0Val, a1Val, a2Val, a3Val;
-  __m256 b0Val, b1Val, b2Val, b3Val;
-
-  __m256 dotProdVal0 = _mm256_setzero_ps();
-  __m256 dotProdVal1 = _mm256_setzero_ps();
-  __m256 dotProdVal2 = _mm256_setzero_ps();
-  __m256 dotProdVal3 = _mm256_setzero_ps();
-
-  for(;number < sixteenthPoints; number++){
-
-    m0 = _mm_loadu_si128((__m128i const*) aPtr);
-    m1 = _mm_loadu_si128((__m128i const*)(aPtr+8));
-
-    f0 = _mm256_cvtepi16_epi32(m0);
-    g0 = _mm256_cvtepi32_ps(f0);
-    f1 = _mm256_cvtepi16_epi32(m1);
-    g1 = _mm256_cvtepi32_ps(f1);
-
-    h0 = _mm256_unpacklo_ps(g0, g0);
-    h1 = _mm256_unpackhi_ps(g0, g0);
-    h2 = _mm256_unpacklo_ps(g1, g1);
-    h3 = _mm256_unpackhi_ps(g1, g1);
-
-    a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
-    a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
-    a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
-    a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
-
-    b0Val = _mm256_loadu_ps(bPtr);
-    b1Val = _mm256_loadu_ps(bPtr+8);
-    b2Val = _mm256_loadu_ps(bPtr+16);
-    b3Val = _mm256_loadu_ps(bPtr+24);
-
-    dotProdVal0 = _mm256_fmadd_ps(a0Val,b0Val,dotProdVal0);
-    dotProdVal1 = _mm256_fmadd_ps(a1Val,b1Val,dotProdVal1);
-    dotProdVal2 = _mm256_fmadd_ps(a2Val,b2Val,dotProdVal2);
-    dotProdVal3 = _mm256_fmadd_ps(a3Val,b3Val,dotProdVal3);
-
-    aPtr += 16;
-    bPtr += 32;
-  }
-
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
-
-  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
-
-  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
-  *realpt = dotProductVector[0];
-  *imagpt = dotProductVector[1];
-  *realpt += dotProductVector[2];
-  *imagpt += dotProductVector[3];
-  *realpt += dotProductVector[4];
-  *imagpt += dotProductVector[5];
-  *realpt += dotProductVector[6];
-  *imagpt += dotProductVector[7];
-
-  number = sixteenthPoints*16;
-  for(;number < num_points; number++){
-    *realpt += ((*aPtr)   * (*bPtr++));
-    *imagpt += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = *(lv_32fc_t*)(&res[0]);
+static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result,
+                                                          const short* input,
+                                                          const lv_32fc_t* taps,
+                                                          unsigned int num_points)
+{
+
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    float res[2];
+    float *realpt = &res[0], *imagpt = &res[1];
+    const short* aPtr = input;
+    const float* bPtr = (float*)taps;
+
+    __m128i m0, m1;
+    __m256i f0, f1;
+    __m256 g0, g1, h0, h1, h2, h3;
+    __m256 a0Val, a1Val, a2Val, a3Val;
+    __m256 b0Val, b1Val, b2Val, b3Val;
+
+    __m256 dotProdVal0 = _mm256_setzero_ps();
+    __m256 dotProdVal1 = _mm256_setzero_ps();
+    __m256 dotProdVal2 = _mm256_setzero_ps();
+    __m256 dotProdVal3 = _mm256_setzero_ps();
+
+    for (; number < sixteenthPoints; number++) {
+
+        m0 = _mm_loadu_si128((__m128i const*)aPtr);
+        m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));
+
+        f0 = _mm256_cvtepi16_epi32(m0);
+        g0 = _mm256_cvtepi32_ps(f0);
+        f1 = _mm256_cvtepi16_epi32(m1);
+        g1 = _mm256_cvtepi32_ps(f1);
+
+        h0 = _mm256_unpacklo_ps(g0, g0);
+        h1 = _mm256_unpackhi_ps(g0, g0);
+        h2 = _mm256_unpacklo_ps(g1, g1);
+        h3 = _mm256_unpackhi_ps(g1, g1);
+
+        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
+        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
+        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
+        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
+
+        b0Val = _mm256_loadu_ps(bPtr);
+        b1Val = _mm256_loadu_ps(bPtr + 8);
+        b2Val = _mm256_loadu_ps(bPtr + 16);
+        b3Val = _mm256_loadu_ps(bPtr + 24);
+
+        dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
+        dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
+        dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
+        dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
+
+        aPtr += 16;
+        bPtr += 32;
+    }
+
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+    _mm256_store_ps(dotProductVector,
+                    dotProdVal0); // Store the results back into the dot product vector
+
+    *realpt = dotProductVector[0];
+    *imagpt = dotProductVector[1];
+    *realpt += dotProductVector[2];
+    *imagpt += dotProductVector[3];
+    *realpt += dotProductVector[4];
+    *imagpt += dotProductVector[5];
+    *realpt += dotProductVector[6];
+    *imagpt += dotProductVector[7];
+
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        *realpt += ((*aPtr) * (*bPtr++));
+        *imagpt += ((*aPtr++) * (*bPtr++));
+    }
+
+    *result = *(lv_32fc_t*)(&res[0]);
  }
  
  #endif /*LV_HAVE_AVX2 && lV_HAVE_FMA*/
@@ -310,91 +328,96 @@ static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma( lv_32fc_t* result, co
  
  #ifdef LV_HAVE_AVX2
  
-static inline void volk_16i_32fc_dot_prod_32fc_u_avx2( lv_32fc_t* result, const  short* input, const  lv_32fc_t* taps, unsigned int num_points) {
-
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float res[2];
-  float *realpt = &res[0], *imagpt = &res[1];
-  const short* aPtr = input;
-  const float* bPtr = (float*)taps;
-
-  __m128i  m0, m1;
-  __m256i f0, f1;
-  __m256  g0, g1, h0, h1, h2, h3;
-  __m256 a0Val, a1Val, a2Val, a3Val;
-  __m256 b0Val, b1Val, b2Val, b3Val;
-  __m256 c0Val, c1Val, c2Val, c3Val;
-
-  __m256 dotProdVal0 = _mm256_setzero_ps();
-  __m256 dotProdVal1 = _mm256_setzero_ps();
-  __m256 dotProdVal2 = _mm256_setzero_ps();
-  __m256 dotProdVal3 = _mm256_setzero_ps();
-
-  for(;number < sixteenthPoints; number++){
-
-    m0 = _mm_loadu_si128((__m128i const*) aPtr);
-    m1 = _mm_loadu_si128((__m128i const*)(aPtr+8));
-
-    f0 = _mm256_cvtepi16_epi32(m0);
-    g0 = _mm256_cvtepi32_ps(f0);
-    f1 = _mm256_cvtepi16_epi32(m1);
-    g1 = _mm256_cvtepi32_ps(f1);
-
-    h0 = _mm256_unpacklo_ps(g0, g0);
-    h1 = _mm256_unpackhi_ps(g0, g0);
-    h2 = _mm256_unpacklo_ps(g1, g1);
-    h3 = _mm256_unpackhi_ps(g1, g1);
-
-    a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
-    a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
-    a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
-    a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
-
-    b0Val = _mm256_loadu_ps(bPtr);
-    b1Val = _mm256_loadu_ps(bPtr+8);
-    b2Val = _mm256_loadu_ps(bPtr+16);
-    b3Val = _mm256_loadu_ps(bPtr+24);
-
-    c0Val = _mm256_mul_ps(a0Val, b0Val);
-    c1Val = _mm256_mul_ps(a1Val, b1Val);
-    c2Val = _mm256_mul_ps(a2Val, b2Val);
-    c3Val = _mm256_mul_ps(a3Val, b3Val);
-
-    dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
-    dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
-    dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
-    dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
-
-    aPtr += 16;
-    bPtr += 32;
-  }
-
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
-
-  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
-
-  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
-  *realpt = dotProductVector[0];
-  *imagpt = dotProductVector[1];
-  *realpt += dotProductVector[2];
-  *imagpt += dotProductVector[3];
-  *realpt += dotProductVector[4];
-  *imagpt += dotProductVector[5];
-  *realpt += dotProductVector[6];
-  *imagpt += dotProductVector[7];
-
-  number = sixteenthPoints*16;
-  for(;number < num_points; number++){
-    *realpt += ((*aPtr)   * (*bPtr++));
-    *imagpt += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = *(lv_32fc_t*)(&res[0]);
+static inline void volk_16i_32fc_dot_prod_32fc_u_avx2(lv_32fc_t* result,
+                                                      const short* input,
+                                                      const lv_32fc_t* taps,
+                                                      unsigned int num_points)
+{
+
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    float res[2];
+    float *realpt = &res[0], *imagpt = &res[1];
+    const short* aPtr = input;
+    const float* bPtr = (float*)taps;
+
+    __m128i m0, m1;
+    __m256i f0, f1;
+    __m256 g0, g1, h0, h1, h2, h3;
+    __m256 a0Val, a1Val, a2Val, a3Val;
+    __m256 b0Val, b1Val, b2Val, b3Val;
+    __m256 c0Val, c1Val, c2Val, c3Val;
+
+    __m256 dotProdVal0 = _mm256_setzero_ps();
+    __m256 dotProdVal1 = _mm256_setzero_ps();
+    __m256 dotProdVal2 = _mm256_setzero_ps();
+    __m256 dotProdVal3 = _mm256_setzero_ps();
+
+    for (; number < sixteenthPoints; number++) {
+
+        m0 = _mm_loadu_si128((__m128i const*)aPtr);
+        m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));
+
+        f0 = _mm256_cvtepi16_epi32(m0);
+        g0 = _mm256_cvtepi32_ps(f0);
+        f1 = _mm256_cvtepi16_epi32(m1);
+        g1 = _mm256_cvtepi32_ps(f1);
+
+        h0 = _mm256_unpacklo_ps(g0, g0);
+        h1 = _mm256_unpackhi_ps(g0, g0);
+        h2 = _mm256_unpacklo_ps(g1, g1);
+        h3 = _mm256_unpackhi_ps(g1, g1);
+
+        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
+        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
+        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
+        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
+
+        b0Val = _mm256_loadu_ps(bPtr);
+        b1Val = _mm256_loadu_ps(bPtr + 8);
+        b2Val = _mm256_loadu_ps(bPtr + 16);
+        b3Val = _mm256_loadu_ps(bPtr + 24);
+
+        c0Val = _mm256_mul_ps(a0Val, b0Val);
+        c1Val = _mm256_mul_ps(a1Val, b1Val);
+        c2Val = _mm256_mul_ps(a2Val, b2Val);
+        c3Val = _mm256_mul_ps(a3Val, b3Val);
+
+        dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+        dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+        dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
+        dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
+
+        aPtr += 16;
+        bPtr += 32;
+    }
+
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+    _mm256_store_ps(dotProductVector,
+                    dotProdVal0); // Store the results back into the dot product vector
+
+    *realpt = dotProductVector[0];
+    *imagpt = dotProductVector[1];
+    *realpt += dotProductVector[2];
+    *imagpt += dotProductVector[3];
+    *realpt += dotProductVector[4];
+    *imagpt += dotProductVector[5];
+    *realpt += dotProductVector[6];
+    *imagpt += dotProductVector[7];
+
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        *realpt += ((*aPtr) * (*bPtr++));
+        *imagpt += ((*aPtr++) * (*bPtr++));
+    }
+
+    *result = *(lv_32fc_t*)(&res[0]);
  }
  
  #endif /*LV_HAVE_AVX2*/
@@ -403,171 +426,181 @@ static inline void volk_16i_32fc_dot_prod_32fc_u_avx2( lv_32fc_t* result, const
  #if LV_HAVE_SSE && LV_HAVE_MMX
  
  
-static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const  short* input, const  lv_32fc_t* taps, unsigned int num_points) {
-
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 8;
-
-  float res[2];
-  float *realpt = &res[0], *imagpt = &res[1];
-  const short* aPtr = input;
-  const float* bPtr = (float*)taps;
-
-  __m64  m0, m1;
-  __m128 f0, f1, f2, f3;
-  __m128 a0Val, a1Val, a2Val, a3Val;
-  __m128 b0Val, b1Val, b2Val, b3Val;
-  __m128 c0Val, c1Val, c2Val, c3Val;
-
-  __m128 dotProdVal0 = _mm_setzero_ps();
-  __m128 dotProdVal1 = _mm_setzero_ps();
-  __m128 dotProdVal2 = _mm_setzero_ps();
-  __m128 dotProdVal3 = _mm_setzero_ps();
-
-  for(;number < sixteenthPoints; number++){
-
-    m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0));
-    m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4));
-    f0 = _mm_cvtpi16_ps(m0);
-    f1 = _mm_cvtpi16_ps(m0);
-    f2 = _mm_cvtpi16_ps(m1);
-    f3 = _mm_cvtpi16_ps(m1);
-
-    a0Val = _mm_unpacklo_ps(f0, f1);
-    a1Val = _mm_unpackhi_ps(f0, f1);
-    a2Val = _mm_unpacklo_ps(f2, f3);
-    a3Val = _mm_unpackhi_ps(f2, f3);
-
-    b0Val = _mm_load_ps(bPtr);
-    b1Val = _mm_load_ps(bPtr+4);
-    b2Val = _mm_load_ps(bPtr+8);
-    b3Val = _mm_load_ps(bPtr+12);
-
-    c0Val = _mm_mul_ps(a0Val, b0Val);
-    c1Val = _mm_mul_ps(a1Val, b1Val);
-    c2Val = _mm_mul_ps(a2Val, b2Val);
-    c3Val = _mm_mul_ps(a3Val, b3Val);
-
-    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
-    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
-    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
-    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
-
-    aPtr += 8;
-    bPtr += 16;
-  }
-
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
-
-  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
-
-  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
-  *realpt = dotProductVector[0];
-  *imagpt = dotProductVector[1];
-  *realpt += dotProductVector[2];
-  *imagpt += dotProductVector[3];
-
-  number = sixteenthPoints*8;
-  for(;number < num_points; number++){
-    *realpt += ((*aPtr)   * (*bPtr++));
-    *imagpt += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = *(lv_32fc_t*)(&res[0]);
+static inline void volk_16i_32fc_dot_prod_32fc_a_sse(lv_32fc_t* result,
+                                                     const short* input,
+                                                     const lv_32fc_t* taps,
+                                                     unsigned int num_points)
+{
+
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 8;
+
+    float res[2];
+    float *realpt = &res[0], *imagpt = &res[1];
+    const short* aPtr = input;
+    const float* bPtr = (float*)taps;
+
+    __m64 m0, m1;
+    __m128 f0, f1, f2, f3;
+    __m128 a0Val, a1Val, a2Val, a3Val;
+    __m128 b0Val, b1Val, b2Val, b3Val;
+    __m128 c0Val, c1Val, c2Val, c3Val;
+
+    __m128 dotProdVal0 = _mm_setzero_ps();
+    __m128 dotProdVal1 = _mm_setzero_ps();
+    __m128 dotProdVal2 = _mm_setzero_ps();
+    __m128 dotProdVal3 = _mm_setzero_ps();
+
+    for (; number < sixteenthPoints; number++) {
+
+        m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
+        m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
+        f0 = _mm_cvtpi16_ps(m0);
+        f1 = _mm_cvtpi16_ps(m0);
+        f2 = _mm_cvtpi16_ps(m1);
+        f3 = _mm_cvtpi16_ps(m1);
+
+        a0Val = _mm_unpacklo_ps(f0, f1);
+        a1Val = _mm_unpackhi_ps(f0, f1);
+        a2Val = _mm_unpacklo_ps(f2, f3);
+        a3Val = _mm_unpackhi_ps(f2, f3);
+
+        b0Val = _mm_load_ps(bPtr);
+        b1Val = _mm_load_ps(bPtr + 4);
+        b2Val = _mm_load_ps(bPtr + 8);
+        b3Val = _mm_load_ps(bPtr + 12);
+
+        c0Val = _mm_mul_ps(a0Val, b0Val);
+        c1Val = _mm_mul_ps(a1Val, b1Val);
+        c2Val = _mm_mul_ps(a2Val, b2Val);
+        c3Val = _mm_mul_ps(a3Val, b3Val);
+
+        dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+        dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+        dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+        dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+        aPtr += 8;
+        bPtr += 16;
+    }
+
+    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+    _mm_store_ps(dotProductVector,
+                 dotProdVal0); // Store the results back into the dot product vector
+
+    *realpt = dotProductVector[0];
+    *imagpt = dotProductVector[1];
+    *realpt += dotProductVector[2];
+    *imagpt += dotProductVector[3];
+
+    number = sixteenthPoints * 8;
+    for (; number < num_points; number++) {
+        *realpt += ((*aPtr) * (*bPtr++));
+        *imagpt += ((*aPtr++) * (*bPtr++));
+    }
+
+    *result = *(lv_32fc_t*)(&res[0]);
  }
  
  #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
  
  #ifdef LV_HAVE_AVX2
  
-static inline void volk_16i_32fc_dot_prod_32fc_a_avx2( lv_32fc_t* result, const  short* input, const  lv_32fc_t* taps, unsigned int num_points) {
-
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float res[2];
-  float *realpt = &res[0], *imagpt = &res[1];
-  const short* aPtr = input;
-  const float* bPtr = (float*)taps;
-
-  __m128i  m0, m1;
-  __m256i f0, f1;
-  __m256  g0, g1, h0, h1, h2, h3;
-  __m256 a0Val, a1Val, a2Val, a3Val;
-  __m256 b0Val, b1Val, b2Val, b3Val;
-  __m256 c0Val, c1Val, c2Val, c3Val;
-
-  __m256 dotProdVal0 = _mm256_setzero_ps();
-  __m256 dotProdVal1 = _mm256_setzero_ps();
-  __m256 dotProdVal2 = _mm256_setzero_ps();
-  __m256 dotProdVal3 = _mm256_setzero_ps();
-
-  for(;number < sixteenthPoints; number++){
-
-    m0 = _mm_load_si128((__m128i const*) aPtr);
-    m1 = _mm_load_si128((__m128i const*)(aPtr+8));
-
-    f0 = _mm256_cvtepi16_epi32(m0);
-    g0 = _mm256_cvtepi32_ps(f0);
-    f1 = _mm256_cvtepi16_epi32(m1);
-    g1 = _mm256_cvtepi32_ps(f1);
-
-    h0 = _mm256_unpacklo_ps(g0, g0);
-    h1 = _mm256_unpackhi_ps(g0, g0);
-    h2 = _mm256_unpacklo_ps(g1, g1);
-    h3 = _mm256_unpackhi_ps(g1, g1);
-
-    a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
-    a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
-    a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
-    a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
-
-    b0Val = _mm256_load_ps(bPtr);
-    b1Val = _mm256_load_ps(bPtr+8);
-    b2Val = _mm256_load_ps(bPtr+16);
-    b3Val = _mm256_load_ps(bPtr+24);
-
-    c0Val = _mm256_mul_ps(a0Val, b0Val);
-    c1Val = _mm256_mul_ps(a1Val, b1Val);
-    c2Val = _mm256_mul_ps(a2Val, b2Val);
-    c3Val = _mm256_mul_ps(a3Val, b3Val);
-
-    dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
-    dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
-    dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
-    dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
-
-    aPtr += 16;
-    bPtr += 32;
-  }
-
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
-
-  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
-
-  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
-  *realpt = dotProductVector[0];
-  *imagpt = dotProductVector[1];
-  *realpt += dotProductVector[2];
-  *imagpt += dotProductVector[3];
-  *realpt += dotProductVector[4];
-  *imagpt += dotProductVector[5];
-  *realpt += dotProductVector[6];
-  *imagpt += dotProductVector[7];
-
-  number = sixteenthPoints*16;
-  for(;number < num_points; number++){
-    *realpt += ((*aPtr)   * (*bPtr++));
-    *imagpt += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = *(lv_32fc_t*)(&res[0]);
+static inline void volk_16i_32fc_dot_prod_32fc_a_avx2(lv_32fc_t* result,
+                                                      const short* input,
+                                                      const lv_32fc_t* taps,
+                                                      unsigned int num_points)
+{
+
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    float res[2];
+    float *realpt = &res[0], *imagpt = &res[1];
+    const short* aPtr = input;
+    const float* bPtr = (float*)taps;
+
+    __m128i m0, m1;
+    __m256i f0, f1;
+    __m256 g0, g1, h0, h1, h2, h3;
+    __m256 a0Val, a1Val, a2Val, a3Val;
+    __m256 b0Val, b1Val, b2Val, b3Val;
+    __m256 c0Val, c1Val, c2Val, c3Val;
+
+    __m256 dotProdVal0 = _mm256_setzero_ps();
+    __m256 dotProdVal1 = _mm256_setzero_ps();
+    __m256 dotProdVal2 = _mm256_setzero_ps();
+    __m256 dotProdVal3 = _mm256_setzero_ps();
+
+    for (; number < sixteenthPoints; number++) {
+
+        m0 = _mm_load_si128((__m128i const*)aPtr);
+        m1 = _mm_load_si128((__m128i const*)(aPtr + 8));
+
+        f0 = _mm256_cvtepi16_epi32(m0);
+        g0 = _mm256_cvtepi32_ps(f0);
+        f1 = _mm256_cvtepi16_epi32(m1);
+        g1 = _mm256_cvtepi32_ps(f1);
+
+        h0 = _mm256_unpacklo_ps(g0, g0);
+        h1 = _mm256_unpackhi_ps(g0, g0);
+        h2 = _mm256_unpacklo_ps(g1, g1);
+        h3 = _mm256_unpackhi_ps(g1, g1);
+
+        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
+        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
+        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
+        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
+
+        b0Val = _mm256_load_ps(bPtr);
+        b1Val = _mm256_load_ps(bPtr + 8);
+        b2Val = _mm256_load_ps(bPtr + 16);
+        b3Val = _mm256_load_ps(bPtr + 24);
+
+        c0Val = _mm256_mul_ps(a0Val, b0Val);
+        c1Val = _mm256_mul_ps(a1Val, b1Val);
+        c2Val = _mm256_mul_ps(a2Val, b2Val);
+        c3Val = _mm256_mul_ps(a3Val, b3Val);
+
+        dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+        dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+        dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
+        dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
+
+        aPtr += 16;
+        bPtr += 32;
+    }
+
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+    _mm256_store_ps(dotProductVector,
+                    dotProdVal0); // Store the results back into the dot product vector
+
+    *realpt = dotProductVector[0];
+    *imagpt = dotProductVector[1];
+    *realpt += dotProductVector[2];
+    *imagpt += dotProductVector[3];
+    *realpt += dotProductVector[4];
+    *imagpt += dotProductVector[5];
+    *realpt += dotProductVector[6];
+    *imagpt += dotProductVector[7];
+
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        *realpt += ((*aPtr) * (*bPtr++));
+        *imagpt += ((*aPtr++) * (*bPtr++));
+    }
+
+    *result = *(lv_32fc_t*)(&res[0]);
  }
  
  
@@ -575,85 +608,90 @@ static inline void volk_16i_32fc_dot_prod_32fc_a_avx2( lv_32fc_t* result, const
  
  #if LV_HAVE_AVX2 && LV_HAVE_FMA
  
-static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma( lv_32fc_t* result, const  short* input, const  lv_32fc_t* taps, unsigned int num_points) {
-
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float res[2];
-  float *realpt = &res[0], *imagpt = &res[1];
-  const short* aPtr = input;
-  const float* bPtr = (float*)taps;
-
-  __m128i  m0, m1;
-  __m256i f0, f1;
-  __m256  g0, g1, h0, h1, h2, h3;
-  __m256 a0Val, a1Val, a2Val, a3Val;
-  __m256 b0Val, b1Val, b2Val, b3Val;
-
-  __m256 dotProdVal0 = _mm256_setzero_ps();
-  __m256 dotProdVal1 = _mm256_setzero_ps();
-  __m256 dotProdVal2 = _mm256_setzero_ps();
-  __m256 dotProdVal3 = _mm256_setzero_ps();
-
-  for(;number < sixteenthPoints; number++){
-
-    m0 = _mm_load_si128((__m128i const*) aPtr);
-    m1 = _mm_load_si128((__m128i const*)(aPtr+8));
-
-    f0 = _mm256_cvtepi16_epi32(m0);
-    g0 = _mm256_cvtepi32_ps(f0);
-    f1 = _mm256_cvtepi16_epi32(m1);
-    g1 = _mm256_cvtepi32_ps(f1);
-
-    h0 = _mm256_unpacklo_ps(g0, g0);
-    h1 = _mm256_unpackhi_ps(g0, g0);
-    h2 = _mm256_unpacklo_ps(g1, g1);
-    h3 = _mm256_unpackhi_ps(g1, g1);
-
-    a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
-    a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
-    a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
-    a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
-
-    b0Val = _mm256_load_ps(bPtr);
-    b1Val = _mm256_load_ps(bPtr+8);
-    b2Val = _mm256_load_ps(bPtr+16);
-    b3Val = _mm256_load_ps(bPtr+24);
-
-    dotProdVal0 = _mm256_fmadd_ps(a0Val,b0Val,dotProdVal0);
-    dotProdVal1 = _mm256_fmadd_ps(a1Val,b1Val,dotProdVal1);
-    dotProdVal2 = _mm256_fmadd_ps(a2Val,b2Val,dotProdVal2);
-    dotProdVal3 = _mm256_fmadd_ps(a3Val,b3Val,dotProdVal3);
-
-    aPtr += 16;
-    bPtr += 32;
-  }
-
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
-
-  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
-
-  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
-  *realpt = dotProductVector[0];
-  *imagpt = dotProductVector[1];
-  *realpt += dotProductVector[2];
-  *imagpt += dotProductVector[3];
-  *realpt += dotProductVector[4];
-  *imagpt += dotProductVector[5];
-  *realpt += dotProductVector[6];
-  *imagpt += dotProductVector[7];
-
-  number = sixteenthPoints*16;
-  for(;number < num_points; number++){
-    *realpt += ((*aPtr)   * (*bPtr++));
-    *imagpt += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = *(lv_32fc_t*)(&res[0]);
+static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
+                                                          const short* input,
+                                                          const lv_32fc_t* taps,
+                                                          unsigned int num_points)
+{
+
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    float res[2];
+    float *realpt = &res[0], *imagpt = &res[1];
+    const short* aPtr = input;
+    const float* bPtr = (float*)taps;
+
+    __m128i m0, m1;
+    __m256i f0, f1;
+    __m256 g0, g1, h0, h1, h2, h3;
+    __m256 a0Val, a1Val, a2Val, a3Val;
+    __m256 b0Val, b1Val, b2Val, b3Val;
+
+    __m256 dotProdVal0 = _mm256_setzero_ps();
+    __m256 dotProdVal1 = _mm256_setzero_ps();
+    __m256 dotProdVal2 = _mm256_setzero_ps();
+    __m256 dotProdVal3 = _mm256_setzero_ps();
+
+    for (; number < sixteenthPoints; number++) {
+
+        m0 = _mm_load_si128((__m128i const*)aPtr);
+        m1 = _mm_load_si128((__m128i const*)(aPtr + 8));
+
+        f0 = _mm256_cvtepi16_epi32(m0);
+        g0 = _mm256_cvtepi32_ps(f0);
+        f1 = _mm256_cvtepi16_epi32(m1);
+        g1 = _mm256_cvtepi32_ps(f1);
+
+        h0 = _mm256_unpacklo_ps(g0, g0);
+        h1 = _mm256_unpackhi_ps(g0, g0);
+        h2 = _mm256_unpacklo_ps(g1, g1);
+        h3 = _mm256_unpackhi_ps(g1, g1);
+
+        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
+        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
+        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
+        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
+
+        b0Val = _mm256_load_ps(bPtr);
+        b1Val = _mm256_load_ps(bPtr + 8);
+        b2Val = _mm256_load_ps(bPtr + 16);
+        b3Val = _mm256_load_ps(bPtr + 24);
+
+        dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
+        dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
+        dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
+        dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
+
+        aPtr += 16;
+        bPtr += 32;
+    }
+
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+    _mm256_store_ps(dotProductVector,
+                    dotProdVal0); // Store the results back into the dot product vector
+
+    *realpt = dotProductVector[0];
+    *imagpt = dotProductVector[1];
+    *realpt += dotProductVector[2];
+    *imagpt += dotProductVector[3];
+    *realpt += dotProductVector[4];
+    *imagpt += dotProductVector[5];
+    *realpt += dotProductVector[6];
+    *imagpt += dotProductVector[7];
+
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        *realpt += ((*aPtr) * (*bPtr++));
+        *imagpt += ((*aPtr++) * (*bPtr++));
+    }
+
+    *result = *(lv_32fc_t*)(&res[0]);
  }
  
  
diff --git a/kernels/volk/volk_16i_branch_4_state_8.h b/kernels/volk/volk_16i_branch_4_state_8.h

index 31b66cc948bc24a8f7b9b1242a2d93ac7ba7a383..4d00b6b61af67726e4fb080a48b5ac49a538b787 100644 (file)
--- a/kernels/volk/volk_16i_branch_4_state_8.h
+++ b/kernels/volk/volk_16i_branch_4_state_8.h
@@ -29,8 +29,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_16i_branch_4_state_8(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars)
- * \endcode
+ * void volk_16i_branch_4_state_8(short* target, short* src0, char** permuters, short*
+ * cntl2, short* cntl3, short* scalars) \endcode
   *
   * \b Inputs
   * \li src0: <FIXME>
@@ -61,155 +61,154 @@
  
  #ifdef LV_HAVE_SSSE3
  
-#include <xmmintrin.h>
  #include <emmintrin.h>
  #include <tmmintrin.h>
+#include <xmmintrin.h>
  
-static inline void
-volk_16i_branch_4_state_8_a_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars)
+static inline void volk_16i_branch_4_state_8_a_ssse3(short* target,
+                                                     short* src0,
+                                                     char** permuters,
+                                                     short* cntl2,
+                                                     short* cntl3,
+                                                     short* scalars)
  {
-  __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11;
-  __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars;
+    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11;
+    __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars;
  
-  p_target = (__m128i*)target;
-  p_src0 = (__m128i*)src0;
-  p_cntl2 = (__m128i*)cntl2;
-  p_cntl3 = (__m128i*)cntl3;
-  p_scalars = (__m128i*)scalars;
+    p_target = (__m128i*)target;
+    p_src0 = (__m128i*)src0;
+    p_cntl2 = (__m128i*)cntl2;
+    p_cntl3 = (__m128i*)cntl3;
+    p_scalars = (__m128i*)scalars;
  
-  xmm0 = _mm_load_si128(p_scalars);
+    xmm0 = _mm_load_si128(p_scalars);
  
-  xmm1 = _mm_shufflelo_epi16(xmm0, 0);
-  xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
-  xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
-  xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
+    xmm1 = _mm_shufflelo_epi16(xmm0, 0);
+    xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
+    xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
+    xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
  
-  xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
-  xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
-  xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
-  xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
+    xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
+    xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
+    xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
+    xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
  
-  xmm0 = _mm_load_si128((__m128i*)permuters[0]);
-  xmm6 = _mm_load_si128((__m128i*)permuters[1]);
-  xmm8 = _mm_load_si128((__m128i*)permuters[2]);
-  xmm10 = _mm_load_si128((__m128i*)permuters[3]);
+    xmm0 = _mm_load_si128((__m128i*)permuters[0]);
+    xmm6 = _mm_load_si128((__m128i*)permuters[1]);
+    xmm8 = _mm_load_si128((__m128i*)permuters[2]);
+    xmm10 = _mm_load_si128((__m128i*)permuters[3]);
  
-  xmm5 = _mm_load_si128(p_src0);
-  xmm0 = _mm_shuffle_epi8(xmm5, xmm0);
-  xmm6 = _mm_shuffle_epi8(xmm5, xmm6);
-  xmm8 = _mm_shuffle_epi8(xmm5, xmm8);
-  xmm10 = _mm_shuffle_epi8(xmm5, xmm10);
+    xmm5 = _mm_load_si128(p_src0);
+    xmm0 = _mm_shuffle_epi8(xmm5, xmm0);
+    xmm6 = _mm_shuffle_epi8(xmm5, xmm6);
+    xmm8 = _mm_shuffle_epi8(xmm5, xmm8);
+    xmm10 = _mm_shuffle_epi8(xmm5, xmm10);
  
-  xmm5 = _mm_add_epi16(xmm1, xmm2);
+    xmm5 = _mm_add_epi16(xmm1, xmm2);
  
-  xmm6 = _mm_add_epi16(xmm2, xmm6);
-  xmm8 = _mm_add_epi16(xmm1, xmm8);
+    xmm6 = _mm_add_epi16(xmm2, xmm6);
+    xmm8 = _mm_add_epi16(xmm1, xmm8);
  
-  xmm7 = _mm_load_si128(p_cntl2);
-  xmm9 = _mm_load_si128(p_cntl3);
+    xmm7 = _mm_load_si128(p_cntl2);
+    xmm9 = _mm_load_si128(p_cntl3);
  
-  xmm0 = _mm_add_epi16(xmm5, xmm0);
+    xmm0 = _mm_add_epi16(xmm5, xmm0);
  
-  xmm7 = _mm_and_si128(xmm7, xmm3);
-  xmm9 = _mm_and_si128(xmm9, xmm4);
+    xmm7 = _mm_and_si128(xmm7, xmm3);
+    xmm9 = _mm_and_si128(xmm9, xmm4);
  
-  xmm5 = _mm_load_si128(&p_cntl2[1]);
-  xmm11 = _mm_load_si128(&p_cntl3[1]);
+    xmm5 = _mm_load_si128(&p_cntl2[1]);
+    xmm11 = _mm_load_si128(&p_cntl3[1]);
  
-  xmm7 = _mm_add_epi16(xmm7, xmm9);
+    xmm7 = _mm_add_epi16(xmm7, xmm9);
  
-  xmm5 = _mm_and_si128(xmm5, xmm3);
-  xmm11 = _mm_and_si128(xmm11, xmm4);
+    xmm5 = _mm_and_si128(xmm5, xmm3);
+    xmm11 = _mm_and_si128(xmm11, xmm4);
  
-  xmm0 = _mm_add_epi16(xmm0, xmm7);
+    xmm0 = _mm_add_epi16(xmm0, xmm7);
  
  
-  xmm7 = _mm_load_si128(&p_cntl2[2]);
-  xmm9 = _mm_load_si128(&p_cntl3[2]);
+    xmm7 = _mm_load_si128(&p_cntl2[2]);
+    xmm9 = _mm_load_si128(&p_cntl3[2]);
  
-  xmm5 = _mm_add_epi16(xmm5, xmm11);
+    xmm5 = _mm_add_epi16(xmm5, xmm11);
  
-  xmm7 = _mm_and_si128(xmm7, xmm3);
-  xmm9 = _mm_and_si128(xmm9, xmm4);
+    xmm7 = _mm_and_si128(xmm7, xmm3);
+    xmm9 = _mm_and_si128(xmm9, xmm4);
  
-  xmm6 = _mm_add_epi16(xmm6, xmm5);
+    xmm6 = _mm_add_epi16(xmm6, xmm5);
  
  
-  xmm5 = _mm_load_si128(&p_cntl2[3]);
-  xmm11 = _mm_load_si128(&p_cntl3[3]);
+    xmm5 = _mm_load_si128(&p_cntl2[3]);
+    xmm11 = _mm_load_si128(&p_cntl3[3]);
  
-  xmm7 = _mm_add_epi16(xmm7, xmm9);
+    xmm7 = _mm_add_epi16(xmm7, xmm9);
  
-  xmm5 = _mm_and_si128(xmm5, xmm3);
-  xmm11 = _mm_and_si128(xmm11, xmm4);
+    xmm5 = _mm_and_si128(xmm5, xmm3);
+    xmm11 = _mm_and_si128(xmm11, xmm4);
  
-  xmm8 = _mm_add_epi16(xmm8, xmm7);
+    xmm8 = _mm_add_epi16(xmm8, xmm7);
  
-  xmm5 = _mm_add_epi16(xmm5, xmm11);
+    xmm5 = _mm_add_epi16(xmm5, xmm11);
  
-  _mm_store_si128(p_target, xmm0);
-  _mm_store_si128(&p_target[1], xmm6);
+    _mm_store_si128(p_target, xmm0);
+    _mm_store_si128(&p_target[1], xmm6);
  
-  xmm10 = _mm_add_epi16(xmm5, xmm10);
+    xmm10 = _mm_add_epi16(xmm5, xmm10);
  
-  _mm_store_si128(&p_target[2], xmm8);
+    _mm_store_si128(&p_target[2], xmm8);
  
-  _mm_store_si128(&p_target[3], xmm10);
+    _mm_store_si128(&p_target[3], xmm10);
  }
  
  
  #endif /*LV_HAVE_SSEs*/
  
  #ifdef LV_HAVE_GENERIC
-static inline  void
-volk_16i_branch_4_state_8_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars)
+static inline void volk_16i_branch_4_state_8_generic(short* target,
+                                                     short* src0,
+                                                     char** permuters,
+                                                     short* cntl2,
+                                                     short* cntl3,
+                                                     short* scalars)
  {
-  int i = 0;
-
-  int bound = 4;
-
-  for(; i < bound; ++i) {
-    target[i* 8] = src0[((char)permuters[i][0])/2]
-      + ((i + 1)%2  * scalars[0])
-      + (((i >> 1)^1) * scalars[1])
-      + (cntl2[i * 8] & scalars[2])
-      + (cntl3[i * 8] & scalars[3]);
-    target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2]
-      + ((i + 1)%2  * scalars[0])
-      + (((i >> 1)^1) * scalars[1])
-      + (cntl2[i * 8 + 1] & scalars[2])
-      + (cntl3[i * 8 + 1] & scalars[3]);
-    target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2]
-      + ((i + 1)%2  * scalars[0])
-      + (((i >> 1)^1) * scalars[1])
-      + (cntl2[i * 8 + 2] & scalars[2])
-      + (cntl3[i * 8 + 2] & scalars[3]);
-    target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2]
-      + ((i + 1)%2  * scalars[0])
-      + (((i >> 1)^1) * scalars[1])
-      + (cntl2[i * 8 + 3] & scalars[2])
-      + (cntl3[i * 8 + 3] & scalars[3]);
-    target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2]
-      + ((i + 1)%2  * scalars[0])
-      + (((i >> 1)^1) * scalars[1])
-      + (cntl2[i * 8 + 4] & scalars[2])
-      + (cntl3[i * 8 + 4] & scalars[3]);
-    target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2]
-      + ((i + 1)%2  * scalars[0])
-      + (((i >> 1)^1) * scalars[1])
-      + (cntl2[i * 8 + 5] & scalars[2])
-      + (cntl3[i * 8 + 5] & scalars[3]);
-    target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2]
-      + ((i + 1)%2  * scalars[0])
-      + (((i >> 1)^1) * scalars[1])
-      + (cntl2[i * 8 + 6] & scalars[2])
-      + (cntl3[i * 8 + 6] & scalars[3]);
-    target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2]
-      + ((i + 1)%2  * scalars[0])
-      + (((i >> 1)^1) * scalars[1])
-      + (cntl2[i * 8 + 7] & scalars[2])
-      + (cntl3[i * 8 + 7] & scalars[3]);
-  }
+    int i = 0;
+
+    int bound = 4;
+
+    for (; i < bound; ++i) {
+        target[i * 8] = src0[((char)permuters[i][0]) / 2] + ((i + 1) % 2 * scalars[0]) +
+                        (((i >> 1) ^ 1) * scalars[1]) + (cntl2[i * 8] & scalars[2]) +
+                        (cntl3[i * 8] & scalars[3]);
+        target[i * 8 + 1] = src0[((char)permuters[i][1 * 2]) / 2] +
+                            ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
+                            (cntl2[i * 8 + 1] & scalars[2]) +
+                            (cntl3[i * 8 + 1] & scalars[3]);
+        target[i * 8 + 2] = src0[((char)permuters[i][2 * 2]) / 2] +
+                            ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
+                            (cntl2[i * 8 + 2] & scalars[2]) +
+                            (cntl3[i * 8 + 2] & scalars[3]);
+        target[i * 8 + 3] = src0[((char)permuters[i][3 * 2]) / 2] +
+                            ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
+                            (cntl2[i * 8 + 3] & scalars[2]) +
+                            (cntl3[i * 8 + 3] & scalars[3]);
+        target[i * 8 + 4] = src0[((char)permuters[i][4 * 2]) / 2] +
+                            ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
+                            (cntl2[i * 8 + 4] & scalars[2]) +
+                            (cntl3[i * 8 + 4] & scalars[3]);
+        target[i * 8 + 5] = src0[((char)permuters[i][5 * 2]) / 2] +
+                            ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
+                            (cntl2[i * 8 + 5] & scalars[2]) +
+                            (cntl3[i * 8 + 5] & scalars[3]);
+        target[i * 8 + 6] = src0[((char)permuters[i][6 * 2]) / 2] +
+                            ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
+                            (cntl2[i * 8 + 6] & scalars[2]) +
+                            (cntl3[i * 8 + 6] & scalars[3]);
+        target[i * 8 + 7] = src0[((char)permuters[i][7 * 2]) / 2] +
+                            ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
+                            (cntl2[i * 8 + 7] & scalars[2]) +
+                            (cntl3[i * 8 + 7] & scalars[3]);
+    }
  }
  
  #endif /*LV_HAVE_GENERIC*/
diff --git a/kernels/volk/volk_16i_convert_8i.h b/kernels/volk/volk_16i_convert_8i.h

index e2f953b777838d30ff77702dc14391c6ed84bc78..f09515de70ea36a08e0ecd33c7094b29cb352e5b 100644 (file)
--- a/kernels/volk/volk_16i_convert_8i.h
+++ b/kernels/volk/volk_16i_convert_8i.h
@@ -29,8 +29,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_16i_convert_8i(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
- * \endcode
+ * void volk_16i_convert_8i(int8_t* outputVector, const int16_t* inputVector, unsigned int
+ * num_points) \endcode
   *
   * \b Inputs
   * \li inputVector: The input vector of 16-bit shorts.
@@ -59,39 +59,42 @@
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_16i_convert_8i_u_avx2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
+static inline void volk_16i_convert_8i_u_avx2(int8_t* outputVector,
+                                              const int16_t* inputVector,
+                                              unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int thirtysecondPoints = num_points / 32;
+    unsigned int number = 0;
+    const unsigned int thirtysecondPoints = num_points / 32;
  
-  int8_t* outputVectorPtr = outputVector;
-  int16_t* inputPtr = (int16_t*)inputVector;
-  __m256i inputVal1;
-  __m256i inputVal2;
-  __m256i ret;
+    int8_t* outputVectorPtr = outputVector;
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m256i inputVal1;
+    __m256i inputVal2;
+    __m256i ret;
  
-  for(;number < thirtysecondPoints; number++){
+    for (; number < thirtysecondPoints; number++) {
  
-    // Load the 16 values
-    inputVal1 = _mm256_loadu_si256((__m256i*)inputPtr); inputPtr += 16;
-    inputVal2 = _mm256_loadu_si256((__m256i*)inputPtr); inputPtr += 16;
+        // Load the 16 values
+        inputVal1 = _mm256_loadu_si256((__m256i*)inputPtr);
+        inputPtr += 16;
+        inputVal2 = _mm256_loadu_si256((__m256i*)inputPtr);
+        inputPtr += 16;
  
-    inputVal1 = _mm256_srai_epi16(inputVal1, 8);
-    inputVal2 = _mm256_srai_epi16(inputVal2, 8);
+        inputVal1 = _mm256_srai_epi16(inputVal1, 8);
+        inputVal2 = _mm256_srai_epi16(inputVal2, 8);
  
-    ret = _mm256_packs_epi16(inputVal1, inputVal2);
-    ret = _mm256_permute4x64_epi64(ret, 0b11011000);
+        ret = _mm256_packs_epi16(inputVal1, inputVal2);
+        ret = _mm256_permute4x64_epi64(ret, 0b11011000);
  
-    _mm256_storeu_si256((__m256i*)outputVectorPtr, ret);
+        _mm256_storeu_si256((__m256i*)outputVectorPtr, ret);
  
-    outputVectorPtr += 32;
-  }
+        outputVectorPtr += 32;
+    }
  
-  number = thirtysecondPoints * 32;
-  for(; number < num_points; number++){
-    outputVector[number] =(int8_t)(inputVector[number] >> 8);
-  }
+    number = thirtysecondPoints * 32;
+    for (; number < num_points; number++) {
+        outputVector[number] = (int8_t)(inputVector[number] >> 8);
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
@@ -99,60 +102,62 @@ volk_16i_convert_8i_u_avx2(int8_t* outputVector, const int16_t* inputVector, uns
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void
-volk_16i_convert_8i_u_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
+static inline void volk_16i_convert_8i_u_sse2(int8_t* outputVector,
+                                              const int16_t* inputVector,
+                                              unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
  
-  int8_t* outputVectorPtr = outputVector;
-  int16_t* inputPtr = (int16_t*)inputVector;
-  __m128i inputVal1;
-  __m128i inputVal2;
-  __m128i ret;
+    int8_t* outputVectorPtr = outputVector;
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128i inputVal1;
+    __m128i inputVal2;
+    __m128i ret;
  
-  for(;number < sixteenthPoints; number++){
+    for (; number < sixteenthPoints; number++) {
  
-    // Load the 16 values
-    inputVal1 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8;
-    inputVal2 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8;
+        // Load the 16 values
+        inputVal1 = _mm_loadu_si128((__m128i*)inputPtr);
+        inputPtr += 8;
+        inputVal2 = _mm_loadu_si128((__m128i*)inputPtr);
+        inputPtr += 8;
  
-    inputVal1 = _mm_srai_epi16(inputVal1, 8);
-    inputVal2 = _mm_srai_epi16(inputVal2, 8);
+        inputVal1 = _mm_srai_epi16(inputVal1, 8);
+        inputVal2 = _mm_srai_epi16(inputVal2, 8);
  
-    ret = _mm_packs_epi16(inputVal1, inputVal2);
+        ret = _mm_packs_epi16(inputVal1, inputVal2);
  
-    _mm_storeu_si128((__m128i*)outputVectorPtr, ret);
+        _mm_storeu_si128((__m128i*)outputVectorPtr, ret);
  
-    outputVectorPtr += 16;
-  }
+        outputVectorPtr += 16;
+    }
  
-  number = sixteenthPoints * 16;
-  for(; number < num_points; number++){
-    outputVector[number] =(int8_t)(inputVector[number] >> 8);
-  }
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        outputVector[number] = (int8_t)(inputVector[number] >> 8);
+    }
  }
  #endif /* LV_HAVE_SSE2 */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_16i_convert_8i_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
+static inline void volk_16i_convert_8i_generic(int8_t* outputVector,
+                                               const int16_t* inputVector,
+                                               unsigned int num_points)
  {
-  int8_t* outputVectorPtr = outputVector;
-  const int16_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
+    int8_t* outputVectorPtr = outputVector;
+    const int16_t* inputVectorPtr = inputVector;
+    unsigned int number = 0;
  
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++  >> 8));
-  }
+    for (number = 0; number < num_points; number++) {
+        *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  
-
-
  #endif /* INCLUDED_volk_16i_convert_8i_u_H */
  #ifndef INCLUDED_volk_16i_convert_8i_a_H
  #define INCLUDED_volk_16i_convert_8i_a_H
@@ -163,39 +168,42 @@ volk_16i_convert_8i_generic(int8_t* outputVector, const int16_t* inputVector, un
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_16i_convert_8i_a_avx2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
+static inline void volk_16i_convert_8i_a_avx2(int8_t* outputVector,
+                                              const int16_t* inputVector,
+                                              unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int thirtysecondPoints = num_points / 32;
+    unsigned int number = 0;
+    const unsigned int thirtysecondPoints = num_points / 32;
  
-  int8_t* outputVectorPtr = outputVector;
-  int16_t* inputPtr = (int16_t*)inputVector;
-  __m256i inputVal1;
-  __m256i inputVal2;
-  __m256i ret;
+    int8_t* outputVectorPtr = outputVector;
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m256i inputVal1;
+    __m256i inputVal2;
+    __m256i ret;
  
-  for(;number < thirtysecondPoints; number++){
+    for (; number < thirtysecondPoints; number++) {
  
-    // Load the 16 values
-    inputVal1 = _mm256_load_si256((__m256i*)inputPtr); inputPtr += 16;
-    inputVal2 = _mm256_load_si256((__m256i*)inputPtr); inputPtr += 16;
+        // Load the 16 values
+        inputVal1 = _mm256_load_si256((__m256i*)inputPtr);
+        inputPtr += 16;
+        inputVal2 = _mm256_load_si256((__m256i*)inputPtr);
+        inputPtr += 16;
  
-    inputVal1 = _mm256_srai_epi16(inputVal1, 8);
-    inputVal2 = _mm256_srai_epi16(inputVal2, 8);
+        inputVal1 = _mm256_srai_epi16(inputVal1, 8);
+        inputVal2 = _mm256_srai_epi16(inputVal2, 8);
  
-    ret = _mm256_packs_epi16(inputVal1, inputVal2);
-    ret = _mm256_permute4x64_epi64(ret, 0b11011000);
+        ret = _mm256_packs_epi16(inputVal1, inputVal2);
+        ret = _mm256_permute4x64_epi64(ret, 0b11011000);
  
-    _mm256_store_si256((__m256i*)outputVectorPtr, ret);
+        _mm256_store_si256((__m256i*)outputVectorPtr, ret);
  
-    outputVectorPtr += 32;
-  }
+        outputVectorPtr += 32;
+    }
  
-  number = thirtysecondPoints * 32;
-  for(; number < num_points; number++){
-    outputVector[number] =(int8_t)(inputVector[number] >> 8);
-  }
+    number = thirtysecondPoints * 32;
+    for (; number < num_points; number++) {
+        outputVector[number] = (int8_t)(inputVector[number] >> 8);
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
@@ -203,38 +211,41 @@ volk_16i_convert_8i_a_avx2(int8_t* outputVector, const int16_t* inputVector, uns
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void
-volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
+static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector,
+                                              const int16_t* inputVector,
+                                              unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
  
-  int8_t* outputVectorPtr = outputVector;
-  int16_t* inputPtr = (int16_t*)inputVector;
-  __m128i inputVal1;
-  __m128i inputVal2;
-  __m128i ret;
+    int8_t* outputVectorPtr = outputVector;
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128i inputVal1;
+    __m128i inputVal2;
+    __m128i ret;
  
-  for(;number < sixteenthPoints; number++){
+    for (; number < sixteenthPoints; number++) {
  
-    // Load the 16 values
-    inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
-    inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
+        // Load the 16 values
+        inputVal1 = _mm_load_si128((__m128i*)inputPtr);
+        inputPtr += 8;
+        inputVal2 = _mm_load_si128((__m128i*)inputPtr);
+        inputPtr += 8;
  
-    inputVal1 = _mm_srai_epi16(inputVal1, 8);
-    inputVal2 = _mm_srai_epi16(inputVal2, 8);
+        inputVal1 = _mm_srai_epi16(inputVal1, 8);
+        inputVal2 = _mm_srai_epi16(inputVal2, 8);
  
-    ret = _mm_packs_epi16(inputVal1, inputVal2);
+        ret = _mm_packs_epi16(inputVal1, inputVal2);
  
-    _mm_store_si128((__m128i*)outputVectorPtr, ret);
+        _mm_store_si128((__m128i*)outputVectorPtr, ret);
  
-    outputVectorPtr += 16;
-  }
+        outputVectorPtr += 16;
+    }
  
-  number = sixteenthPoints * 16;
-  for(; number < num_points; number++){
-    outputVector[number] =(int8_t)(inputVector[number] >> 8);
-  }
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        outputVector[number] = (int8_t)(inputVector[number] >> 8);
+    }
  }
  #endif /* LV_HAVE_SSE2 */
  
@@ -242,53 +253,55 @@ volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_t* inputVector, uns
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void
-volk_16i_convert_8i_neon(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
+static inline void volk_16i_convert_8i_neon(int8_t* outputVector,
+                                            const int16_t* inputVector,
+                                            unsigned int num_points)
  {
-  int8_t* outputVectorPtr = outputVector;
-  const int16_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-  unsigned int sixteenth_points = num_points / 16;
-
-  int16x8_t inputVal0;
-  int16x8_t inputVal1;
-  int8x8_t outputVal0;
-  int8x8_t outputVal1;
-  int8x16_t outputVal;
-
-  for(number = 0; number < sixteenth_points; number++){
-    // load two input vectors
-    inputVal0 = vld1q_s16(inputVectorPtr);
-    inputVal1 = vld1q_s16(inputVectorPtr+8);
-    // shift right
-    outputVal0 = vshrn_n_s16(inputVal0, 8);
-    outputVal1 = vshrn_n_s16(inputVal1, 8);
-    // squash two vectors and write output
-    outputVal = vcombine_s8(outputVal0, outputVal1);
-    vst1q_s8(outputVectorPtr, outputVal);
-    inputVectorPtr += 16;
-    outputVectorPtr += 16;
-  }
-
-  for(number = sixteenth_points * 16; number < num_points; number++){
-    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
-  }
+    int8_t* outputVectorPtr = outputVector;
+    const int16_t* inputVectorPtr = inputVector;
+    unsigned int number = 0;
+    unsigned int sixteenth_points = num_points / 16;
+
+    int16x8_t inputVal0;
+    int16x8_t inputVal1;
+    int8x8_t outputVal0;
+    int8x8_t outputVal1;
+    int8x16_t outputVal;
+
+    for (number = 0; number < sixteenth_points; number++) {
+        // load two input vectors
+        inputVal0 = vld1q_s16(inputVectorPtr);
+        inputVal1 = vld1q_s16(inputVectorPtr + 8);
+        // shift right
+        outputVal0 = vshrn_n_s16(inputVal0, 8);
+        outputVal1 = vshrn_n_s16(inputVal1, 8);
+        // squash two vectors and write output
+        outputVal = vcombine_s8(outputVal0, outputVal1);
+        vst1q_s8(outputVectorPtr, outputVal);
+        inputVectorPtr += 16;
+        outputVectorPtr += 16;
+    }
+
+    for (number = sixteenth_points * 16; number < num_points; number++) {
+        *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
+    }
  }
  #endif /* LV_HAVE_NEON */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_16i_convert_8i_a_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
+static inline void volk_16i_convert_8i_a_generic(int8_t* outputVector,
+                                                 const int16_t* inputVector,
+                                                 unsigned int num_points)
  {
-  int8_t* outputVectorPtr = outputVector;
-  const int16_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
+    int8_t* outputVectorPtr = outputVector;
+    const int16_t* inputVectorPtr = inputVector;
+    unsigned int number = 0;
  
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
-  }
+    for (number = 0; number < num_points; number++) {
+        *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
diff --git a/kernels/volk/volk_16i_max_star_16i.h b/kernels/volk/volk_16i_max_star_16i.h

index 78fd9110c44d0ffdb33ee05cc016b241a2a4d47e..d5dad1848e023f7abfdef75a2d0ef59a061ed7d3 100644 (file)
--- a/kernels/volk/volk_16i_max_star_16i.h
+++ b/kernels/volk/volk_16i_max_star_16i.h
@@ -53,67 +53,69 @@
  #ifndef INCLUDED_volk_16i_max_star_16i_a_H
  #define INCLUDED_volk_16i_max_star_16i_a_H
  
-#include<inttypes.h>
-#include<stdio.h>
+#include <inttypes.h>
+#include <stdio.h>
  
  #ifdef LV_HAVE_SSSE3
  
-#include<xmmintrin.h>
-#include<emmintrin.h>
-#include<tmmintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <xmmintrin.h>
  
  static inline void
  volk_16i_max_star_16i_a_ssse3(short* target, short* src0, unsigned int num_points)
  {
-  const unsigned int num_bytes = num_points*2;
+    const unsigned int num_bytes = num_points * 2;
  
-  short candidate = src0[0];
-  short cands[8];
-  __m128i xmm0, xmm1, xmm3, xmm4, xmm5, xmm6;
+    short candidate = src0[0];
+    short cands[8];
+    __m128i xmm0, xmm1, xmm3, xmm4, xmm5, xmm6;
  
-  __m128i *p_src0;
+    __m128i* p_src0;
  
-  p_src0 = (__m128i*)src0;
+    p_src0 = (__m128i*)src0;
  
-  int bound = num_bytes >> 4;
-  int leftovers = (num_bytes >> 1) & 7;
+    int bound = num_bytes >> 4;
+    int leftovers = (num_bytes >> 1) & 7;
  
-  int i = 0;
+    int i = 0;
  
-  xmm1 = _mm_setzero_si128();
-  xmm0 = _mm_setzero_si128();
-  //_mm_insert_epi16(xmm0, candidate, 0);
+    xmm1 = _mm_setzero_si128();
+    xmm0 = _mm_setzero_si128();
+    //_mm_insert_epi16(xmm0, candidate, 0);
  
-  xmm0 = _mm_shuffle_epi8(xmm0, xmm1);
+    xmm0 = _mm_shuffle_epi8(xmm0, xmm1);
  
-  for(i = 0; i < bound; ++i) {
-    xmm1 = _mm_load_si128(p_src0);
-    p_src0 += 1;
-    //xmm2 = _mm_sub_epi16(xmm1, xmm0);
+    for (i = 0; i < bound; ++i) {
+        xmm1 = _mm_load_si128(p_src0);
+        p_src0 += 1;
+        // xmm2 = _mm_sub_epi16(xmm1, xmm0);
  
-    xmm3 = _mm_cmpgt_epi16(xmm0, xmm1);
-    xmm4 = _mm_cmpeq_epi16(xmm0, xmm1);
-    xmm5 = _mm_cmpgt_epi16(xmm1, xmm0);
+        xmm3 = _mm_cmpgt_epi16(xmm0, xmm1);
+        xmm4 = _mm_cmpeq_epi16(xmm0, xmm1);
+        xmm5 = _mm_cmpgt_epi16(xmm1, xmm0);
  
-    xmm6 = _mm_xor_si128(xmm4, xmm5);
+        xmm6 = _mm_xor_si128(xmm4, xmm5);
  
-    xmm3 = _mm_and_si128(xmm3, xmm0);
-    xmm4 = _mm_and_si128(xmm6, xmm1);
+        xmm3 = _mm_and_si128(xmm3, xmm0);
+        xmm4 = _mm_and_si128(xmm6, xmm1);
  
-    xmm0 = _mm_add_epi16(xmm3, xmm4);
-  }
+        xmm0 = _mm_add_epi16(xmm3, xmm4);
+    }
  
-  _mm_store_si128((__m128i*)cands, xmm0);
+    _mm_store_si128((__m128i*)cands, xmm0);
  
-  for(i = 0; i < 8; ++i) {
-    candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i];
-  }
+    for (i = 0; i < 8; ++i) {
+        candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i];
+    }
  
-  for(i = 0; i < leftovers; ++i) {
-    candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i];
-  }
+    for (i = 0; i < leftovers; ++i) {
+        candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0)
+                        ? candidate
+                        : src0[(bound << 3) + i];
+    }
  
-  target[0] = candidate;
+    target[0] = candidate;
  }
  
  #endif /*LV_HAVE_SSSE3*/
@@ -124,38 +126,38 @@ volk_16i_max_star_16i_a_ssse3(short* target, short* src0, unsigned int num_point
  static inline void
  volk_16i_max_star_16i_neon(short* target, short* src0, unsigned int num_points)
  {
-  const unsigned int eighth_points = num_points / 8;
-  unsigned number;
-  int16x8_t input_vec;
-  int16x8_t diff, zeros;
-  uint16x8_t comp1, comp2;
-  zeros = vdupq_n_s16(0);
-
-  int16x8x2_t tmpvec;
-
-  int16x8_t candidate_vec = vld1q_dup_s16(src0 );
-  short candidate;
-  ++src0;
-
-  for(number=0; number < eighth_points; ++number) {
-    input_vec = vld1q_s16(src0);
-    __VOLK_PREFETCH(src0+16);
-    diff = vsubq_s16(candidate_vec, input_vec);
-    comp1 = vcgeq_s16(diff, zeros);
-    comp2 = vcltq_s16(diff, zeros);
-
-    tmpvec.val[0] = vandq_s16(candidate_vec, (int16x8_t)comp1);
-    tmpvec.val[1] = vandq_s16(input_vec, (int16x8_t)comp2);
-
-    candidate_vec = vaddq_s16(tmpvec.val[0], tmpvec.val[1]);
-    src0 += 8;
-  }
-  vst1q_s16(&candidate, candidate_vec);
-
-  for(number=0; number < num_points%8; number++) {
-    candidate = ((int16_t)(candidate - src0[number]) > 0) ? candidate : src0[number];
-  }
-  target[0] = candidate;
+    const unsigned int eighth_points = num_points / 8;
+    unsigned number;
+    int16x8_t input_vec;
+    int16x8_t diff, zeros;
+    uint16x8_t comp1, comp2;
+    zeros = vdupq_n_s16(0);
+
+    int16x8x2_t tmpvec;
+
+    int16x8_t candidate_vec = vld1q_dup_s16(src0);
+    short candidate;
+    ++src0;
+
+    for (number = 0; number < eighth_points; ++number) {
+        input_vec = vld1q_s16(src0);
+        __VOLK_PREFETCH(src0 + 16);
+        diff = vsubq_s16(candidate_vec, input_vec);
+        comp1 = vcgeq_s16(diff, zeros);
+        comp2 = vcltq_s16(diff, zeros);
+
+        tmpvec.val[0] = vandq_s16(candidate_vec, (int16x8_t)comp1);
+        tmpvec.val[1] = vandq_s16(input_vec, (int16x8_t)comp2);
+
+        candidate_vec = vaddq_s16(tmpvec.val[0], tmpvec.val[1]);
+        src0 += 8;
+    }
+    vst1q_s16(&candidate, candidate_vec);
+
+    for (number = 0; number < num_points % 8; number++) {
+        candidate = ((int16_t)(candidate - src0[number]) > 0) ? candidate : src0[number];
+    }
+    target[0] = candidate;
  }
  #endif /*LV_HAVE_NEON*/
  
@@ -164,17 +166,17 @@ volk_16i_max_star_16i_neon(short* target, short* src0, unsigned int num_points)
  static inline void
  volk_16i_max_star_16i_generic(short* target, short* src0, unsigned int num_points)
  {
-  const unsigned int num_bytes = num_points*2;
+    const unsigned int num_bytes = num_points * 2;
  
-  int i = 0;
+    int i = 0;
  
-  int bound = num_bytes >> 1;
+    int bound = num_bytes >> 1;
  
-  short candidate = src0[0];
-  for(i = 1; i < bound; ++i) {
-    candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i];
-  }
-  target[0] = candidate;
+    short candidate = src0[0];
+    for (i = 1; i < bound; ++i) {
+        candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i];
+    }
+    target[0] = candidate;
  }
  
  #endif /*LV_HAVE_GENERIC*/
diff --git a/kernels/volk/volk_16i_max_star_horizontal_16i.h b/kernels/volk/volk_16i_max_star_horizontal_16i.h

index 4ffe264d924a454bd4e8480473b35aac64286274..2e1f52b6f947f9230ed107cff126b2aafcfa9945 100644 (file)
--- a/kernels/volk/volk_16i_max_star_horizontal_16i.h
+++ b/kernels/volk/volk_16i_max_star_horizontal_16i.h
@@ -29,8 +29,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_16i_max_star_horizontal_16i(short* target, short* src0, unsigned int num_points);
- * \endcode
+ * void volk_16i_max_star_horizontal_16i(short* target, short* src0, unsigned int
+ * num_points); \endcode
   *
   * \b Inputs
   * \li src0: The input vector.
@@ -55,102 +55,113 @@
  
  #include <volk/volk_common.h>
  
-#include<inttypes.h>
-#include<stdio.h>
+#include <inttypes.h>
+#include <stdio.h>
  
  
  #ifdef LV_HAVE_SSSE3
  
-#include<xmmintrin.h>
-#include<emmintrin.h>
-#include<tmmintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <xmmintrin.h>
  
-static inline void
-volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, int16_t* src0, unsigned int num_points)
+static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target,
+                                                            int16_t* src0,
+                                                            unsigned int num_points)
  {
-  const unsigned int num_bytes = num_points*2;
+    const unsigned int num_bytes = num_points * 2;
  
-  static const uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff,
-                                        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-  static const uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00,
-                                        0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d};
-  static const uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00,
-                                       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
-  static const uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02,
-                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02};
+    static const uint8_t shufmask0[16] = {
+        0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d,
+        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+    };
+    static const uint8_t shufmask1[16] = {
+        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+        0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d
+    };
+    static const uint8_t andmask0[16] = {
+        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    };
+    static const uint8_t andmask1[16] = {
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02
+    };
  
-  __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
-  __m128i  xmm5, xmm6, xmm7, xmm8;
+    __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
+    __m128i xmm5, xmm6, xmm7, xmm8;
  
-  xmm4 = _mm_load_si128((__m128i*)shufmask0);
-  xmm5 = _mm_load_si128((__m128i*)shufmask1);
-  xmm6 = _mm_load_si128((__m128i*)andmask0);
-  xmm7 = _mm_load_si128((__m128i*)andmask1);
+    xmm4 = _mm_load_si128((__m128i*)shufmask0);
+    xmm5 = _mm_load_si128((__m128i*)shufmask1);
+    xmm6 = _mm_load_si128((__m128i*)andmask0);
+    xmm7 = _mm_load_si128((__m128i*)andmask1);
  
-  __m128i *p_target, *p_src0;
+    __m128i *p_target, *p_src0;
  
-  p_target = (__m128i*)target;
-  p_src0 = (__m128i*)src0;
+    p_target = (__m128i*)target;
+    p_src0 = (__m128i*)src0;
  
-  int bound = num_bytes >> 5;
-  int intermediate = (num_bytes >> 4) & 1;
-  int leftovers = (num_bytes >> 1) & 7;
+    int bound = num_bytes >> 5;
+    int intermediate = (num_bytes >> 4) & 1;
+    int leftovers = (num_bytes >> 1) & 7;
  
-  int i = 0;
+    int i = 0;
  
-  for(i = 0; i < bound; ++i) {
-    xmm0 = _mm_load_si128(p_src0);
-    xmm1 = _mm_load_si128(&p_src0[1]);
+    for (i = 0; i < bound; ++i) {
+        xmm0 = _mm_load_si128(p_src0);
+        xmm1 = _mm_load_si128(&p_src0[1]);
  
-    xmm2 = _mm_xor_si128(xmm2, xmm2);
-    p_src0 += 2;
+        xmm2 = _mm_xor_si128(xmm2, xmm2);
+        p_src0 += 2;
  
-    xmm3 = _mm_hsub_epi16(xmm0, xmm1);
+        xmm3 = _mm_hsub_epi16(xmm0, xmm1);
  
-    xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
+        xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
  
-    xmm8 = _mm_and_si128(xmm2, xmm6);
-    xmm3 = _mm_and_si128(xmm2, xmm7);
+        xmm8 = _mm_and_si128(xmm2, xmm6);
+        xmm3 = _mm_and_si128(xmm2, xmm7);
  
  
-    xmm8 = _mm_add_epi8(xmm8, xmm4);
-    xmm3 = _mm_add_epi8(xmm3, xmm5);
+        xmm8 = _mm_add_epi8(xmm8, xmm4);
+        xmm3 = _mm_add_epi8(xmm3, xmm5);
  
-    xmm0 = _mm_shuffle_epi8(xmm0, xmm8);
-    xmm1 = _mm_shuffle_epi8(xmm1, xmm3);
+        xmm0 = _mm_shuffle_epi8(xmm0, xmm8);
+        xmm1 = _mm_shuffle_epi8(xmm1, xmm3);
  
  
-    xmm3 = _mm_add_epi16(xmm0, xmm1);
+        xmm3 = _mm_add_epi16(xmm0, xmm1);
  
  
-    _mm_store_si128(p_target, xmm3);
+        _mm_store_si128(p_target, xmm3);
  
-    p_target += 1;
-  }
+        p_target += 1;
+    }
  
-  if (intermediate) {
-    xmm0 = _mm_load_si128(p_src0);
+    if (intermediate) {
+        xmm0 = _mm_load_si128(p_src0);
  
-    xmm2 = _mm_xor_si128(xmm2, xmm2);
-    p_src0 += 1;
+        xmm2 = _mm_xor_si128(xmm2, xmm2);
+        p_src0 += 1;
  
-    xmm3 = _mm_hsub_epi16(xmm0, xmm1);
-    xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
+        xmm3 = _mm_hsub_epi16(xmm0, xmm1);
+        xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
  
-    xmm8 = _mm_and_si128(xmm2, xmm6);
+        xmm8 = _mm_and_si128(xmm2, xmm6);
  
-    xmm3 = _mm_add_epi8(xmm8, xmm4);
+        xmm3 = _mm_add_epi8(xmm8, xmm4);
  
-    xmm0 = _mm_shuffle_epi8(xmm0, xmm3);
+        xmm0 = _mm_shuffle_epi8(xmm0, xmm3);
  
-    _mm_storel_pd((double*)p_target, bit128_p(&xmm0)->double_vec);
+        _mm_storel_pd((double*)p_target, bit128_p(&xmm0)->double_vec);
  
-    p_target = (__m128i*)((int8_t*)p_target + 8);
-  }
+        p_target = (__m128i*)((int8_t*)p_target + 8);
+    }
  
-  for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) {
-    target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
-  }
+    for (i = (bound << 4) + (intermediate << 3);
+         i < (bound << 4) + (intermediate << 3) + leftovers;
+         i += 2) {
+        target[i >> 1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
+    }
  }
  
  #endif /*LV_HAVE_SSSE3*/
@@ -158,54 +169,59 @@ volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, int16_t* src0, unsigne
  #ifdef LV_HAVE_NEON
  
  #include <arm_neon.h>
-static inline void
-volk_16i_max_star_horizontal_16i_neon(int16_t* target, int16_t* src0, unsigned int num_points)
+static inline void volk_16i_max_star_horizontal_16i_neon(int16_t* target,
+                                                         int16_t* src0,
+                                                         unsigned int num_points)
  {
-  const unsigned int eighth_points = num_points / 16;
-  unsigned number;
-  int16x8x2_t input_vec;
-  int16x8_t diff, max_vec, zeros;
-  uint16x8_t comp1, comp2;
-  zeros = vdupq_n_s16(0);
-  for(number=0; number < eighth_points; ++number) {
-    input_vec = vld2q_s16(src0);
-    //__VOLK_PREFETCH(src0+16);
-    diff = vsubq_s16(input_vec.val[0], input_vec.val[1]);
-    comp1 = vcgeq_s16(diff, zeros);
-    comp2 = vcltq_s16(diff, zeros);
-
-    input_vec.val[0] = vandq_s16(input_vec.val[0], (int16x8_t)comp1);
-    input_vec.val[1] = vandq_s16(input_vec.val[1], (int16x8_t)comp2);
-
-    max_vec = vaddq_s16(input_vec.val[0], input_vec.val[1]);
-    vst1q_s16(target, max_vec);
-    src0 += 16;
-    target += 8;
-  }
-  for(number=0; number < num_points%16; number+=2) {
-    target[number >> 1] = ((int16_t)(src0[number] - src0[number + 1]) > 0) ? src0[number] : src0[number+1];
-  }
-
+    const unsigned int eighth_points = num_points / 16;
+    unsigned number;
+    int16x8x2_t input_vec;
+    int16x8_t diff, max_vec, zeros;
+    uint16x8_t comp1, comp2;
+    zeros = vdupq_n_s16(0);
+    for (number = 0; number < eighth_points; ++number) {
+        input_vec = vld2q_s16(src0);
+        //__VOLK_PREFETCH(src0+16);
+        diff = vsubq_s16(input_vec.val[0], input_vec.val[1]);
+        comp1 = vcgeq_s16(diff, zeros);
+        comp2 = vcltq_s16(diff, zeros);
+
+        input_vec.val[0] = vandq_s16(input_vec.val[0], (int16x8_t)comp1);
+        input_vec.val[1] = vandq_s16(input_vec.val[1], (int16x8_t)comp2);
+
+        max_vec = vaddq_s16(input_vec.val[0], input_vec.val[1]);
+        vst1q_s16(target, max_vec);
+        src0 += 16;
+        target += 8;
+    }
+    for (number = 0; number < num_points % 16; number += 2) {
+        target[number >> 1] = ((int16_t)(src0[number] - src0[number + 1]) > 0)
+                                  ? src0[number]
+                                  : src0[number + 1];
+    }
  }
  #endif /* LV_HAVE_NEON */
  
  #ifdef LV_HAVE_NEONV7
-extern void volk_16i_max_star_horizontal_16i_a_neonasm(int16_t* target, int16_t* src0, unsigned int num_points);
+extern void volk_16i_max_star_horizontal_16i_a_neonasm(int16_t* target,
+                                                       int16_t* src0,
+                                                       unsigned int num_points);
  #endif /* LV_HAVE_NEONV7 */
  
  #ifdef LV_HAVE_GENERIC
-static inline void
-volk_16i_max_star_horizontal_16i_generic(int16_t* target, int16_t* src0, unsigned int num_points)
+static inline void volk_16i_max_star_horizontal_16i_generic(int16_t* target,
+                                                            int16_t* src0,
+                                                            unsigned int num_points)
  {
-  const unsigned int num_bytes = num_points*2;
+    const unsigned int num_bytes = num_points * 2;
  
-  int i = 0;
+    int i = 0;
  
-  int bound = num_bytes >> 1;
+    int bound = num_bytes >> 1;
  
-  for(i = 0; i < bound; i += 2) {
-    target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1];
-  }
+    for (i = 0; i < bound; i += 2) {
+        target[i >> 1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
+    }
  }
  
  #endif /*LV_HAVE_GENERIC*/
diff --git a/kernels/volk/volk_16i_permute_and_scalar_add.h b/kernels/volk/volk_16i_permute_and_scalar_add.h

index 7fcdad3dd6e2f5396ba5405a38e9d15480e99b1a..0563f07e64c3242c9eb7a0b6f682c83e45afe04d 100644 (file)
--- a/kernels/volk/volk_16i_permute_and_scalar_add.h
+++ b/kernels/volk/volk_16i_permute_and_scalar_add.h
@@ -29,8 +29,9 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_16i_permute_and_scalar_add(short* target,  short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_points)
- * \endcode
+ * void volk_16i_permute_and_scalar_add(short* target,  short* src0, short*
+ * permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short*
+ * scalars, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li src0: The input vector.
@@ -58,137 +59,143 @@
  #ifndef INCLUDED_volk_16i_permute_and_scalar_add_a_H
  #define INCLUDED_volk_16i_permute_and_scalar_add_a_H
  
-#include<inttypes.h>
-#include<stdio.h>
+#include <inttypes.h>
+#include <stdio.h>
  
  #ifdef LV_HAVE_SSE2
  
-#include<xmmintrin.h>
-#include<emmintrin.h>
-
-static inline void
-volk_16i_permute_and_scalar_add_a_sse2(short* target,  short* src0, short* permute_indexes,
-                                       short* cntl0, short* cntl1, short* cntl2, short* cntl3,
-                                       short* scalars, unsigned int num_points)
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target,
+                                                          short* src0,
+                                                          short* permute_indexes,
+                                                          short* cntl0,
+                                                          short* cntl1,
+                                                          short* cntl2,
+                                                          short* cntl3,
+                                                          short* scalars,
+                                                          unsigned int num_points)
  {
  
-  const unsigned int num_bytes = num_points*2;
+    const unsigned int num_bytes = num_points * 2;
  
-  __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
  
-  __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars;
+    __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars;
  
-  short* p_permute_indexes = permute_indexes;
+    short* p_permute_indexes = permute_indexes;
  
-  p_target = (__m128i*)target;
-  p_cntl0 = (__m128i*)cntl0;
-  p_cntl1 = (__m128i*)cntl1;
-  p_cntl2 = (__m128i*)cntl2;
-  p_cntl3 = (__m128i*)cntl3;
-  p_scalars = (__m128i*)scalars;
+    p_target = (__m128i*)target;
+    p_cntl0 = (__m128i*)cntl0;
+    p_cntl1 = (__m128i*)cntl1;
+    p_cntl2 = (__m128i*)cntl2;
+    p_cntl3 = (__m128i*)cntl3;
+    p_scalars = (__m128i*)scalars;
  
-  int i = 0;
+    int i = 0;
  
-  int bound = (num_bytes >> 4);
-  int leftovers = (num_bytes >> 1) & 7;
+    int bound = (num_bytes >> 4);
+    int leftovers = (num_bytes >> 1) & 7;
  
-  xmm0 = _mm_load_si128(p_scalars);
+    xmm0 = _mm_load_si128(p_scalars);
  
-  xmm1 = _mm_shufflelo_epi16(xmm0, 0);
-  xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
-  xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
-  xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
+    xmm1 = _mm_shufflelo_epi16(xmm0, 0);
+    xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
+    xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
+    xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
  
-  xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
-  xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
-  xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
-  xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
+    xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
+    xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
+    xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
+    xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
  
  
-  for(; i < bound; ++i) {
-    xmm0 = _mm_setzero_si128();
-    xmm5 = _mm_setzero_si128();
-    xmm6 = _mm_setzero_si128();
-    xmm7 = _mm_setzero_si128();
+    for (; i < bound; ++i) {
+        xmm0 = _mm_setzero_si128();
+        xmm5 = _mm_setzero_si128();
+        xmm6 = _mm_setzero_si128();
+        xmm7 = _mm_setzero_si128();
  
-    xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0);
-    xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1);
-    xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2);
-    xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3);
-    xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4);
-    xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5);
-    xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6);
-    xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7);
+        xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0);
+        xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1);
+        xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2);
+        xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3);
+        xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4);
+        xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5);
+        xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6);
+        xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7);
  
-    xmm0 = _mm_add_epi16(xmm0, xmm5);
-    xmm6 = _mm_add_epi16(xmm6, xmm7);
+        xmm0 = _mm_add_epi16(xmm0, xmm5);
+        xmm6 = _mm_add_epi16(xmm6, xmm7);
  
-    p_permute_indexes += 8;
+        p_permute_indexes += 8;
  
-    xmm0 = _mm_add_epi16(xmm0, xmm6);
+        xmm0 = _mm_add_epi16(xmm0, xmm6);
  
-    xmm5 = _mm_load_si128(p_cntl0);
-    xmm6 = _mm_load_si128(p_cntl1);
-    xmm7 = _mm_load_si128(p_cntl2);
+        xmm5 = _mm_load_si128(p_cntl0);
+        xmm6 = _mm_load_si128(p_cntl1);
+        xmm7 = _mm_load_si128(p_cntl2);
  
-    xmm5 = _mm_and_si128(xmm5, xmm1);
-    xmm6 = _mm_and_si128(xmm6, xmm2);
-    xmm7 = _mm_and_si128(xmm7, xmm3);
+        xmm5 = _mm_and_si128(xmm5, xmm1);
+        xmm6 = _mm_and_si128(xmm6, xmm2);
+        xmm7 = _mm_and_si128(xmm7, xmm3);
  
-    xmm0 = _mm_add_epi16(xmm0, xmm5);
+        xmm0 = _mm_add_epi16(xmm0, xmm5);
  
-    xmm5 = _mm_load_si128(p_cntl3);
+        xmm5 = _mm_load_si128(p_cntl3);
  
-    xmm6 = _mm_add_epi16(xmm6, xmm7);
+        xmm6 = _mm_add_epi16(xmm6, xmm7);
  
-    p_cntl0 += 1;
+        p_cntl0 += 1;
  
-    xmm5 = _mm_and_si128(xmm5, xmm4);
+        xmm5 = _mm_and_si128(xmm5, xmm4);
  
-    xmm0 = _mm_add_epi16(xmm0, xmm6);
+        xmm0 = _mm_add_epi16(xmm0, xmm6);
  
-    p_cntl1 += 1;
-    p_cntl2 += 1;
+        p_cntl1 += 1;
+        p_cntl2 += 1;
  
-    xmm0 = _mm_add_epi16(xmm0, xmm5);
+        xmm0 = _mm_add_epi16(xmm0, xmm5);
  
-    p_cntl3 += 1;
+        p_cntl3 += 1;
  
-    _mm_store_si128(p_target, xmm0);
+        _mm_store_si128(p_target, xmm0);
  
-    p_target += 1;
-  }
+        p_target += 1;
+    }
  
-  for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
-    target[i] = src0[permute_indexes[i]]
-      + (cntl0[i] & scalars[0])
-      + (cntl1[i] & scalars[1])
-      + (cntl2[i] & scalars[2])
-      + (cntl3[i] & scalars[3]);
-  }
+    for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+        target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0]) +
+                    (cntl1[i] & scalars[1]) + (cntl2[i] & scalars[2]) +
+                    (cntl3[i] & scalars[3]);
+    }
  }
  #endif /*LV_HAVE_SSE*/
  
  
  #ifdef LV_HAVE_GENERIC
-static inline void
-volk_16i_permute_and_scalar_add_generic(short* target, short* src0, short* permute_indexes,
-                                        short* cntl0, short* cntl1, short* cntl2, short* cntl3,
-                                        short* scalars, unsigned int num_points)
+static inline void volk_16i_permute_and_scalar_add_generic(short* target,
+                                                           short* src0,
+                                                           short* permute_indexes,
+                                                           short* cntl0,
+                                                           short* cntl1,
+                                                           short* cntl2,
+                                                           short* cntl3,
+                                                           short* scalars,
+                                                           unsigned int num_points)
  {
-  const unsigned int num_bytes = num_points*2;
+    const unsigned int num_bytes = num_points * 2;
  
-  int i = 0;
+    int i = 0;
  
-  int bound = num_bytes >> 1;
+    int bound = num_bytes >> 1;
  
-  for(i = 0; i < bound; ++i) {
-    target[i] = src0[permute_indexes[i]]
-      + (cntl0[i] & scalars[0])
-      + (cntl1[i] & scalars[1])
-      + (cntl2[i] & scalars[2])
-      + (cntl3[i] & scalars[3]);
-  }
+    for (i = 0; i < bound; ++i) {
+        target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0]) +
+                    (cntl1[i] & scalars[1]) + (cntl2[i] & scalars[2]) +
+                    (cntl3[i] & scalars[3]);
+    }
  }
  
  #endif /*LV_HAVE_GENERIC*/
diff --git a/kernels/volk/volk_16i_s32f_convert_32f.h b/kernels/volk/volk_16i_s32f_convert_32f.h

index 38ea6f5482a056aec1f9f73868dd73a0b350bce3..3fd3a770e33321609ed43824079f20b76a8f1bc7 100644 (file)
--- a/kernels/volk/volk_16i_s32f_convert_32f.h
+++ b/kernels/volk/volk_16i_s32f_convert_32f.h
@@ -29,8 +29,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_16i_s32f_convert_32f(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points);
- * \endcode
+ * void volk_16i_s32f_convert_32f(float* outputVector, const int16_t* inputVector, const
+ * float scalar, unsigned int num_points); \endcode
   *
   * \b Inputs
   * \li inputVector: The input vector of 16-bit shorts.
@@ -60,238 +60,247 @@
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_16i_s32f_convert_32f_u_avx2(float* outputVector, const int16_t* inputVector,
-                                const float scalar, unsigned int num_points)
+static inline void volk_16i_s32f_convert_32f_u_avx2(float* outputVector,
+                                                    const int16_t* inputVector,
+                                                    const float scalar,
+                                                    unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  float* outputVectorPtr = outputVector;
-  __m256 invScalar = _mm256_set1_ps(1.0/scalar);
-  int16_t* inputPtr = (int16_t*)inputVector;
-  __m128i inputVal;
-  __m256i inputVal2;
-  __m256 ret;
+    float* outputVectorPtr = outputVector;
+    __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128i inputVal;
+    __m256i inputVal2;
+    __m256 ret;
  
-  for(;number < eighthPoints; number++){
+    for (; number < eighthPoints; number++) {
  
-    // Load the 8 values
-    inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+        // Load the 8 values
+        inputVal = _mm_loadu_si128((__m128i*)inputPtr);
  
-    // Convert
-    inputVal2 = _mm256_cvtepi16_epi32(inputVal);
+        // Convert
+        inputVal2 = _mm256_cvtepi16_epi32(inputVal);
  
-    ret = _mm256_cvtepi32_ps(inputVal2);
-    ret = _mm256_mul_ps(ret, invScalar);
+        ret = _mm256_cvtepi32_ps(inputVal2);
+        ret = _mm256_mul_ps(ret, invScalar);
  
-    _mm256_storeu_ps(outputVectorPtr, ret);
+        _mm256_storeu_ps(outputVectorPtr, ret);
  
-    outputVectorPtr += 8;
+        outputVectorPtr += 8;
  
-    inputPtr += 8;
-  }
+        inputPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    outputVector[number] =((float)(inputVector[number])) / scalar;
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        outputVector[number] = ((float)(inputVector[number])) / scalar;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_16i_s32f_convert_32f_u_avx(float* outputVector, const int16_t* inputVector,
-                                const float scalar, unsigned int num_points)
+static inline void volk_16i_s32f_convert_32f_u_avx(float* outputVector,
+                                                   const int16_t* inputVector,
+                                                   const float scalar,
+                                                   unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  float* outputVectorPtr = outputVector;
-  __m128 invScalar = _mm_set_ps1(1.0/scalar);
-  int16_t* inputPtr = (int16_t*)inputVector;
-  __m128i inputVal, inputVal2;
-  __m128 ret;
-  __m256 output;
-  __m256 dummy = _mm256_setzero_ps();
+    float* outputVectorPtr = outputVector;
+    __m128 invScalar = _mm_set_ps1(1.0 / scalar);
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128i inputVal, inputVal2;
+    __m128 ret;
+    __m256 output;
+    __m256 dummy = _mm256_setzero_ps();
  
-  for(;number < eighthPoints; number++){
+    for (; number < eighthPoints; number++) {
  
-    // Load the 8 values
-    //inputVal = _mm_loadu_si128((__m128i*)inputPtr);
-    inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+        // Load the 8 values
+        // inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+        inputVal = _mm_loadu_si128((__m128i*)inputPtr);
  
-    // Shift the input data to the right by 64 bits ( 8 bytes )
-    inputVal2 = _mm_srli_si128(inputVal, 8);
+        // Shift the input data to the right by 64 bits ( 8 bytes )
+        inputVal2 = _mm_srli_si128(inputVal, 8);
  
-    // Convert the lower 4 values into 32 bit words
-    inputVal = _mm_cvtepi16_epi32(inputVal);
-    inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+        // Convert the lower 4 values into 32 bit words
+        inputVal = _mm_cvtepi16_epi32(inputVal);
+        inputVal2 = _mm_cvtepi16_epi32(inputVal2);
  
-    ret = _mm_cvtepi32_ps(inputVal);
-    ret = _mm_mul_ps(ret, invScalar);
-    output = _mm256_insertf128_ps(dummy, ret, 0);
+        ret = _mm_cvtepi32_ps(inputVal);
+        ret = _mm_mul_ps(ret, invScalar);
+        output = _mm256_insertf128_ps(dummy, ret, 0);
  
-    ret = _mm_cvtepi32_ps(inputVal2);
-    ret = _mm_mul_ps(ret, invScalar);
-    output = _mm256_insertf128_ps(output, ret, 1);
+        ret = _mm_cvtepi32_ps(inputVal2);
+        ret = _mm_mul_ps(ret, invScalar);
+        output = _mm256_insertf128_ps(output, ret, 1);
  
-    _mm256_storeu_ps(outputVectorPtr, output);
+        _mm256_storeu_ps(outputVectorPtr, output);
  
-    outputVectorPtr += 8;
+        outputVectorPtr += 8;
  
-    inputPtr += 8;
-  }
+        inputPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    outputVector[number] =((float)(inputVector[number])) / scalar;
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        outputVector[number] = ((float)(inputVector[number])) / scalar;
+    }
  }
  #endif /* LV_HAVE_AVX */
  
  #ifdef LV_HAVE_SSE4_1
  #include <smmintrin.h>
  
-static inline void
-volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const int16_t* inputVector,
-                                   const float scalar, unsigned int num_points)
+static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector,
+                                                      const int16_t* inputVector,
+                                                      const float scalar,
+                                                      unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  float* outputVectorPtr = outputVector;
-  __m128 invScalar = _mm_set_ps1(1.0/scalar);
-  int16_t* inputPtr = (int16_t*)inputVector;
-  __m128i inputVal;
-  __m128i inputVal2;
-  __m128 ret;
+    float* outputVectorPtr = outputVector;
+    __m128 invScalar = _mm_set_ps1(1.0 / scalar);
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128i inputVal;
+    __m128i inputVal2;
+    __m128 ret;
  
-  for(;number < eighthPoints; number++){
+    for (; number < eighthPoints; number++) {
  
-    // Load the 8 values
-    inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+        // Load the 8 values
+        inputVal = _mm_loadu_si128((__m128i*)inputPtr);
  
-    // Shift the input data to the right by 64 bits ( 8 bytes )
-    inputVal2 = _mm_srli_si128(inputVal, 8);
+        // Shift the input data to the right by 64 bits ( 8 bytes )
+        inputVal2 = _mm_srli_si128(inputVal, 8);
  
-    // Convert the lower 4 values into 32 bit words
-    inputVal = _mm_cvtepi16_epi32(inputVal);
-    inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+        // Convert the lower 4 values into 32 bit words
+        inputVal = _mm_cvtepi16_epi32(inputVal);
+        inputVal2 = _mm_cvtepi16_epi32(inputVal2);
  
-    ret = _mm_cvtepi32_ps(inputVal);
-    ret = _mm_mul_ps(ret, invScalar);
-    _mm_storeu_ps(outputVectorPtr, ret);
-    outputVectorPtr += 4;
+        ret = _mm_cvtepi32_ps(inputVal);
+        ret = _mm_mul_ps(ret, invScalar);
+        _mm_storeu_ps(outputVectorPtr, ret);
+        outputVectorPtr += 4;
  
-    ret = _mm_cvtepi32_ps(inputVal2);
-    ret = _mm_mul_ps(ret, invScalar);
-    _mm_storeu_ps(outputVectorPtr, ret);
+        ret = _mm_cvtepi32_ps(inputVal2);
+        ret = _mm_mul_ps(ret, invScalar);
+        _mm_storeu_ps(outputVectorPtr, ret);
  
-    outputVectorPtr += 4;
+        outputVectorPtr += 4;
  
-    inputPtr += 8;
-  }
+        inputPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    outputVector[number] =((float)(inputVector[number])) / scalar;
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        outputVector[number] = ((float)(inputVector[number])) / scalar;
+    }
  }
  #endif /* LV_HAVE_SSE4_1 */
  
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_16i_s32f_convert_32f_u_sse(float* outputVector, const int16_t* inputVector,
-                                const float scalar, unsigned int num_points)
+static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector,
+                                                   const int16_t* inputVector,
+                                                   const float scalar,
+                                                   unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  float* outputVectorPtr = outputVector;
-  __m128 invScalar = _mm_set_ps1(1.0/scalar);
-  int16_t* inputPtr = (int16_t*)inputVector;
-  __m128 ret;
-
-  for(;number < quarterPoints; number++){
-    ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));
-
-    ret = _mm_mul_ps(ret, invScalar);
-    _mm_storeu_ps(outputVectorPtr, ret);
-
-    inputPtr += 4;
-    outputVectorPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    outputVector[number] = (float)(inputVector[number]) / scalar;
-  }
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* outputVectorPtr = outputVector;
+    __m128 invScalar = _mm_set_ps1(1.0 / scalar);
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128 ret;
+
+    for (; number < quarterPoints; number++) {
+        ret = _mm_set_ps((float)(inputPtr[3]),
+                         (float)(inputPtr[2]),
+                         (float)(inputPtr[1]),
+                         (float)(inputPtr[0]));
+
+        ret = _mm_mul_ps(ret, invScalar);
+        _mm_storeu_ps(outputVectorPtr, ret);
+
+        inputPtr += 4;
+        outputVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        outputVector[number] = (float)(inputVector[number]) / scalar;
+    }
  }
  #endif /* LV_HAVE_SSE */
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_16i_s32f_convert_32f_generic(float* outputVector, const int16_t* inputVector,
-                                  const float scalar, unsigned int num_points)
+static inline void volk_16i_s32f_convert_32f_generic(float* outputVector,
+                                                     const int16_t* inputVector,
+                                                     const float scalar,
+                                                     unsigned int num_points)
  {
-  float* outputVectorPtr = outputVector;
-  const int16_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
+    float* outputVectorPtr = outputVector;
+    const int16_t* inputVectorPtr = inputVector;
+    unsigned int number = 0;
  
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
-  }
+    for (number = 0; number < num_points; number++) {
+        *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void
-volk_16i_s32f_convert_32f_neon(float* outputVector, const int16_t* inputVector,
-                               const float scalar, unsigned int num_points)
+static inline void volk_16i_s32f_convert_32f_neon(float* outputVector,
+                                                  const int16_t* inputVector,
+                                                  const float scalar,
+                                                  unsigned int num_points)
  {
-  float* outputPtr = outputVector;
-  const int16_t* inputPtr = inputVector;
-  unsigned int number = 0;
-  unsigned int eighth_points = num_points / 8;
-
-  int16x4x2_t input16;
-  int32x4_t input32_0, input32_1;
-  float32x4_t input_float_0, input_float_1;
-  float32x4x2_t output_float;
-  float32x4_t inv_scale;
-
-  inv_scale = vdupq_n_f32(1.0/scalar);
-
-  // the generic disassembles to a 128-bit load
-  // and duplicates every instruction to operate on 64-bits
-  // at a time. This is only possible with lanes, which is faster
-  // than just doing a vld1_s16, but still slower.
-  for(number = 0; number < eighth_points; number++){
-    input16 = vld2_s16(inputPtr);
-    // widen 16-bit int to 32-bit int
-    input32_0 = vmovl_s16(input16.val[0]);
-    input32_1 = vmovl_s16(input16.val[1]);
-    // convert 32-bit int to float with scale
-    input_float_0 = vcvtq_f32_s32(input32_0);
-    input_float_1 = vcvtq_f32_s32(input32_1);
-    output_float.val[0] = vmulq_f32(input_float_0, inv_scale);
-    output_float.val[1] = vmulq_f32(input_float_1, inv_scale);
-    vst2q_f32(outputPtr, output_float);
-    inputPtr += 8;
-    outputPtr += 8;
-  }
-
-  for(number = eighth_points*8; number < num_points; number++){
-    *outputPtr++ = ((float)(*inputPtr++)) / scalar;
-  }
+    float* outputPtr = outputVector;
+    const int16_t* inputPtr = inputVector;
+    unsigned int number = 0;
+    unsigned int eighth_points = num_points / 8;
+
+    int16x4x2_t input16;
+    int32x4_t input32_0, input32_1;
+    float32x4_t input_float_0, input_float_1;
+    float32x4x2_t output_float;
+    float32x4_t inv_scale;
+
+    inv_scale = vdupq_n_f32(1.0 / scalar);
+
+    // the generic disassembles to a 128-bit load
+    // and duplicates every instruction to operate on 64-bits
+    // at a time. This is only possible with lanes, which is faster
+    // than just doing a vld1_s16, but still slower.
+    for (number = 0; number < eighth_points; number++) {
+        input16 = vld2_s16(inputPtr);
+        // widen 16-bit int to 32-bit int
+        input32_0 = vmovl_s16(input16.val[0]);
+        input32_1 = vmovl_s16(input16.val[1]);
+        // convert 32-bit int to float with scale
+        input_float_0 = vcvtq_f32_s32(input32_0);
+        input_float_1 = vcvtq_f32_s32(input32_1);
+        output_float.val[0] = vmulq_f32(input_float_0, inv_scale);
+        output_float.val[1] = vmulq_f32(input_float_1, inv_scale);
+        vst2q_f32(outputPtr, output_float);
+        inputPtr += 8;
+        outputPtr += 8;
+    }
+
+    for (number = eighth_points * 8; number < num_points; number++) {
+        *outputPtr++ = ((float)(*inputPtr++)) / scalar;
+    }
  }
  #endif /* LV_HAVE_NEON */
  
@@ -306,193 +315,201 @@ volk_16i_s32f_convert_32f_neon(float* outputVector, const int16_t* inputVector,
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_16i_s32f_convert_32f_a_avx2(float* outputVector, const int16_t* inputVector,
-                                const float scalar, unsigned int num_points)
+static inline void volk_16i_s32f_convert_32f_a_avx2(float* outputVector,
+                                                    const int16_t* inputVector,
+                                                    const float scalar,
+                                                    unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  float* outputVectorPtr = outputVector;
-  __m256 invScalar = _mm256_set1_ps(1.0/scalar);
-  int16_t* inputPtr = (int16_t*)inputVector;
-  __m128i inputVal;
-  __m256i inputVal2;
-  __m256 ret;
+    float* outputVectorPtr = outputVector;
+    __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128i inputVal;
+    __m256i inputVal2;
+    __m256 ret;
  
-  for(;number < eighthPoints; number++){
+    for (; number < eighthPoints; number++) {
  
-    // Load the 8 values
-    inputVal = _mm_load_si128((__m128i*)inputPtr);
+        // Load the 8 values
+        inputVal = _mm_load_si128((__m128i*)inputPtr);
  
-    // Convert
-    inputVal2 = _mm256_cvtepi16_epi32(inputVal);
+        // Convert
+        inputVal2 = _mm256_cvtepi16_epi32(inputVal);
  
-    ret = _mm256_cvtepi32_ps(inputVal2);
-    ret = _mm256_mul_ps(ret, invScalar);
+        ret = _mm256_cvtepi32_ps(inputVal2);
+        ret = _mm256_mul_ps(ret, invScalar);
  
-    _mm256_store_ps(outputVectorPtr, ret);
+        _mm256_store_ps(outputVectorPtr, ret);
  
-    outputVectorPtr += 8;
+        outputVectorPtr += 8;
  
-    inputPtr += 8;
-  }
+        inputPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    outputVector[number] =((float)(inputVector[number])) / scalar;
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        outputVector[number] = ((float)(inputVector[number])) / scalar;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_16i_s32f_convert_32f_a_avx(float* outputVector, const int16_t* inputVector,
-                                const float scalar, unsigned int num_points)
+static inline void volk_16i_s32f_convert_32f_a_avx(float* outputVector,
+                                                   const int16_t* inputVector,
+                                                   const float scalar,
+                                                   unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  float* outputVectorPtr = outputVector;
-  __m128 invScalar = _mm_set_ps1(1.0/scalar);
-  int16_t* inputPtr = (int16_t*)inputVector;
-  __m128i inputVal, inputVal2;
-  __m128 ret;
-  __m256 output;
-  __m256 dummy = _mm256_setzero_ps();
+    float* outputVectorPtr = outputVector;
+    __m128 invScalar = _mm_set_ps1(1.0 / scalar);
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128i inputVal, inputVal2;
+    __m128 ret;
+    __m256 output;
+    __m256 dummy = _mm256_setzero_ps();
  
-  for(;number < eighthPoints; number++){
+    for (; number < eighthPoints; number++) {
  
-    // Load the 8 values
-    //inputVal = _mm_loadu_si128((__m128i*)inputPtr);
-    inputVal = _mm_load_si128((__m128i*)inputPtr);
+        // Load the 8 values
+        // inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+        inputVal = _mm_load_si128((__m128i*)inputPtr);
  
-    // Shift the input data to the right by 64 bits ( 8 bytes )
-    inputVal2 = _mm_srli_si128(inputVal, 8);
+        // Shift the input data to the right by 64 bits ( 8 bytes )
+        inputVal2 = _mm_srli_si128(inputVal, 8);
  
-    // Convert the lower 4 values into 32 bit words
-    inputVal = _mm_cvtepi16_epi32(inputVal);
-    inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+        // Convert the lower 4 values into 32 bit words
+        inputVal = _mm_cvtepi16_epi32(inputVal);
+        inputVal2 = _mm_cvtepi16_epi32(inputVal2);
  
-    ret = _mm_cvtepi32_ps(inputVal);
-    ret = _mm_mul_ps(ret, invScalar);
-    output = _mm256_insertf128_ps(dummy, ret, 0);
+        ret = _mm_cvtepi32_ps(inputVal);
+        ret = _mm_mul_ps(ret, invScalar);
+        output = _mm256_insertf128_ps(dummy, ret, 0);
  
-    ret = _mm_cvtepi32_ps(inputVal2);
-    ret = _mm_mul_ps(ret, invScalar);
-    output = _mm256_insertf128_ps(output, ret, 1);
+        ret = _mm_cvtepi32_ps(inputVal2);
+        ret = _mm_mul_ps(ret, invScalar);
+        output = _mm256_insertf128_ps(output, ret, 1);
  
-    _mm256_store_ps(outputVectorPtr, output);
+        _mm256_store_ps(outputVectorPtr, output);
  
-    outputVectorPtr += 8;
+        outputVectorPtr += 8;
  
-    inputPtr += 8;
-  }
+        inputPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    outputVector[number] =((float)(inputVector[number])) / scalar;
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        outputVector[number] = ((float)(inputVector[number])) / scalar;
+    }
  }
  #endif /* LV_HAVE_AVX */
  
  #ifdef LV_HAVE_SSE4_1
  #include <smmintrin.h>
  
-static inline void
-volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector,
-                                   const float scalar, unsigned int num_points)
+static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector,
+                                                      const int16_t* inputVector,
+                                                      const float scalar,
+                                                      unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  float* outputVectorPtr = outputVector;
-  __m128 invScalar = _mm_set_ps1(1.0/scalar);
-  int16_t* inputPtr = (int16_t*)inputVector;
-  __m128i inputVal;
-  __m128i inputVal2;
-  __m128 ret;
+    float* outputVectorPtr = outputVector;
+    __m128 invScalar = _mm_set_ps1(1.0 / scalar);
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128i inputVal;
+    __m128i inputVal2;
+    __m128 ret;
  
-  for(;number < eighthPoints; number++){
+    for (; number < eighthPoints; number++) {
  
-    // Load the 8 values
-    inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+        // Load the 8 values
+        inputVal = _mm_loadu_si128((__m128i*)inputPtr);
  
-    // Shift the input data to the right by 64 bits ( 8 bytes )
-    inputVal2 = _mm_srli_si128(inputVal, 8);
+        // Shift the input data to the right by 64 bits ( 8 bytes )
+        inputVal2 = _mm_srli_si128(inputVal, 8);
  
-    // Convert the lower 4 values into 32 bit words
-    inputVal = _mm_cvtepi16_epi32(inputVal);
-    inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+        // Convert the lower 4 values into 32 bit words
+        inputVal = _mm_cvtepi16_epi32(inputVal);
+        inputVal2 = _mm_cvtepi16_epi32(inputVal2);
  
-    ret = _mm_cvtepi32_ps(inputVal);
-    ret = _mm_mul_ps(ret, invScalar);
-    _mm_storeu_ps(outputVectorPtr, ret);
-    outputVectorPtr += 4;
+        ret = _mm_cvtepi32_ps(inputVal);
+        ret = _mm_mul_ps(ret, invScalar);
+        _mm_storeu_ps(outputVectorPtr, ret);
+        outputVectorPtr += 4;
  
-    ret = _mm_cvtepi32_ps(inputVal2);
-    ret = _mm_mul_ps(ret, invScalar);
-    _mm_storeu_ps(outputVectorPtr, ret);
+        ret = _mm_cvtepi32_ps(inputVal2);
+        ret = _mm_mul_ps(ret, invScalar);
+        _mm_storeu_ps(outputVectorPtr, ret);
  
-    outputVectorPtr += 4;
+        outputVectorPtr += 4;
  
-    inputPtr += 8;
-  }
+        inputPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    outputVector[number] =((float)(inputVector[number])) / scalar;
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        outputVector[number] = ((float)(inputVector[number])) / scalar;
+    }
  }
  #endif /* LV_HAVE_SSE4_1 */
  
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector,
-                                const float scalar, unsigned int num_points)
+static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector,
+                                                   const int16_t* inputVector,
+                                                   const float scalar,
+                                                   unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  float* outputVectorPtr = outputVector;
-  __m128 invScalar = _mm_set_ps1(1.0/scalar);
-  int16_t* inputPtr = (int16_t*)inputVector;
-  __m128 ret;
-
-  for(;number < quarterPoints; number++){
-    ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));
-
-    ret = _mm_mul_ps(ret, invScalar);
-    _mm_storeu_ps(outputVectorPtr, ret);
-
-    inputPtr += 4;
-    outputVectorPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    outputVector[number] = (float)(inputVector[number]) / scalar;
-  }
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* outputVectorPtr = outputVector;
+    __m128 invScalar = _mm_set_ps1(1.0 / scalar);
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128 ret;
+
+    for (; number < quarterPoints; number++) {
+        ret = _mm_set_ps((float)(inputPtr[3]),
+                         (float)(inputPtr[2]),
+                         (float)(inputPtr[1]),
+                         (float)(inputPtr[0]));
+
+        ret = _mm_mul_ps(ret, invScalar);
+        _mm_storeu_ps(outputVectorPtr, ret);
+
+        inputPtr += 4;
+        outputVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        outputVector[number] = (float)(inputVector[number]) / scalar;
+    }
  }
  #endif /* LV_HAVE_SSE */
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector,
-                                    const float scalar, unsigned int num_points)
+static inline void volk_16i_s32f_convert_32f_a_generic(float* outputVector,
+                                                       const int16_t* inputVector,
+                                                       const float scalar,
+                                                       unsigned int num_points)
  {
-  float* outputVectorPtr = outputVector;
-  const int16_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
+    float* outputVectorPtr = outputVector;
+    const int16_t* inputVectorPtr = inputVector;
+    unsigned int number = 0;
  
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
-  }
+    for (number = 0; number < num_points; number++) {
+        *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
diff --git a/kernels/volk/volk_16i_x4_quad_max_star_16i.h b/kernels/volk/volk_16i_x4_quad_max_star_16i.h

index 6aa74c70a103f5ba83d2a1a5ae82b8d34f1af2a7..619cc90bd7fd415e3a4f275b220f029a1abc2ee9 100644 (file)
--- a/kernels/volk/volk_16i_x4_quad_max_star_16i.h
+++ b/kernels/volk/volk_16i_x4_quad_max_star_16i.h
@@ -29,8 +29,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_16i_x4_quad_max_star_16i(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points)
- * \endcode
+ * void volk_16i_x4_quad_max_star_16i(short* target, short* src0, short* src1, short*
+ * src2, short* src3, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li src0: The input vector 0.
@@ -55,149 +55,152 @@
  #ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
  #define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
  
-#include<inttypes.h>
-#include<stdio.h>
+#include <inttypes.h>
+#include <stdio.h>
  
  #ifdef LV_HAVE_SSE2
  
-#include<emmintrin.h>
+#include <emmintrin.h>
  
-static inline void
-volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* src0, short* src1,
-                                     short* src2, short* src3, unsigned int num_points)
+static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target,
+                                                        short* src0,
+                                                        short* src1,
+                                                        short* src2,
+                                                        short* src3,
+                                                        unsigned int num_points)
  {
-  const unsigned int num_bytes = num_points*2;
-
-  int i = 0;
+    const unsigned int num_bytes = num_points * 2;
  
-  int bound = (num_bytes >> 4);
-  int bound_copy = bound;
-  int leftovers = (num_bytes >> 1) & 7;
+    int i = 0;
  
-  __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3;
-  p_target = (__m128i*) target;
-  p_src0 =  (__m128i*)src0;
-  p_src1 =  (__m128i*)src1;
-  p_src2 =  (__m128i*)src2;
-  p_src3 =  (__m128i*)src3;
+    int bound = (num_bytes >> 4);
+    int bound_copy = bound;
+    int leftovers = (num_bytes >> 1) & 7;
  
-  __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
+    __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3;
+    p_target = (__m128i*)target;
+    p_src0 = (__m128i*)src0;
+    p_src1 = (__m128i*)src1;
+    p_src2 = (__m128i*)src2;
+    p_src3 = (__m128i*)src3;
  
-  while(bound_copy > 0) {
-    xmm1 = _mm_load_si128(p_src0);
-    xmm2 = _mm_load_si128(p_src1);
-    xmm3 = _mm_load_si128(p_src2);
-    xmm4 = _mm_load_si128(p_src3);
+    __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
  
-    xmm5 = _mm_setzero_si128();
-    xmm6 = _mm_setzero_si128();
-    xmm7 = xmm1;
-    xmm8 = xmm3;
+    while (bound_copy > 0) {
+        xmm1 = _mm_load_si128(p_src0);
+        xmm2 = _mm_load_si128(p_src1);
+        xmm3 = _mm_load_si128(p_src2);
+        xmm4 = _mm_load_si128(p_src3);
  
-    xmm1 = _mm_sub_epi16(xmm2, xmm1);
+        xmm5 = _mm_setzero_si128();
+        xmm6 = _mm_setzero_si128();
+        xmm7 = xmm1;
+        xmm8 = xmm3;
  
-    xmm3 = _mm_sub_epi16(xmm4, xmm3);
+        xmm1 = _mm_sub_epi16(xmm2, xmm1);
  
-    xmm5 = _mm_cmpgt_epi16(xmm1, xmm5);
-    xmm6 = _mm_cmpgt_epi16(xmm3, xmm6);
+        xmm3 = _mm_sub_epi16(xmm4, xmm3);
  
-    xmm2 = _mm_and_si128(xmm5, xmm2);
-    xmm4 = _mm_and_si128(xmm6, xmm4);
-    xmm5 = _mm_andnot_si128(xmm5, xmm7);
-    xmm6 = _mm_andnot_si128(xmm6, xmm8);
+        xmm5 = _mm_cmpgt_epi16(xmm1, xmm5);
+        xmm6 = _mm_cmpgt_epi16(xmm3, xmm6);
  
-    xmm5 = _mm_add_epi16(xmm2, xmm5);
-    xmm6 = _mm_add_epi16(xmm4, xmm6);
+        xmm2 = _mm_and_si128(xmm5, xmm2);
+        xmm4 = _mm_and_si128(xmm6, xmm4);
+        xmm5 = _mm_andnot_si128(xmm5, xmm7);
+        xmm6 = _mm_andnot_si128(xmm6, xmm8);
  
-    xmm1 = _mm_xor_si128(xmm1, xmm1);
-    xmm2 = xmm5;
-    xmm5 = _mm_sub_epi16(xmm6, xmm5);
-    p_src0 += 1;
-    bound_copy -= 1;
+        xmm5 = _mm_add_epi16(xmm2, xmm5);
+        xmm6 = _mm_add_epi16(xmm4, xmm6);
  
-    xmm1 = _mm_cmpgt_epi16(xmm5, xmm1);
-    p_src1 += 1;
+        xmm1 = _mm_xor_si128(xmm1, xmm1);
+        xmm2 = xmm5;
+        xmm5 = _mm_sub_epi16(xmm6, xmm5);
+        p_src0 += 1;
+        bound_copy -= 1;
  
-    xmm6 = _mm_and_si128(xmm1, xmm6);
+        xmm1 = _mm_cmpgt_epi16(xmm5, xmm1);
+        p_src1 += 1;
  
-    xmm1 = _mm_andnot_si128(xmm1, xmm2);
-    p_src2 += 1;
+        xmm6 = _mm_and_si128(xmm1, xmm6);
  
-    xmm1 = _mm_add_epi16(xmm6, xmm1);
-    p_src3 += 1;
+        xmm1 = _mm_andnot_si128(xmm1, xmm2);
+        p_src2 += 1;
  
-    _mm_store_si128(p_target, xmm1);
-    p_target += 1;
+        xmm1 = _mm_add_epi16(xmm6, xmm1);
+        p_src3 += 1;
  
-  }
+        _mm_store_si128(p_target, xmm1);
+        p_target += 1;
+    }
  
  
-  /*__VOLK_ASM __VOLK_VOLATILE
-    (
-    "volk_16i_x4_quad_max_star_16i_a_sse2_L1:\n\t"
-    "cmp $0, %[bound]\n\t"
-    "je volk_16i_x4_quad_max_star_16i_a_sse2_END\n\t"
+    /*__VOLK_ASM __VOLK_VOLATILE
+      (
+      "volk_16i_x4_quad_max_star_16i_a_sse2_L1:\n\t"
+      "cmp $0, %[bound]\n\t"
+      "je volk_16i_x4_quad_max_star_16i_a_sse2_END\n\t"
  
-    "movaps (%[src0]), %%xmm1\n\t"
-    "movaps (%[src1]), %%xmm2\n\t"
-    "movaps (%[src2]), %%xmm3\n\t"
-    "movaps (%[src3]), %%xmm4\n\t"
+      "movaps (%[src0]), %%xmm1\n\t"
+      "movaps (%[src1]), %%xmm2\n\t"
+      "movaps (%[src2]), %%xmm3\n\t"
+      "movaps (%[src3]), %%xmm4\n\t"
  
-    "pxor %%xmm5, %%xmm5\n\t"
-    "pxor %%xmm6, %%xmm6\n\t"
-    "movaps %%xmm1, %%xmm7\n\t"
-    "movaps %%xmm3, %%xmm8\n\t"
-    "psubw %%xmm2, %%xmm1\n\t"
-    "psubw %%xmm4, %%xmm3\n\t"
+      "pxor %%xmm5, %%xmm5\n\t"
+      "pxor %%xmm6, %%xmm6\n\t"
+      "movaps %%xmm1, %%xmm7\n\t"
+      "movaps %%xmm3, %%xmm8\n\t"
+      "psubw %%xmm2, %%xmm1\n\t"
+      "psubw %%xmm4, %%xmm3\n\t"
  
-    "pcmpgtw %%xmm1, %%xmm5\n\t"
-    "pcmpgtw %%xmm3, %%xmm6\n\t"
+      "pcmpgtw %%xmm1, %%xmm5\n\t"
+      "pcmpgtw %%xmm3, %%xmm6\n\t"
  
-    "pand %%xmm5, %%xmm2\n\t"
-    "pand %%xmm6, %%xmm4\n\t"
-    "pandn %%xmm7, %%xmm5\n\t"
-    "pandn %%xmm8, %%xmm6\n\t"
+      "pand %%xmm5, %%xmm2\n\t"
+      "pand %%xmm6, %%xmm4\n\t"
+      "pandn %%xmm7, %%xmm5\n\t"
+      "pandn %%xmm8, %%xmm6\n\t"
  
-    "paddw %%xmm2, %%xmm5\n\t"
-    "paddw %%xmm4, %%xmm6\n\t"
+      "paddw %%xmm2, %%xmm5\n\t"
+      "paddw %%xmm4, %%xmm6\n\t"
  
-    "pxor %%xmm1, %%xmm1\n\t"
-    "movaps %%xmm5, %%xmm2\n\t"
+      "pxor %%xmm1, %%xmm1\n\t"
+      "movaps %%xmm5, %%xmm2\n\t"
  
-    "psubw %%xmm6, %%xmm5\n\t"
-    "add $16, %[src0]\n\t"
-    "add $-1, %[bound]\n\t"
+      "psubw %%xmm6, %%xmm5\n\t"
+      "add $16, %[src0]\n\t"
+      "add $-1, %[bound]\n\t"
  
-    "pcmpgtw %%xmm5, %%xmm1\n\t"
-    "add $16, %[src1]\n\t"
+      "pcmpgtw %%xmm5, %%xmm1\n\t"
+      "add $16, %[src1]\n\t"
  
-    "pand %%xmm1, %%xmm6\n\t"
+      "pand %%xmm1, %%xmm6\n\t"
  
-    "pandn %%xmm2, %%xmm1\n\t"
-    "add $16, %[src2]\n\t"
+      "pandn %%xmm2, %%xmm1\n\t"
+      "add $16, %[src2]\n\t"
  
-    "paddw %%xmm6, %%xmm1\n\t"
-    "add $16, %[src3]\n\t"
+      "paddw %%xmm6, %%xmm1\n\t"
+      "add $16, %[src3]\n\t"
  
-    "movaps %%xmm1, (%[target])\n\t"
-    "addw $16, %[target]\n\t"
-    "jmp volk_16i_x4_quad_max_star_16i_a_sse2_L1\n\t"
+      "movaps %%xmm1, (%[target])\n\t"
+      "addw $16, %[target]\n\t"
+      "jmp volk_16i_x4_quad_max_star_16i_a_sse2_L1\n\t"
  
-    "volk_16i_x4_quad_max_star_16i_a_sse2_END:\n\t"
-    :
-    :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target)
-    :
-    );
-  */
+      "volk_16i_x4_quad_max_star_16i_a_sse2_END:\n\t"
+      :
+      :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2),
+      [src3]"r"(src3), [target]"r"(target)
+      :
+      );
+    */
  
-  short temp0 = 0;
-  short temp1 = 0;
-  for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
-    temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
-    temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i];
-    target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
-  }
-  return;
+    short temp0 = 0;
+    short temp1 = 0;
+    for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+        temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
+        temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
+        target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
+    }
+    return;
  }
  
  #endif /*LV_HAVE_SSE2*/
@@ -206,85 +209,91 @@ volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* src0, short* src1,
  
  #include <arm_neon.h>
  
-static inline void
-volk_16i_x4_quad_max_star_16i_neon(short* target, short* src0, short* src1,
-                                   short* src2, short* src3, unsigned int num_points)
+static inline void volk_16i_x4_quad_max_star_16i_neon(short* target,
+                                                      short* src0,
+                                                      short* src1,
+                                                      short* src2,
+                                                      short* src3,
+                                                      unsigned int num_points)
  {
-  const unsigned int eighth_points = num_points / 8;
-  unsigned i;
-
-  int16x8_t src0_vec, src1_vec, src2_vec, src3_vec;
-  int16x8_t diff12, diff34;
-  int16x8_t comp0, comp1, comp2, comp3;
-  int16x8_t result1_vec, result2_vec;
-  int16x8_t zeros;
-  zeros = vdupq_n_s16(0);
-  for(i=0; i < eighth_points; ++i) {
-    src0_vec = vld1q_s16(src0);
-    src1_vec = vld1q_s16(src1);
-    src2_vec = vld1q_s16(src2);
-    src3_vec = vld1q_s16(src3);
-    diff12 = vsubq_s16(src0_vec, src1_vec);
-    diff34  = vsubq_s16(src2_vec, src3_vec);
-    comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
-    comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
-    comp2 = (int16x8_t)vcgeq_s16(diff34, zeros);
-    comp3 = (int16x8_t)vcltq_s16(diff34, zeros);
-    comp0 = vandq_s16(src0_vec, comp0);
-    comp1 = vandq_s16(src1_vec, comp1);
-    comp2 = vandq_s16(src2_vec, comp2);
-    comp3 = vandq_s16(src3_vec, comp3);
-
-    result1_vec = vaddq_s16(comp0, comp1);
-    result2_vec = vaddq_s16(comp2, comp3);
-
-    diff12 = vsubq_s16(result1_vec, result2_vec);
-    comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
-    comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
-    comp0 = vandq_s16(result1_vec, comp0);
-    comp1 = vandq_s16(result2_vec, comp1);
-    result1_vec = vaddq_s16(comp0, comp1);
-    vst1q_s16(target, result1_vec);
-    src0 += 8;
-    src1 += 8;
-    src2 += 8;
-    src3 += 8;
-    target += 8;
+    const unsigned int eighth_points = num_points / 8;
+    unsigned i;
+
+    int16x8_t src0_vec, src1_vec, src2_vec, src3_vec;
+    int16x8_t diff12, diff34;
+    int16x8_t comp0, comp1, comp2, comp3;
+    int16x8_t result1_vec, result2_vec;
+    int16x8_t zeros;
+    zeros = vdupq_n_s16(0);
+    for (i = 0; i < eighth_points; ++i) {
+        src0_vec = vld1q_s16(src0);
+        src1_vec = vld1q_s16(src1);
+        src2_vec = vld1q_s16(src2);
+        src3_vec = vld1q_s16(src3);
+        diff12 = vsubq_s16(src0_vec, src1_vec);
+        diff34 = vsubq_s16(src2_vec, src3_vec);
+        comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
+        comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
+        comp2 = (int16x8_t)vcgeq_s16(diff34, zeros);
+        comp3 = (int16x8_t)vcltq_s16(diff34, zeros);
+        comp0 = vandq_s16(src0_vec, comp0);
+        comp1 = vandq_s16(src1_vec, comp1);
+        comp2 = vandq_s16(src2_vec, comp2);
+        comp3 = vandq_s16(src3_vec, comp3);
+
+        result1_vec = vaddq_s16(comp0, comp1);
+        result2_vec = vaddq_s16(comp2, comp3);
+
+        diff12 = vsubq_s16(result1_vec, result2_vec);
+        comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
+        comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
+        comp0 = vandq_s16(result1_vec, comp0);
+        comp1 = vandq_s16(result2_vec, comp1);
+        result1_vec = vaddq_s16(comp0, comp1);
+        vst1q_s16(target, result1_vec);
+        src0 += 8;
+        src1 += 8;
+        src2 += 8;
+        src3 += 8;
+        target += 8;
      }
  
-  short temp0 = 0;
-  short temp1 = 0;
-  for(i=eighth_points*8; i < num_points; ++i) {
-    temp0 = ((short)(*src0 - *src1) > 0) ? *src0 : *src1;
-    temp1 = ((short)(*src2 - *src3) > 0) ? *src2 : *src3;
-    *target++ = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
-    src0++;
-    src1++;
-    src2++;
-    src3++;
-  }
+    short temp0 = 0;
+    short temp1 = 0;
+    for (i = eighth_points * 8; i < num_points; ++i) {
+        temp0 = ((short)(*src0 - *src1) > 0) ? *src0 : *src1;
+        temp1 = ((short)(*src2 - *src3) > 0) ? *src2 : *src3;
+        *target++ = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
+        src0++;
+        src1++;
+        src2++;
+        src3++;
+    }
  }
  #endif /* LV_HAVE_NEON */
  
  
  #ifdef LV_HAVE_GENERIC
-static inline void
-volk_16i_x4_quad_max_star_16i_generic(short* target, short* src0, short* src1,
-                                      short* src2, short* src3, unsigned int num_points)
+static inline void volk_16i_x4_quad_max_star_16i_generic(short* target,
+                                                         short* src0,
+                                                         short* src1,
+                                                         short* src2,
+                                                         short* src3,
+                                                         unsigned int num_points)
  {
-  const unsigned int num_bytes = num_points*2;
+    const unsigned int num_bytes = num_points * 2;
  
-  int i = 0;
+    int i = 0;
  
-  int bound = num_bytes >> 1;
+    int bound = num_bytes >> 1;
  
-  short temp0 = 0;
-  short temp1 = 0;
-  for(i = 0; i < bound; ++i) {
-    temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
-    temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i];
-    target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
-  }
+    short temp0 = 0;
+    short temp1 = 0;
+    for (i = 0; i < bound; ++i) {
+        temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
+        temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
+        target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
+    }
  }
  
  #endif /*LV_HAVE_GENERIC*/
diff --git a/kernels/volk/volk_16i_x5_add_quad_16i_x4.h b/kernels/volk/volk_16i_x5_add_quad_16i_x4.h

index 30417dea1b55e872942997c2240c9e538d91aa22..f735f1182c0c41815fd84666dd02b367df33fa36 100644 (file)
--- a/kernels/volk/volk_16i_x5_add_quad_16i_x4.h
+++ b/kernels/volk/volk_16i_x5_add_quad_16i_x4.h
@@ -29,8 +29,9 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_16i_x5_add_quad_16i_x4(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_points);
- * \endcode
+ * void volk_16i_x5_add_quad_16i_x4(short* target0, short* target1, short* target2, short*
+ * target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int
+ * num_points); \endcode
   *
   * \b Inputs
   * \li src0: The input vector 0.
@@ -59,182 +60,203 @@
  #ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
  #define INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
  
-#include<inttypes.h>
-#include<stdio.h>
+#include <inttypes.h>
+#include <stdio.h>
  
  #ifdef LV_HAVE_SSE2
-#include<xmmintrin.h>
-#include<emmintrin.h>
+#include <emmintrin.h>
+#include <xmmintrin.h>
  
-static inline void
-volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* target1, short* target2, short* target3,
-                                   short* src0, short* src1, short* src2, short* src3, short* src4,
-                                   unsigned int num_points)
+static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0,
+                                                      short* target1,
+                                                      short* target2,
+                                                      short* target3,
+                                                      short* src0,
+                                                      short* src1,
+                                                      short* src2,
+                                                      short* src3,
+                                                      short* src4,
+                                                      unsigned int num_points)
  {
-  const unsigned int num_bytes = num_points*2;
-
-  __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
-  __m128i *p_target0, *p_target1, *p_target2, *p_target3,  *p_src0, *p_src1, *p_src2, *p_src3, *p_src4;
-  p_target0 = (__m128i*)target0;
-  p_target1 = (__m128i*)target1;
-  p_target2 = (__m128i*)target2;
-  p_target3 = (__m128i*)target3;
-
-  p_src0 = (__m128i*)src0;
-  p_src1 = (__m128i*)src1;
-  p_src2 = (__m128i*)src2;
-  p_src3 = (__m128i*)src3;
-  p_src4 = (__m128i*)src4;
-
-  int i = 0;
-
-  int bound = (num_bytes >> 4);
-  int leftovers = (num_bytes >> 1) & 7;
-
-  for(; i < bound; ++i) {
-    xmm0 = _mm_load_si128(p_src0);
-    xmm1 = _mm_load_si128(p_src1);
-    xmm2 = _mm_load_si128(p_src2);
-    xmm3 = _mm_load_si128(p_src3);
-    xmm4 = _mm_load_si128(p_src4);
-
-    p_src0 += 1;
-    p_src1 += 1;
-
-    xmm1 = _mm_add_epi16(xmm0, xmm1);
-    xmm2 = _mm_add_epi16(xmm0, xmm2);
-    xmm3 = _mm_add_epi16(xmm0, xmm3);
-    xmm4 = _mm_add_epi16(xmm0, xmm4);
-
-
-    p_src2 += 1;
-    p_src3 += 1;
-    p_src4 += 1;
-
-    _mm_store_si128(p_target0, xmm1);
-    _mm_store_si128(p_target1, xmm2);
-    _mm_store_si128(p_target2, xmm3);
-    _mm_store_si128(p_target3, xmm4);
-
-    p_target0 += 1;
-    p_target1 += 1;
-    p_target2 += 1;
-    p_target3 += 1;
-  }
-  /*__VOLK_ASM __VOLK_VOLATILE
-    (
-    ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1:\n\t"
-    "cmp $0, %[bound]\n\t"
-    "je .%=volk_16i_x5_add_quad_16i_x4_a_sse2_END\n\t"
-    "movaps (%[src0]), %%xmm1\n\t"
-    "movaps (%[src1]), %%xmm2\n\t"
-    "movaps (%[src2]), %%xmm3\n\t"
-    "movaps (%[src3]), %%xmm4\n\t"
-    "movaps (%[src4]), %%xmm5\n\t"
-    "add $16, %[src0]\n\t"
-    "add $16, %[src1]\n\t"
-    "add $16, %[src2]\n\t"
-    "add $16, %[src3]\n\t"
-    "add $16, %[src4]\n\t"
-    "paddw %%xmm1, %%xmm2\n\t"
-    "paddw %%xmm1, %%xmm3\n\t"
-    "paddw %%xmm1, %%xmm4\n\t"
-    "paddw %%xmm1, %%xmm5\n\t"
-    "add $-1, %[bound]\n\t"
-    "movaps %%xmm2, (%[target0])\n\t"
-    "movaps %%xmm3, (%[target1])\n\t"
-    "movaps %%xmm4, (%[target2])\n\t"
-    "movaps %%xmm5, (%[target3])\n\t"
-    "add $16, %[target0]\n\t"
-    "add $16, %[target1]\n\t"
-    "add $16, %[target2]\n\t"
-    "add $16, %[target3]\n\t"
-    "jmp .%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1\n\t"
-    ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_END:\n\t"
-    :
-    :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3)
-    :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-    );
-  */
-
-  for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
-    target0[i] = src0[i] + src1[i];
-    target1[i] = src0[i] + src2[i];
-    target2[i] = src0[i] + src3[i];
-    target3[i] = src0[i] + src4[i];
-  }
+    const unsigned int num_bytes = num_points * 2;
+
+    __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
+    __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2,
+        *p_src3, *p_src4;
+    p_target0 = (__m128i*)target0;
+    p_target1 = (__m128i*)target1;
+    p_target2 = (__m128i*)target2;
+    p_target3 = (__m128i*)target3;
+
+    p_src0 = (__m128i*)src0;
+    p_src1 = (__m128i*)src1;
+    p_src2 = (__m128i*)src2;
+    p_src3 = (__m128i*)src3;
+    p_src4 = (__m128i*)src4;
+
+    int i = 0;
+
+    int bound = (num_bytes >> 4);
+    int leftovers = (num_bytes >> 1) & 7;
+
+    for (; i < bound; ++i) {
+        xmm0 = _mm_load_si128(p_src0);
+        xmm1 = _mm_load_si128(p_src1);
+        xmm2 = _mm_load_si128(p_src2);
+        xmm3 = _mm_load_si128(p_src3);
+        xmm4 = _mm_load_si128(p_src4);
+
+        p_src0 += 1;
+        p_src1 += 1;
+
+        xmm1 = _mm_add_epi16(xmm0, xmm1);
+        xmm2 = _mm_add_epi16(xmm0, xmm2);
+        xmm3 = _mm_add_epi16(xmm0, xmm3);
+        xmm4 = _mm_add_epi16(xmm0, xmm4);
+
+
+        p_src2 += 1;
+        p_src3 += 1;
+        p_src4 += 1;
+
+        _mm_store_si128(p_target0, xmm1);
+        _mm_store_si128(p_target1, xmm2);
+        _mm_store_si128(p_target2, xmm3);
+        _mm_store_si128(p_target3, xmm4);
+
+        p_target0 += 1;
+        p_target1 += 1;
+        p_target2 += 1;
+        p_target3 += 1;
+    }
+    /*__VOLK_ASM __VOLK_VOLATILE
+      (
+      ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1:\n\t"
+      "cmp $0, %[bound]\n\t"
+      "je .%=volk_16i_x5_add_quad_16i_x4_a_sse2_END\n\t"
+      "movaps (%[src0]), %%xmm1\n\t"
+      "movaps (%[src1]), %%xmm2\n\t"
+      "movaps (%[src2]), %%xmm3\n\t"
+      "movaps (%[src3]), %%xmm4\n\t"
+      "movaps (%[src4]), %%xmm5\n\t"
+      "add $16, %[src0]\n\t"
+      "add $16, %[src1]\n\t"
+      "add $16, %[src2]\n\t"
+      "add $16, %[src3]\n\t"
+      "add $16, %[src4]\n\t"
+      "paddw %%xmm1, %%xmm2\n\t"
+      "paddw %%xmm1, %%xmm3\n\t"
+      "paddw %%xmm1, %%xmm4\n\t"
+      "paddw %%xmm1, %%xmm5\n\t"
+      "add $-1, %[bound]\n\t"
+      "movaps %%xmm2, (%[target0])\n\t"
+      "movaps %%xmm3, (%[target1])\n\t"
+      "movaps %%xmm4, (%[target2])\n\t"
+      "movaps %%xmm5, (%[target3])\n\t"
+      "add $16, %[target0]\n\t"
+      "add $16, %[target1]\n\t"
+      "add $16, %[target2]\n\t"
+      "add $16, %[target3]\n\t"
+      "jmp .%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1\n\t"
+      ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_END:\n\t"
+      :
+      :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2),
+      [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1),
+      [target2]"r"(target2), [target3]"r"(target3)
+      :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+      );
+    */
+
+    for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+        target0[i] = src0[i] + src1[i];
+        target1[i] = src0[i] + src2[i];
+        target2[i] = src0[i] + src3[i];
+        target3[i] = src0[i] + src4[i];
+    }
  }
  #endif /*LV_HAVE_SSE2*/
  
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void
-volk_16i_x5_add_quad_16i_x4_neon(short* target0, short* target1, short* target2, short* target3,
-                                 short* src0, short* src1, short* src2, short* src3, short* src4,
-                                 unsigned int num_points)
+static inline void volk_16i_x5_add_quad_16i_x4_neon(short* target0,
+                                                    short* target1,
+                                                    short* target2,
+                                                    short* target3,
+                                                    short* src0,
+                                                    short* src1,
+                                                    short* src2,
+                                                    short* src3,
+                                                    short* src4,
+                                                    unsigned int num_points)
  {
-  const unsigned int eighth_points = num_points / 8;
-  unsigned int number = 0;
-
-  int16x8_t src0_vec, src1_vec, src2_vec, src3_vec, src4_vec;
-  int16x8_t target0_vec, target1_vec, target2_vec, target3_vec;
-  for(number = 0; number < eighth_points; ++number) {
-    src0_vec = vld1q_s16(src0);
-    src1_vec = vld1q_s16(src1);
-    src2_vec = vld1q_s16(src2);
-    src3_vec = vld1q_s16(src3);
-    src4_vec = vld1q_s16(src4);
-
-    target0_vec = vaddq_s16(src0_vec , src1_vec);
-    target1_vec = vaddq_s16(src0_vec , src2_vec);
-    target2_vec = vaddq_s16(src0_vec , src3_vec);
-    target3_vec = vaddq_s16(src0_vec , src4_vec);
-
-    vst1q_s16(target0, target0_vec);
-    vst1q_s16(target1, target1_vec);
-    vst1q_s16(target2, target2_vec);
-    vst1q_s16(target3, target3_vec);
-    src0 += 8;
-    src1 += 8;
-    src2 += 8;
-    src3 += 8;
-    src4 += 8;
-    target0 += 8;
-    target1 += 8;
-    target2 += 8;
-    target3 += 8;
-  }
-
-  for(number = eighth_points * 8; number < num_points; ++number) {
-    *target0++ = *src0 + *src1++;
-    *target1++ = *src0 + *src2++;
-    *target2++ = *src0 + *src3++;
-    *target3++ = *src0++ + *src4++;
-  }
+    const unsigned int eighth_points = num_points / 8;
+    unsigned int number = 0;
+
+    int16x8_t src0_vec, src1_vec, src2_vec, src3_vec, src4_vec;
+    int16x8_t target0_vec, target1_vec, target2_vec, target3_vec;
+    for (number = 0; number < eighth_points; ++number) {
+        src0_vec = vld1q_s16(src0);
+        src1_vec = vld1q_s16(src1);
+        src2_vec = vld1q_s16(src2);
+        src3_vec = vld1q_s16(src3);
+        src4_vec = vld1q_s16(src4);
+
+        target0_vec = vaddq_s16(src0_vec, src1_vec);
+        target1_vec = vaddq_s16(src0_vec, src2_vec);
+        target2_vec = vaddq_s16(src0_vec, src3_vec);
+        target3_vec = vaddq_s16(src0_vec, src4_vec);
+
+        vst1q_s16(target0, target0_vec);
+        vst1q_s16(target1, target1_vec);
+        vst1q_s16(target2, target2_vec);
+        vst1q_s16(target3, target3_vec);
+        src0 += 8;
+        src1 += 8;
+        src2 += 8;
+        src3 += 8;
+        src4 += 8;
+        target0 += 8;
+        target1 += 8;
+        target2 += 8;
+        target3 += 8;
+    }
+
+    for (number = eighth_points * 8; number < num_points; ++number) {
+        *target0++ = *src0 + *src1++;
+        *target1++ = *src0 + *src2++;
+        *target2++ = *src0 + *src3++;
+        *target3++ = *src0++ + *src4++;
+    }
  }
  
  #endif /* LV_HAVE_NEON */
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_16i_x5_add_quad_16i_x4_generic(short* target0, short* target1, short* target2, short* target3,
-                                    short* src0, short* src1, short* src2, short* src3, short* src4,
-                                    unsigned int num_points)
+static inline void volk_16i_x5_add_quad_16i_x4_generic(short* target0,
+                                                       short* target1,
+                                                       short* target2,
+                                                       short* target3,
+                                                       short* src0,
+                                                       short* src1,
+                                                       short* src2,
+                                                       short* src3,
+                                                       short* src4,
+                                                       unsigned int num_points)
  {
-  const unsigned int num_bytes = num_points*2;
+    const unsigned int num_bytes = num_points * 2;
  
-  int i = 0;
+    int i = 0;
  
-  int bound = num_bytes >> 1;
+    int bound = num_bytes >> 1;
  
-  for(i = 0; i < bound; ++i) {
-    target0[i] = src0[i] + src1[i];
-    target1[i] = src0[i] + src2[i];
-    target2[i] = src0[i] + src3[i];
-    target3[i] = src0[i] + src4[i];
-  }
+    for (i = 0; i < bound; ++i) {
+        target0[i] = src0[i] + src1[i];
+        target1[i] = src0[i] + src2[i];
+        target2[i] = src0[i] + src3[i];
+        target3[i] = src0[i] + src4[i];
+    }
  }
  
  #endif /* LV_HAVE_GENERIC */
diff --git a/kernels/volk/volk_16ic_convert_32fc.h b/kernels/volk/volk_16ic_convert_32fc.h

index 84f067c2d5b2da4bf127cc99fbb16a2da9a6116c..145372464e0a8cb87a2d12b0344202233f8a90c2 100644 (file)
--- a/kernels/volk/volk_16ic_convert_32fc.h
+++ b/kernels/volk/volk_16ic_convert_32fc.h
@@ -30,8 +30,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_16ic_convert_32fc(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
- * \endcode
+ * void volk_16ic_convert_32fc(lv_32fc_t* outputVector, const lv_16sc_t* inputVector,
+ * unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li inputVector:  The complex 16-bit integer input data buffer.
@@ -51,7 +51,9 @@
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void volk_16ic_convert_32fc_a_avx2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
+static inline void volk_16ic_convert_32fc_a_avx2(lv_32fc_t* outputVector,
+                                                 const lv_16sc_t* inputVector,
+                                                 unsigned int num_points)
  {
      const unsigned int avx_iters = num_points / 8;
      unsigned int number = 0;
@@ -61,36 +63,36 @@ static inline void volk_16ic_convert_32fc_a_avx2(lv_32fc_t* outputVector, const
      __m256i outValInt;
      __m128i cplxValue;
  
-    for(number = 0; number < avx_iters; number++)
-        {
-            cplxValue = _mm_load_si128((__m128i*)complexVectorPtr);
-            complexVectorPtr += 8;
-            
-            outValInt = _mm256_cvtepi16_epi32(cplxValue);
-            outVal = _mm256_cvtepi32_ps(outValInt);
-            _mm256_store_ps((float*)outputVectorPtr, outVal);
+    for (number = 0; number < avx_iters; number++) {
+        cplxValue = _mm_load_si128((__m128i*)complexVectorPtr);
+        complexVectorPtr += 8;
  
-            outputVectorPtr += 8;
-        }
+        outValInt = _mm256_cvtepi16_epi32(cplxValue);
+        outVal = _mm256_cvtepi32_ps(outValInt);
+        _mm256_store_ps((float*)outputVectorPtr, outVal);
+
+        outputVectorPtr += 8;
+    }
  
      number = avx_iters * 8;
-    for(; number < num_points*2; number++)
-        {
-            *outputVectorPtr++ = (float)*complexVectorPtr++;
-        }
+    for (; number < num_points * 2; number++) {
+        *outputVectorPtr++ = (float)*complexVectorPtr++;
+    }
  }
  
  #endif /* LV_HAVE_AVX2 */
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void volk_16ic_convert_32fc_generic(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
+static inline void volk_16ic_convert_32fc_generic(lv_32fc_t* outputVector,
+                                                  const lv_16sc_t* inputVector,
+                                                  unsigned int num_points)
  {
      unsigned int i;
-    for(i = 0; i < num_points; i++)
-        {
-            outputVector[i] = lv_cmake((float)lv_creal(inputVector[i]), (float)lv_cimag(inputVector[i]));
-        }
+    for (i = 0; i < num_points; i++) {
+        outputVector[i] =
+            lv_cmake((float)lv_creal(inputVector[i]), (float)lv_cimag(inputVector[i]));
+    }
  }
  
  #endif /* LV_HAVE_GENERIC */
@@ -99,7 +101,9 @@ static inline void volk_16ic_convert_32fc_generic(lv_32fc_t* outputVector, const
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void volk_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
+static inline void volk_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector,
+                                                 const lv_16sc_t* inputVector,
+                                                 unsigned int num_points)
  {
      const unsigned int sse_iters = num_points / 2;
  
@@ -108,18 +112,21 @@ static inline void volk_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector, const
      __m128 a;
      unsigned int number;
  
-    for(number = 0; number < sse_iters; number++)
-        {
-            a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
-            _mm_store_ps((float*)_out, a);
-            _in += 2;
-            _out += 2;
-        }
-    if (num_points & 1)
-        {
-            *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
-            _in++;
-        }
+    for (number = 0; number < sse_iters; number++) {
+        a = _mm_set_ps(
+            (float)(lv_cimag(_in[1])),
+            (float)(lv_creal(_in[1])),
+            (float)(lv_cimag(_in[0])),
+            (float)(lv_creal(
+                _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+        _mm_store_ps((float*)_out, a);
+        _in += 2;
+        _out += 2;
+    }
+    if (num_points & 1) {
+        *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
+        _in++;
+    }
  }
  
  #endif /* LV_HAVE_SSE2 */
@@ -127,7 +134,9 @@ static inline void volk_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector, const
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void volk_16ic_convert_32fc_a_avx(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
+static inline void volk_16ic_convert_32fc_a_avx(lv_32fc_t* outputVector,
+                                                const lv_16sc_t* inputVector,
+                                                unsigned int num_points)
  {
      const unsigned int sse_iters = num_points / 4;
  
@@ -136,19 +145,26 @@ static inline void volk_16ic_convert_32fc_a_avx(lv_32fc_t* outputVector, const l
      __m256 a;
      unsigned int i, number;
  
-    for(number = 0; number < sse_iters; number++)
-        {
-            a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
-            _mm256_store_ps((float*)_out, a);
-            _in += 4;
-            _out += 4;
-        }
+    for (number = 0; number < sse_iters; number++) {
+        a = _mm256_set_ps(
+            (float)(lv_cimag(_in[3])),
+            (float)(lv_creal(_in[3])),
+            (float)(lv_cimag(_in[2])),
+            (float)(lv_creal(_in[2])),
+            (float)(lv_cimag(_in[1])),
+            (float)(lv_creal(_in[1])),
+            (float)(lv_cimag(_in[0])),
+            (float)(lv_creal(
+                _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+        _mm256_store_ps((float*)_out, a);
+        _in += 4;
+        _out += 4;
+    }
      _mm256_zeroupper();
-    for (i = 0; i < (num_points % 4); ++i)
-        {
-            *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
-            _in++;
-        }
+    for (i = 0; i < (num_points % 4); ++i) {
+        *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
+        _in++;
+    }
  }
  
  #endif /* LV_HAVE_AVX */
@@ -157,7 +173,9 @@ static inline void volk_16ic_convert_32fc_a_avx(lv_32fc_t* outputVector, const l
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
+static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector,
+                                               const lv_16sc_t* inputVector,
+                                               unsigned int num_points)
  {
      const unsigned int sse_iters = num_points / 2;
  
@@ -169,21 +187,19 @@ static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector, const lv
      float32x4_t f32x4;
      unsigned int i, number;
  
-    for(number = 0; number < sse_iters; number++)
-        {
-            a16x4 = vld1_s16((const int16_t*)_in);
-            __VOLK_PREFETCH(_in + 4);
-            a32x4 = vmovl_s16(a16x4);
-            f32x4 = vcvtq_f32_s32(a32x4);
-            vst1q_f32((float32_t*)_out, f32x4);
-            _in += 2;
-            _out += 2;
-        }
-    for (i = 0; i < (num_points % 2); ++i)
-        {
-            *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
-            _in++;
-        }
+    for (number = 0; number < sse_iters; number++) {
+        a16x4 = vld1_s16((const int16_t*)_in);
+        __VOLK_PREFETCH(_in + 4);
+        a32x4 = vmovl_s16(a16x4);
+        f32x4 = vcvtq_f32_s32(a32x4);
+        vst1q_f32((float32_t*)_out, f32x4);
+        _in += 2;
+        _out += 2;
+    }
+    for (i = 0; i < (num_points % 2); ++i) {
+        *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
+        _in++;
+    }
  }
  #endif /* LV_HAVE_NEON */
  
@@ -198,7 +214,9 @@ static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector, const lv
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void volk_16ic_convert_32fc_u_avx2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
+static inline void volk_16ic_convert_32fc_u_avx2(lv_32fc_t* outputVector,
+                                                 const lv_16sc_t* inputVector,
+                                                 unsigned int num_points)
  {
      const unsigned int avx_iters = num_points / 8;
      unsigned int number = 0;
@@ -208,23 +226,21 @@ static inline void volk_16ic_convert_32fc_u_avx2(lv_32fc_t* outputVector, const
      __m256i outValInt;
      __m128i cplxValue;
  
-    for(number = 0; number < avx_iters; number++)
-        {
-            cplxValue = _mm_loadu_si128((__m128i*)complexVectorPtr);
-            complexVectorPtr += 8;
-            
-            outValInt = _mm256_cvtepi16_epi32(cplxValue);
-            outVal = _mm256_cvtepi32_ps(outValInt);
-            _mm256_storeu_ps((float*)outputVectorPtr, outVal);
+    for (number = 0; number < avx_iters; number++) {
+        cplxValue = _mm_loadu_si128((__m128i*)complexVectorPtr);
+        complexVectorPtr += 8;
+
+        outValInt = _mm256_cvtepi16_epi32(cplxValue);
+        outVal = _mm256_cvtepi32_ps(outValInt);
+        _mm256_storeu_ps((float*)outputVectorPtr, outVal);
  
-            outputVectorPtr += 8;
-        }
+        outputVectorPtr += 8;
+    }
  
      number = avx_iters * 8;
-    for(; number < num_points*2; number++)
-        {
-            *outputVectorPtr++ = (float)*complexVectorPtr++;
-        }
+    for (; number < num_points * 2; number++) {
+        *outputVectorPtr++ = (float)*complexVectorPtr++;
+    }
  }
  
  #endif /* LV_HAVE_AVX2 */
@@ -232,7 +248,9 @@ static inline void volk_16ic_convert_32fc_u_avx2(lv_32fc_t* outputVector, const
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void volk_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
+static inline void volk_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector,
+                                                 const lv_16sc_t* inputVector,
+                                                 unsigned int num_points)
  {
      const unsigned int sse_iters = num_points / 2;
  
@@ -241,18 +259,21 @@ static inline void volk_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector, const
      __m128 a;
      unsigned int number;
  
-    for(number = 0; number < sse_iters; number++)
-        {
-            a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
-            _mm_storeu_ps((float*)_out, a);
-            _in += 2;
-            _out += 2;
-        }
-    if (num_points & 1)
-        {
-            *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
-            _in++;
-        }
+    for (number = 0; number < sse_iters; number++) {
+        a = _mm_set_ps(
+            (float)(lv_cimag(_in[1])),
+            (float)(lv_creal(_in[1])),
+            (float)(lv_cimag(_in[0])),
+            (float)(lv_creal(
+                _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+        _mm_storeu_ps((float*)_out, a);
+        _in += 2;
+        _out += 2;
+    }
+    if (num_points & 1) {
+        *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
+        _in++;
+    }
  }
  
  #endif /* LV_HAVE_SSE2 */
@@ -261,7 +282,9 @@ static inline void volk_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector, const
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void volk_16ic_convert_32fc_u_avx(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
+static inline void volk_16ic_convert_32fc_u_avx(lv_32fc_t* outputVector,
+                                                const lv_16sc_t* inputVector,
+                                                unsigned int num_points)
  {
      const unsigned int sse_iters = num_points / 4;
  
@@ -270,21 +293,27 @@ static inline void volk_16ic_convert_32fc_u_avx(lv_32fc_t* outputVector, const l
      __m256 a;
      unsigned int i, number;
  
-    for(number = 0; number < sse_iters; number++)
-        {
-            a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
-            _mm256_storeu_ps((float*)_out, a);
-            _in += 4;
-            _out += 4;
-        }
+    for (number = 0; number < sse_iters; number++) {
+        a = _mm256_set_ps(
+            (float)(lv_cimag(_in[3])),
+            (float)(lv_creal(_in[3])),
+            (float)(lv_cimag(_in[2])),
+            (float)(lv_creal(_in[2])),
+            (float)(lv_cimag(_in[1])),
+            (float)(lv_creal(_in[1])),
+            (float)(lv_cimag(_in[0])),
+            (float)(lv_creal(
+                _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+        _mm256_storeu_ps((float*)_out, a);
+        _in += 4;
+        _out += 4;
+    }
      _mm256_zeroupper();
-    for (i = 0; i < (num_points % 4); ++i)
-        {
-            *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
-            _in++;
-        }
+    for (i = 0; i < (num_points % 4); ++i) {
+        *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
+        _in++;
+    }
  }
  
  #endif /* LV_HAVE_AVX */
  #endif /* INCLUDED_volk_32fc_convert_16ic_u_H */
-
diff --git a/kernels/volk/volk_16ic_deinterleave_16i_x2.h b/kernels/volk/volk_16ic_deinterleave_16i_x2.h

index 40d10b4f1ed0ff40aa1406bbe93332389d3648c8..9e784a66897542ce8a4efbfadd9fa1b6740c11dd 100644 (file)
--- a/kernels/volk/volk_16ic_deinterleave_16i_x2.h
+++ b/kernels/volk/volk_16ic_deinterleave_16i_x2.h
@@ -29,8 +29,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_16ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
- * \endcode
+ * void volk_16ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t*
+ * complexVector, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li complexVector: The complex input vector.
@@ -59,179 +59,241 @@
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_16ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer,
+                                                        int16_t* qBuffer,
+                                                        const lv_16sc_t* complexVector,
+                                                        unsigned int num_points)
  {
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  int16_t* qBufferPtr = qBuffer;
-
-  __m256i MoveMask = _mm256_set_epi8(15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0, 15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0);
-
-  __m256i iMove2, iMove1;
-  __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;
-
-  unsigned int sixteenthPoints = num_points / 16;
-
-  for(number = 0; number < sixteenthPoints; number++){
-    complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);  complexVectorPtr += 32;
-    complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);  complexVectorPtr += 32;
-
-    iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
-    iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);
-
-    iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1,0x08),_mm256_permute4x64_epi64(iMove2,0x80),0x30);
-    qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1,0x0d),_mm256_permute4x64_epi64(iMove2,0xd0),0x30);
-
-    _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
-    _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
-
-    iBufferPtr += 16;
-    qBufferPtr += 16;
-  }
-
-  number = sixteenthPoints * 16;
-  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *int16ComplexVectorPtr++;
-    *qBufferPtr++ = *int16ComplexVectorPtr++;
-  }
+    unsigned int number = 0;
+    const int8_t* complexVectorPtr = (int8_t*)complexVector;
+    int16_t* iBufferPtr = iBuffer;
+    int16_t* qBufferPtr = qBuffer;
+
+    __m256i MoveMask = _mm256_set_epi8(15,
+                                       14,
+                                       11,
+                                       10,
+                                       7,
+                                       6,
+                                       3,
+                                       2,
+                                       13,
+                                       12,
+                                       9,
+                                       8,
+                                       5,
+                                       4,
+                                       1,
+                                       0,
+                                       15,
+                                       14,
+                                       11,
+                                       10,
+                                       7,
+                                       6,
+                                       3,
+                                       2,
+                                       13,
+                                       12,
+                                       9,
+                                       8,
+                                       5,
+                                       4,
+                                       1,
+                                       0);
+
+    __m256i iMove2, iMove1;
+    __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;
+
+    unsigned int sixteenthPoints = num_points / 16;
+
+    for (number = 0; number < sixteenthPoints; number++) {
+        complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 32;
+        complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 32;
+
+        iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
+        iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);
+
+        iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08),
+                                               _mm256_permute4x64_epi64(iMove2, 0x80),
+                                               0x30);
+        qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d),
+                                               _mm256_permute4x64_epi64(iMove2, 0xd0),
+                                               0x30);
+
+        _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
+        _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
+
+        iBufferPtr += 16;
+        qBufferPtr += 16;
+    }
+
+    number = sixteenthPoints * 16;
+    int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = *int16ComplexVectorPtr++;
+        *qBufferPtr++ = *int16ComplexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
  #ifdef LV_HAVE_SSSE3
  #include <tmmintrin.h>
  
-static inline void
-volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t* iBuffer,
+                                                         int16_t* qBuffer,
+                                                         const lv_16sc_t* complexVector,
+                                                         unsigned int num_points)
  {
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  int16_t* qBufferPtr = qBuffer;
-
-  __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
-  __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-
-  __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2);
-  __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-
-  __m128i complexVal1, complexVal2, iOutputVal, qOutputVal;
-
-  unsigned int eighthPoints = num_points / 8;
-
-  for(number = 0; number < eighthPoints; number++){
-    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-
-    iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2));
-    qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2));
-
-    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
-    _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
-
-    iBufferPtr += 8;
-    qBufferPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *int16ComplexVectorPtr++;
-    *qBufferPtr++ = *int16ComplexVectorPtr++;
-  }
+    unsigned int number = 0;
+    const int8_t* complexVectorPtr = (int8_t*)complexVector;
+    int16_t* iBufferPtr = iBuffer;
+    int16_t* qBufferPtr = qBuffer;
+
+    __m128i iMoveMask1 = _mm_set_epi8(
+        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+    __m128i iMoveMask2 = _mm_set_epi8(
+        13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+    __m128i qMoveMask1 = _mm_set_epi8(
+        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2);
+    __m128i qMoveMask2 = _mm_set_epi8(
+        15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+    __m128i complexVal1, complexVal2, iOutputVal, qOutputVal;
+
+    unsigned int eighthPoints = num_points / 8;
+
+    for (number = 0; number < eighthPoints; number++) {
+        complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
+        complexVectorPtr += 16;
+        complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
+        complexVectorPtr += 16;
+
+        iOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, iMoveMask1),
+                                  _mm_shuffle_epi8(complexVal2, iMoveMask2));
+        qOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, qMoveMask1),
+                                  _mm_shuffle_epi8(complexVal2, qMoveMask2));
+
+        _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+        _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+
+        iBufferPtr += 8;
+        qBufferPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = *int16ComplexVectorPtr++;
+        *qBufferPtr++ = *int16ComplexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_SSSE3 */
  
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void
-volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer,
+                                                        int16_t* qBuffer,
+                                                        const lv_16sc_t* complexVector,
+                                                        unsigned int num_points)
  {
-  unsigned int number = 0;
-  const int16_t* complexVectorPtr = (int16_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  int16_t* qBufferPtr = qBuffer;
-  __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal;
-  __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
-  __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
+    unsigned int number = 0;
+    const int16_t* complexVectorPtr = (int16_t*)complexVector;
+    int16_t* iBufferPtr = iBuffer;
+    int16_t* qBufferPtr = qBuffer;
+    __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1,
+        qComplexVal2, iOutputVal, qOutputVal;
+    __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
+    __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
  
-  unsigned int eighthPoints = num_points / 8;
+    unsigned int eighthPoints = num_points / 8;
  
-  for(number = 0; number < eighthPoints; number++){
-    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
-    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
+    for (number = 0; number < eighthPoints; number++) {
+        complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
+        complexVectorPtr += 8;
+        complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
+        complexVectorPtr += 8;
  
-    iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+        iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
  
-    iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
+        iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
  
-    iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
+        iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
  
-    iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+        iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
  
-    iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0));
+        iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3, 1, 2, 0));
  
-    iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1));
+        iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
  
-    iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask));
+        iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask),
+                                  _mm_and_si128(iComplexVal2, highMask));
  
-    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+        _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
  
-    qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1));
+        qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2, 0, 3, 1));
  
-    qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1));
+        qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2, 0, 3, 1));
  
-    qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0));
+        qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
  
-    qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1));
+        qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2, 0, 3, 1));
  
-    qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
+        qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
  
-    qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
+        qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
  
-    qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask));
+        qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask),
+                                  _mm_and_si128(qComplexVal2, highMask));
  
-    _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+        _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
  
-    iBufferPtr += 8;
-    qBufferPtr += 8;
-  }
+        iBufferPtr += 8;
+        qBufferPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    *qBufferPtr++ = *complexVectorPtr++;
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = *complexVectorPtr++;
+        *qBufferPtr++ = *complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_SSE2 */
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_16ic_deinterleave_16i_x2_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_16i_x2_generic(int16_t* iBuffer,
+                                                         int16_t* qBuffer,
+                                                         const lv_16sc_t* complexVector,
+                                                         unsigned int num_points)
  {
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  int16_t* qBufferPtr = qBuffer;
-  unsigned int number;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    *qBufferPtr++ = *complexVectorPtr++;
-  }
+    const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+    int16_t* iBufferPtr = iBuffer;
+    int16_t* qBufferPtr = qBuffer;
+    unsigned int number;
+    for (number = 0; number < num_points; number++) {
+        *iBufferPtr++ = *complexVectorPtr++;
+        *qBufferPtr++ = *complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  #ifdef LV_HAVE_ORC
  
-extern void
-volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
-static inline void
-volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+extern void volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer,
+                                                     int16_t* qBuffer,
+                                                     const lv_16sc_t* complexVector,
+                                                     unsigned int num_points);
+static inline void volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer,
+                                                       int16_t* qBuffer,
+                                                       const lv_16sc_t* complexVector,
+                                                       unsigned int num_points)
  {
-  volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, qBuffer, complexVector, num_points);
+    volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, qBuffer, complexVector, num_points);
  }
  #endif /* LV_HAVE_ORC */
  
@@ -246,44 +308,83 @@ volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer, int16_t* qBuffer, const lv
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_16ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer,
+                                                        int16_t* qBuffer,
+                                                        const lv_16sc_t* complexVector,
+                                                        unsigned int num_points)
  {
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  int16_t* qBufferPtr = qBuffer;
-
-  __m256i MoveMask = _mm256_set_epi8(15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0, 15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0);
-
-  __m256i iMove2, iMove1;
-  __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;
-
-  unsigned int sixteenthPoints = num_points / 16;
-
-  for(number = 0; number < sixteenthPoints; number++){
-    complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);  complexVectorPtr += 32;
-    complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);  complexVectorPtr += 32;
-
-    iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
-    iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);
-
-    iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1,0x08),_mm256_permute4x64_epi64(iMove2,0x80),0x30);
-    qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1,0x0d),_mm256_permute4x64_epi64(iMove2,0xd0),0x30);
-
-    _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
-    _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal);
-
-    iBufferPtr += 16;
-    qBufferPtr += 16;
-  }
-
-  number = sixteenthPoints * 16;
-  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *int16ComplexVectorPtr++;
-    *qBufferPtr++ = *int16ComplexVectorPtr++;
-  }
+    unsigned int number = 0;
+    const int8_t* complexVectorPtr = (int8_t*)complexVector;
+    int16_t* iBufferPtr = iBuffer;
+    int16_t* qBufferPtr = qBuffer;
+
+    __m256i MoveMask = _mm256_set_epi8(15,
+                                       14,
+                                       11,
+                                       10,
+                                       7,
+                                       6,
+                                       3,
+                                       2,
+                                       13,
+                                       12,
+                                       9,
+                                       8,
+                                       5,
+                                       4,
+                                       1,
+                                       0,
+                                       15,
+                                       14,
+                                       11,
+                                       10,
+                                       7,
+                                       6,
+                                       3,
+                                       2,
+                                       13,
+                                       12,
+                                       9,
+                                       8,
+                                       5,
+                                       4,
+                                       1,
+                                       0);
+
+    __m256i iMove2, iMove1;
+    __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;
+
+    unsigned int sixteenthPoints = num_points / 16;
+
+    for (number = 0; number < sixteenthPoints; number++) {
+        complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 32;
+        complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 32;
+
+        iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
+        iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);
+
+        iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08),
+                                               _mm256_permute4x64_epi64(iMove2, 0x80),
+                                               0x30);
+        qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d),
+                                               _mm256_permute4x64_epi64(iMove2, 0xd0),
+                                               0x30);
+
+        _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
+        _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal);
+
+        iBufferPtr += 16;
+        qBufferPtr += 16;
+    }
+
+    number = sixteenthPoints * 16;
+    int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = *int16ComplexVectorPtr++;
+        *qBufferPtr++ = *int16ComplexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
diff --git a/kernels/volk/volk_16ic_deinterleave_real_16i.h b/kernels/volk/volk_16ic_deinterleave_real_16i.h

index c1de553141383e4a303303c415466369bcf68531..45fcd9925d95b7c72459b76d952769d1d8747411 100644 (file)
--- a/kernels/volk/volk_16ic_deinterleave_real_16i.h
+++ b/kernels/volk/volk_16ic_deinterleave_real_16i.h
@@ -25,12 +25,13 @@
   *
   * \b Overview
   *
- * Deinterleaves the complex 16 bit vector and returns the real (inphase) part of the signal.
+ * Deinterleaves the complex 16 bit vector and returns the real (inphase) part of the
+ * signal.
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_16ic_deinterleave_real_16i(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
- * \endcode
+ * void volk_16ic_deinterleave_real_16i(int16_t* iBuffer, const lv_16sc_t* complexVector,
+ * unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li complexVector: The complex input vector.
@@ -60,79 +61,149 @@
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_16ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer,
+                                                          const lv_16sc_t* complexVector,
+                                                          unsigned int num_points)
  {
-  unsigned int number = 0;
-  const int16_t* complexVectorPtr = (int16_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-
-  __m256i iMoveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
-  __m256i iMoveMask2 = _mm256_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-
-  __m256i complexVal1, complexVal2, iOutputVal;
-
-  unsigned int sixteenthPoints = num_points / 16;
-
-  for(number = 0; number < sixteenthPoints; number++){
-    complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);  complexVectorPtr += 16;
-    complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);  complexVectorPtr += 16;
-
-    complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
-    complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
-
-    iOutputVal = _mm256_or_si256(complexVal1, complexVal2);
-    iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
-
-    _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
-
-    iBufferPtr += 16;
-  }
-
-  number = sixteenthPoints * 16;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++;
-  }
+    unsigned int number = 0;
+    const int16_t* complexVectorPtr = (int16_t*)complexVector;
+    int16_t* iBufferPtr = iBuffer;
+
+    __m256i iMoveMask1 = _mm256_set_epi8(0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         13,
+                                         12,
+                                         9,
+                                         8,
+                                         5,
+                                         4,
+                                         1,
+                                         0,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         13,
+                                         12,
+                                         9,
+                                         8,
+                                         5,
+                                         4,
+                                         1,
+                                         0);
+    __m256i iMoveMask2 = _mm256_set_epi8(13,
+                                         12,
+                                         9,
+                                         8,
+                                         5,
+                                         4,
+                                         1,
+                                         0,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         13,
+                                         12,
+                                         9,
+                                         8,
+                                         5,
+                                         4,
+                                         1,
+                                         0,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80);
+
+    __m256i complexVal1, complexVal2, iOutputVal;
+
+    unsigned int sixteenthPoints = num_points / 16;
+
+    for (number = 0; number < sixteenthPoints; number++) {
+        complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 16;
+        complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 16;
+
+        complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
+        complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
+
+        iOutputVal = _mm256_or_si256(complexVal1, complexVal2);
+        iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
+
+        _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
+
+        iBufferPtr += 16;
+    }
+
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = *complexVectorPtr++;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
  #ifdef LV_HAVE_SSSE3
  #include <tmmintrin.h>
  
-static inline void
-volk_16ic_deinterleave_real_16i_a_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_real_16i_a_ssse3(int16_t* iBuffer,
+                                                           const lv_16sc_t* complexVector,
+                                                           unsigned int num_points)
  {
-  unsigned int number = 0;
-  const int16_t* complexVectorPtr = (int16_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
+    unsigned int number = 0;
+    const int16_t* complexVectorPtr = (int16_t*)complexVector;
+    int16_t* iBufferPtr = iBuffer;
  
-  __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
-  __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+    __m128i iMoveMask1 = _mm_set_epi8(
+        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+    __m128i iMoveMask2 = _mm_set_epi8(
+        13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
  
-  __m128i complexVal1, complexVal2, iOutputVal;
+    __m128i complexVal1, complexVal2, iOutputVal;
  
-  unsigned int eighthPoints = num_points / 8;
+    unsigned int eighthPoints = num_points / 8;
  
-  for(number = 0; number < eighthPoints; number++){
-    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
-    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
+    for (number = 0; number < eighthPoints; number++) {
+        complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
+        complexVectorPtr += 8;
+        complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
+        complexVectorPtr += 8;
  
-    complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
-    complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
+        complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
+        complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
  
-    iOutputVal = _mm_or_si128(complexVal1, complexVal2);
+        iOutputVal = _mm_or_si128(complexVal1, complexVal2);
  
-    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+        _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
  
-    iBufferPtr += 8;
-  }
+        iBufferPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++;
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = *complexVectorPtr++;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_SSSE3 */
  
@@ -140,61 +211,66 @@ volk_16ic_deinterleave_real_16i_a_ssse3(int16_t* iBuffer, const lv_16sc_t* compl
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void
-volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer,
+                                                          const lv_16sc_t* complexVector,
+                                                          unsigned int num_points)
  {
-  unsigned int number = 0;
-  const int16_t* complexVectorPtr = (int16_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  __m128i complexVal1, complexVal2, iOutputVal;
-  __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
-  __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
+    unsigned int number = 0;
+    const int16_t* complexVectorPtr = (int16_t*)complexVector;
+    int16_t* iBufferPtr = iBuffer;
+    __m128i complexVal1, complexVal2, iOutputVal;
+    __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
+    __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
  
-  unsigned int eighthPoints = num_points / 8;
+    unsigned int eighthPoints = num_points / 8;
  
-  for(number = 0; number < eighthPoints; number++){
-    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
-    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
+    for (number = 0; number < eighthPoints; number++) {
+        complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
+        complexVectorPtr += 8;
+        complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
+        complexVectorPtr += 8;
  
-    complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+        complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
  
-    complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+        complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
  
-    complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0));
+        complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
  
-    complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+        complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
  
-    complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+        complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
  
-    complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1));
+        complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2, 0, 3, 1));
  
-    iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask));
+        iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask),
+                                  _mm_and_si128(complexVal2, highMask));
  
-    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+        _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
  
-    iBufferPtr += 8;
-  }
+        iBufferPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++;
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = *complexVectorPtr++;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_SSE2 */
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_16ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_real_16i_generic(int16_t* iBuffer,
+                                                           const lv_16sc_t* complexVector,
+                                                           unsigned int num_points)
  {
-  unsigned int number = 0;
-  const int16_t* complexVectorPtr = (int16_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++;
-  }
+    unsigned int number = 0;
+    const int16_t* complexVectorPtr = (int16_t*)complexVector;
+    int16_t* iBufferPtr = iBuffer;
+    for (number = 0; number < num_points; number++) {
+        *iBufferPtr++ = *complexVectorPtr++;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -212,40 +288,105 @@ volk_16ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_16sc_t* compl
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_16ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer,
+                                                          const lv_16sc_t* complexVector,
+                                                          unsigned int num_points)
  {
-  unsigned int number = 0;
-  const int16_t* complexVectorPtr = (int16_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-
-  __m256i iMoveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
-  __m256i iMoveMask2 = _mm256_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-
-  __m256i complexVal1, complexVal2, iOutputVal;
-
-  unsigned int sixteenthPoints = num_points / 16;
-
-  for(number = 0; number < sixteenthPoints; number++){
-    complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);  complexVectorPtr += 16;
-    complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);  complexVectorPtr += 16;
-
-    complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
-    complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
-
-    iOutputVal = _mm256_or_si256(complexVal1, complexVal2);
-    iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
-
-    _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
-
-    iBufferPtr += 16;
-  }
-
-  number = sixteenthPoints * 16;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++;
-  }
+    unsigned int number = 0;
+    const int16_t* complexVectorPtr = (int16_t*)complexVector;
+    int16_t* iBufferPtr = iBuffer;
+
+    __m256i iMoveMask1 = _mm256_set_epi8(0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         13,
+                                         12,
+                                         9,
+                                         8,
+                                         5,
+                                         4,
+                                         1,
+                                         0,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         13,
+                                         12,
+                                         9,
+                                         8,
+                                         5,
+                                         4,
+                                         1,
+                                         0);
+    __m256i iMoveMask2 = _mm256_set_epi8(13,
+                                         12,
+                                         9,
+                                         8,
+                                         5,
+                                         4,
+                                         1,
+                                         0,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         13,
+                                         12,
+                                         9,
+                                         8,
+                                         5,
+                                         4,
+                                         1,
+                                         0,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80);
+
+    __m256i complexVal1, complexVal2, iOutputVal;
+
+    unsigned int sixteenthPoints = num_points / 16;
+
+    for (number = 0; number < sixteenthPoints; number++) {
+        complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 16;
+        complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 16;
+
+        complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
+        complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
+
+        iOutputVal = _mm256_or_si256(complexVal1, complexVal2);
+        iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
+
+        _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
+
+        iBufferPtr += 16;
+    }
+
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = *complexVectorPtr++;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
diff --git a/kernels/volk/volk_16ic_deinterleave_real_8i.h b/kernels/volk/volk_16ic_deinterleave_real_8i.h

index 1022688b847486ac2f6ae78c4cb3de951fa30ecc..3d8e4eabd2eaf8489f6bd25445c91c94e1869f82 100644 (file)
--- a/kernels/volk/volk_16ic_deinterleave_real_8i.h
+++ b/kernels/volk/volk_16ic_deinterleave_real_8i.h
@@ -30,8 +30,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_16ic_deinterleave_real_8i(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
- * \endcode
+ * void volk_16ic_deinterleave_real_8i(int8_t* iBuffer, const lv_16sc_t* complexVector,
+ * unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li complexVector: The complex input vector.
@@ -61,54 +61,121 @@
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_16ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer,
+                                                         const lv_16sc_t* complexVector,
+                                                         unsigned int num_points)
  {
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int8_t* iBufferPtr = iBuffer;
-  __m256i iMoveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
-  __m256i iMoveMask2 = _mm256_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-  __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
-
-  unsigned int thirtysecondPoints = num_points / 32;
-
-  for(number = 0; number < thirtysecondPoints; number++){
-    complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);  complexVectorPtr += 32;
-    complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);  complexVectorPtr += 32;
-
-    complexVal3 = _mm256_load_si256((__m256i*)complexVectorPtr);  complexVectorPtr += 32;
-    complexVal4 = _mm256_load_si256((__m256i*)complexVectorPtr);  complexVectorPtr += 32;
-
-    complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
-    complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
-
-    complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
-    complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
-
-    complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
-    complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
-
-    complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
-    complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
-
-    complexVal1 = _mm256_srai_epi16(complexVal1, 8);
-    complexVal3 = _mm256_srai_epi16(complexVal3, 8);
-
-    iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
-    iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
-
-    _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
-
-    iBufferPtr += 32;
-  }
-
-  number = thirtysecondPoints * 32;
-  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
-    int16ComplexVectorPtr++;
-  }
+    unsigned int number = 0;
+    const int8_t* complexVectorPtr = (int8_t*)complexVector;
+    int8_t* iBufferPtr = iBuffer;
+    __m256i iMoveMask1 = _mm256_set_epi8(0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         13,
+                                         12,
+                                         9,
+                                         8,
+                                         5,
+                                         4,
+                                         1,
+                                         0,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         13,
+                                         12,
+                                         9,
+                                         8,
+                                         5,
+                                         4,
+                                         1,
+                                         0);
+    __m256i iMoveMask2 = _mm256_set_epi8(13,
+                                         12,
+                                         9,
+                                         8,
+                                         5,
+                                         4,
+                                         1,
+                                         0,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         13,
+                                         12,
+                                         9,
+                                         8,
+                                         5,
+                                         4,
+                                         1,
+                                         0,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80);
+    __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
+
+    unsigned int thirtysecondPoints = num_points / 32;
+
+    for (number = 0; number < thirtysecondPoints; number++) {
+        complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 32;
+        complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 32;
+
+        complexVal3 = _mm256_load_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 32;
+        complexVal4 = _mm256_load_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 32;
+
+        complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
+        complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
+
+        complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
+        complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
+
+        complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
+        complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
+
+        complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
+        complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
+
+        complexVal1 = _mm256_srai_epi16(complexVal1, 8);
+        complexVal3 = _mm256_srai_epi16(complexVal3, 8);
+
+        iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
+        iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
+
+        _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
+
+        iBufferPtr += 32;
+    }
+
+    number = thirtysecondPoints * 32;
+    int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
+        int16ComplexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
@@ -116,105 +183,116 @@ volk_16ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, const lv_16sc_t* complexV
  #ifdef LV_HAVE_SSSE3
  #include <tmmintrin.h>
  
-static inline void
-volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer,
+                                                          const lv_16sc_t* complexVector,
+                                                          unsigned int num_points)
  {
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int8_t* iBufferPtr = iBuffer;
-  __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
-  __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-  __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
+    unsigned int number = 0;
+    const int8_t* complexVectorPtr = (int8_t*)complexVector;
+    int8_t* iBufferPtr = iBuffer;
+    __m128i iMoveMask1 = _mm_set_epi8(
+        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+    __m128i iMoveMask2 = _mm_set_epi8(
+        13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+    __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
  
-  unsigned int sixteenthPoints = num_points / 16;
+    unsigned int sixteenthPoints = num_points / 16;
  
-  for(number = 0; number < sixteenthPoints; number++){
-    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+    for (number = 0; number < sixteenthPoints; number++) {
+        complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
+        complexVectorPtr += 16;
+        complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
+        complexVectorPtr += 16;
  
-    complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-    complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+        complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr);
+        complexVectorPtr += 16;
+        complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr);
+        complexVectorPtr += 16;
  
-    complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
-    complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
+        complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
+        complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
  
-    complexVal1 = _mm_or_si128(complexVal1, complexVal2);
+        complexVal1 = _mm_or_si128(complexVal1, complexVal2);
  
-    complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1);
-    complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2);
+        complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1);
+        complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2);
  
-    complexVal3 = _mm_or_si128(complexVal3, complexVal4);
+        complexVal3 = _mm_or_si128(complexVal3, complexVal4);
  
  
-    complexVal1 = _mm_srai_epi16(complexVal1, 8);
-    complexVal3 = _mm_srai_epi16(complexVal3, 8);
+        complexVal1 = _mm_srai_epi16(complexVal1, 8);
+        complexVal3 = _mm_srai_epi16(complexVal3, 8);
  
-    iOutputVal = _mm_packs_epi16(complexVal1, complexVal3);
+        iOutputVal = _mm_packs_epi16(complexVal1, complexVal3);
  
-    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+        _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
  
-    iBufferPtr += 16;
-  }
+        iBufferPtr += 16;
+    }
  
-  number = sixteenthPoints * 16;
-  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
-    int16ComplexVectorPtr++;
-  }
+    number = sixteenthPoints * 16;
+    int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
+        int16ComplexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_SSSE3 */
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_16ic_deinterleave_real_8i_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_real_8i_generic(int8_t* iBuffer,
+                                                          const lv_16sc_t* complexVector,
+                                                          unsigned int num_points)
  {
-  unsigned int number = 0;
-  int16_t* complexVectorPtr = (int16_t*)complexVector;
-  int8_t* iBufferPtr = iBuffer;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
-    complexVectorPtr++;
-  }
+    unsigned int number = 0;
+    int16_t* complexVectorPtr = (int16_t*)complexVector;
+    int8_t* iBufferPtr = iBuffer;
+    for (number = 0; number < num_points; number++) {
+        *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void
-volk_16ic_deinterleave_real_8i_neon(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_real_8i_neon(int8_t* iBuffer,
+                                                       const lv_16sc_t* complexVector,
+                                                       unsigned int num_points)
  {
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  int8_t* iBufferPtr = iBuffer;
-  unsigned int eighth_points = num_points / 8;
-  unsigned int number;
-
-  int16x8x2_t complexInput;
-  int8x8_t realOutput;
-  for(number = 0; number < eighth_points; number++){
-    complexInput = vld2q_s16(complexVectorPtr);
-    realOutput = vshrn_n_s16(complexInput.val[0], 8);
-    vst1_s8(iBufferPtr, realOutput);
-    complexVectorPtr += 16;
-    iBufferPtr += 8;
-  }
-
-  for(number = eighth_points*8; number < num_points; number++){
-    *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
-    complexVectorPtr++;
-  }
+    const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+    int8_t* iBufferPtr = iBuffer;
+    unsigned int eighth_points = num_points / 8;
+    unsigned int number;
+
+    int16x8x2_t complexInput;
+    int8x8_t realOutput;
+    for (number = 0; number < eighth_points; number++) {
+        complexInput = vld2q_s16(complexVectorPtr);
+        realOutput = vshrn_n_s16(complexInput.val[0], 8);
+        vst1_s8(iBufferPtr, realOutput);
+        complexVectorPtr += 16;
+        iBufferPtr += 8;
+    }
+
+    for (number = eighth_points * 8; number < num_points; number++) {
+        *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
+        complexVectorPtr++;
+    }
  }
  #endif
  
  #ifdef LV_HAVE_ORC
  
-extern void
-volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
+extern void volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer,
+                                                      const lv_16sc_t* complexVector,
+                                                      unsigned int num_points);
  
-static inline void
-volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer,
+                                                        const lv_16sc_t* complexVector,
+                                                        unsigned int num_points)
  {
      volk_16ic_deinterleave_real_8i_a_orc_impl(iBuffer, complexVector, num_points);
  }
@@ -233,54 +311,121 @@ volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer, const lv_16sc_t* complexVe
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_16ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer,
+                                                         const lv_16sc_t* complexVector,
+                                                         unsigned int num_points)
  {
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int8_t* iBufferPtr = iBuffer;
-  __m256i iMoveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
-  __m256i iMoveMask2 = _mm256_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-  __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
-
-  unsigned int thirtysecondPoints = num_points / 32;
-
-  for(number = 0; number < thirtysecondPoints; number++){
-    complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);  complexVectorPtr += 32;
-    complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);  complexVectorPtr += 32;
-
-    complexVal3 = _mm256_loadu_si256((__m256i*)complexVectorPtr);  complexVectorPtr += 32;
-    complexVal4 = _mm256_loadu_si256((__m256i*)complexVectorPtr);  complexVectorPtr += 32;
-
-    complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
-    complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
-
-    complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
-    complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
-
-    complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
-    complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
-
-    complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
-    complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
-
-    complexVal1 = _mm256_srai_epi16(complexVal1, 8);
-    complexVal3 = _mm256_srai_epi16(complexVal3, 8);
-
-    iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
-    iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
-
-    _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
-
-    iBufferPtr += 32;
-  }
-
-  number = thirtysecondPoints * 32;
-  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
-    int16ComplexVectorPtr++;
-  }
+    unsigned int number = 0;
+    const int8_t* complexVectorPtr = (int8_t*)complexVector;
+    int8_t* iBufferPtr = iBuffer;
+    __m256i iMoveMask1 = _mm256_set_epi8(0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         13,
+                                         12,
+                                         9,
+                                         8,
+                                         5,
+                                         4,
+                                         1,
+                                         0,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         13,
+                                         12,
+                                         9,
+                                         8,
+                                         5,
+                                         4,
+                                         1,
+                                         0);
+    __m256i iMoveMask2 = _mm256_set_epi8(13,
+                                         12,
+                                         9,
+                                         8,
+                                         5,
+                                         4,
+                                         1,
+                                         0,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         13,
+                                         12,
+                                         9,
+                                         8,
+                                         5,
+                                         4,
+                                         1,
+                                         0,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80,
+                                         0x80);
+    __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
+
+    unsigned int thirtysecondPoints = num_points / 32;
+
+    for (number = 0; number < thirtysecondPoints; number++) {
+        complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 32;
+        complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 32;
+
+        complexVal3 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 32;
+        complexVal4 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 32;
+
+        complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
+        complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
+
+        complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
+        complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
+
+        complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
+        complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
+
+        complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
+        complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
+
+        complexVal1 = _mm256_srai_epi16(complexVal1, 8);
+        complexVal3 = _mm256_srai_epi16(complexVal3, 8);
+
+        iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
+        iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
+
+        _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
+
+        iBufferPtr += 32;
+    }
+
+    number = thirtysecondPoints * 32;
+    int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
+        int16ComplexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  #endif /* INCLUDED_volk_16ic_deinterleave_real_8i_u_H */
diff --git a/kernels/volk/volk_16ic_magnitude_16i.h b/kernels/volk/volk_16ic_magnitude_16i.h

index bbe72a8e3985a09a702f08a582813a1fdfcb105c..35b40cb56ca95abdbbf7b2cef2acde4f39d20f17 100644 (file)
--- a/kernels/volk/volk_16ic_magnitude_16i.h
+++ b/kernels/volk/volk_16ic_magnitude_16i.h
@@ -30,8 +30,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_16ic_magnitude_16i(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
- * \endcode
+ * void volk_16ic_magnitude_16i(int16_t* magnitudeVector, const lv_16sc_t* complexVector,
+ * unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li complexVector: The complex input vector.
@@ -54,242 +54,255 @@
  #ifndef INCLUDED_volk_16ic_magnitude_16i_a_H
  #define INCLUDED_volk_16ic_magnitude_16i_a_H
  
-#include <volk/volk_common.h>
  #include <inttypes.h>
-#include <stdio.h>
-#include <math.h>
  #include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_16ic_magnitude_16i_a_avx2(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_magnitude_16i_a_avx2(int16_t* magnitudeVector,
+                                                  const lv_16sc_t* complexVector,
+                                                  unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  int16_t* magnitudeVectorPtr = magnitudeVector;
-
-  __m256 vScalar = _mm256_set1_ps(SHRT_MAX);
-  __m256 invScalar = _mm256_set1_ps(1.0f/SHRT_MAX);
-  __m256i int1, int2;
-  __m128i short1, short2;
-  __m256 cplxValue1, cplxValue2, result;
-  __m256i idx = _mm256_set_epi32(0,0,0,0,5,1,4,0);
-
-  for(;number < eighthPoints; number++){
-
-    int1 = _mm256_load_si256((__m256i*)complexVectorPtr);
-    complexVectorPtr += 16;
-    short1 = _mm256_extracti128_si256(int1,0);
-    short2 = _mm256_extracti128_si256(int1,1);
-
-    int1 = _mm256_cvtepi16_epi32(short1);
-    int2 = _mm256_cvtepi16_epi32(short2);
-    cplxValue1 = _mm256_cvtepi32_ps(int1);
-    cplxValue2 = _mm256_cvtepi32_ps(int2);
-
-    cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
-    cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
-
-    cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
-    cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
-
-    result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
-
-    result = _mm256_sqrt_ps(result); // Square root the values
-
-    result = _mm256_mul_ps(result, vScalar); // Scale the results
-
-    int1 = _mm256_cvtps_epi32(result);
-    int1 = _mm256_packs_epi32(int1, int1);
-    int1 = _mm256_permutevar8x32_epi32(int1, idx); //permute to compensate for shuffling in hadd and packs
-    short1 = _mm256_extracti128_si256(int1, 0);
-    _mm_store_si128((__m128i*)magnitudeVectorPtr,short1);
-    magnitudeVectorPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  magnitudeVectorPtr = &magnitudeVector[number];
-  complexVectorPtr = (const int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
-    const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
-    const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
-    *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
-  }
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+    int16_t* magnitudeVectorPtr = magnitudeVector;
+
+    __m256 vScalar = _mm256_set1_ps(SHRT_MAX);
+    __m256 invScalar = _mm256_set1_ps(1.0f / SHRT_MAX);
+    __m256i int1, int2;
+    __m128i short1, short2;
+    __m256 cplxValue1, cplxValue2, result;
+    __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
+
+    for (; number < eighthPoints; number++) {
+
+        int1 = _mm256_load_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 16;
+        short1 = _mm256_extracti128_si256(int1, 0);
+        short2 = _mm256_extracti128_si256(int1, 1);
+
+        int1 = _mm256_cvtepi16_epi32(short1);
+        int2 = _mm256_cvtepi16_epi32(short2);
+        cplxValue1 = _mm256_cvtepi32_ps(int1);
+        cplxValue2 = _mm256_cvtepi32_ps(int2);
+
+        cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
+        cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
+
+        cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+        cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+        result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+        result = _mm256_sqrt_ps(result); // Square root the values
+
+        result = _mm256_mul_ps(result, vScalar); // Scale the results
+
+        int1 = _mm256_cvtps_epi32(result);
+        int1 = _mm256_packs_epi32(int1, int1);
+        int1 = _mm256_permutevar8x32_epi32(
+            int1, idx); // permute to compensate for shuffling in hadd and packs
+        short1 = _mm256_extracti128_si256(int1, 0);
+        _mm_store_si128((__m128i*)magnitudeVectorPtr, short1);
+        magnitudeVectorPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    magnitudeVectorPtr = &magnitudeVector[number];
+    complexVectorPtr = (const int16_t*)&complexVector[number];
+    for (; number < num_points; number++) {
+        const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
+        const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
+        const float val1Result =
+            sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
+        *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
  #ifdef LV_HAVE_SSE3
  #include <pmmintrin.h>
  
-static inline void
-volk_16ic_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_magnitude_16i_a_sse3(int16_t* magnitudeVector,
+                                                  const lv_16sc_t* complexVector,
+                                                  unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  int16_t* magnitudeVectorPtr = magnitudeVector;
+    const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+    int16_t* magnitudeVectorPtr = magnitudeVector;
  
-  __m128 vScalar = _mm_set_ps1(SHRT_MAX);
-  __m128 invScalar = _mm_set_ps1(1.0f/SHRT_MAX);
+    __m128 vScalar = _mm_set_ps1(SHRT_MAX);
+    __m128 invScalar = _mm_set_ps1(1.0f / SHRT_MAX);
  
-  __m128 cplxValue1, cplxValue2, result;
+    __m128 cplxValue1, cplxValue2, result;
  
-  __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
-  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+    __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
+    __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
  
-  for(;number < quarterPoints; number++){
+    for (; number < quarterPoints; number++) {
  
-    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
-    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
-    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
-    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+        inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+        inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+        inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+        inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
  
-    inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
-    inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
-    inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
-    inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+        inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+        inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+        inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+        inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
  
-    cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
-    cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+        cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+        cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
  
-    complexVectorPtr += 8;
+        complexVectorPtr += 8;
  
-    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
-    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+        cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+        cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
  
-    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
-    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+        cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+        cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
  
-    result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+        result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
  
-    result = _mm_sqrt_ps(result); // Square root the values
+        result = _mm_sqrt_ps(result); // Square root the values
  
-    result = _mm_mul_ps(result, vScalar); // Scale the results
+        result = _mm_mul_ps(result, vScalar); // Scale the results
  
-    _mm_store_ps(outputFloatBuffer, result);
-    *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
-    *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
-    *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
-    *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
-  }
+        _mm_store_ps(outputFloatBuffer, result);
+        *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
+        *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
+        *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
+        *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
+    }
  
-  number = quarterPoints * 4;
-  magnitudeVectorPtr = &magnitudeVector[number];
-  complexVectorPtr = (const int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
-    const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
-    const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
-    *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
-  }
+    number = quarterPoints * 4;
+    magnitudeVectorPtr = &magnitudeVector[number];
+    complexVectorPtr = (const int16_t*)&complexVector[number];
+    for (; number < num_points; number++) {
+        const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
+        const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
+        const float val1Result =
+            sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
+        *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
+    }
  }
  #endif /* LV_HAVE_SSE3 */
  
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_16ic_magnitude_16i_a_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_magnitude_16i_a_sse(int16_t* magnitudeVector,
+                                                 const lv_16sc_t* complexVector,
+                                                 unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  int16_t* magnitudeVectorPtr = magnitudeVector;
+    const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+    int16_t* magnitudeVectorPtr = magnitudeVector;
  
-  __m128 vScalar = _mm_set_ps1(SHRT_MAX);
-  __m128 invScalar = _mm_set_ps1(1.0f/SHRT_MAX);
+    __m128 vScalar = _mm_set_ps1(SHRT_MAX);
+    __m128 invScalar = _mm_set_ps1(1.0f / SHRT_MAX);
  
-  __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+    __m128 cplxValue1, cplxValue2, iValue, qValue, result;
  
-  __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[4];
-  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+    __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[4];
+    __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
  
-  for(;number < quarterPoints; number++){
+    for (; number < quarterPoints; number++) {
  
-    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
-    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
-    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
-    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+        inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+        inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+        inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+        inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
  
-    cplxValue1 = _mm_load_ps(inputFloatBuffer);
-    complexVectorPtr += 4;
+        cplxValue1 = _mm_load_ps(inputFloatBuffer);
+        complexVectorPtr += 4;
  
-    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
-    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
-    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
-    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+        inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+        inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+        inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+        inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
  
-    cplxValue2 = _mm_load_ps(inputFloatBuffer);
-    complexVectorPtr += 4;
+        cplxValue2 = _mm_load_ps(inputFloatBuffer);
+        complexVectorPtr += 4;
  
-    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
-    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+        cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+        cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
  
-    // Arrange in i1i2i3i4 format
-    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-    // Arrange in q1q2q3q4 format
-    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+        // Arrange in i1i2i3i4 format
+        iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+        // Arrange in q1q2q3q4 format
+        qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
  
-    iValue = _mm_mul_ps(iValue, iValue); // Square the I values
-    qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+        iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+        qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
  
-    result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+        result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
  
-    result = _mm_sqrt_ps(result); // Square root the values
+        result = _mm_sqrt_ps(result); // Square root the values
  
-    result = _mm_mul_ps(result, vScalar); // Scale the results
+        result = _mm_mul_ps(result, vScalar); // Scale the results
  
-    _mm_store_ps(outputFloatBuffer, result);
-    *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
-    *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
-    *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
-    *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
-  }
+        _mm_store_ps(outputFloatBuffer, result);
+        *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
+        *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
+        *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
+        *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
+    }
  
-  number = quarterPoints * 4;
-  magnitudeVectorPtr = &magnitudeVector[number];
-  complexVectorPtr = (const int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
-    const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
-    const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
-    *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
-  }
+    number = quarterPoints * 4;
+    magnitudeVectorPtr = &magnitudeVector[number];
+    complexVectorPtr = (const int16_t*)&complexVector[number];
+    for (; number < num_points; number++) {
+        const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
+        const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
+        const float val1Result =
+            sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
+        *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
+    }
  }
  #endif /* LV_HAVE_SSE */
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_16ic_magnitude_16i_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_magnitude_16i_generic(int16_t* magnitudeVector,
+                                                   const lv_16sc_t* complexVector,
+                                                   unsigned int num_points)
  {
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  int16_t* magnitudeVectorPtr = magnitudeVector;
-  unsigned int number = 0;
-  const float scalar = SHRT_MAX;
-  for(number = 0; number < num_points; number++){
-    float real = ((float)(*complexVectorPtr++)) / scalar;
-    float imag = ((float)(*complexVectorPtr++)) / scalar;
-    *magnitudeVectorPtr++ = (int16_t)rintf(sqrtf((real*real) + (imag*imag)) * scalar);
-  }
+    const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+    int16_t* magnitudeVectorPtr = magnitudeVector;
+    unsigned int number = 0;
+    const float scalar = SHRT_MAX;
+    for (number = 0; number < num_points; number++) {
+        float real = ((float)(*complexVectorPtr++)) / scalar;
+        float imag = ((float)(*complexVectorPtr++)) / scalar;
+        *magnitudeVectorPtr++ =
+            (int16_t)rintf(sqrtf((real * real) + (imag * imag)) * scalar);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  #ifdef LV_HAVE_ORC_DISABLED
-extern void
-volk_16ic_magnitude_16i_a_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points);
-
-static inline void
-volk_16ic_magnitude_16i_u_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
+extern void volk_16ic_magnitude_16i_a_orc_impl(int16_t* magnitudeVector,
+                                               const lv_16sc_t* complexVector,
+                                               float scalar,
+                                               unsigned int num_points);
+
+static inline void volk_16ic_magnitude_16i_u_orc(int16_t* magnitudeVector,
+                                                 const lv_16sc_t* complexVector,
+                                                 unsigned int num_points)
  {
-    volk_16ic_magnitude_16i_a_orc_impl(magnitudeVector, complexVector, SHRT_MAX, num_points);
+    volk_16ic_magnitude_16i_a_orc_impl(
+        magnitudeVector, complexVector, SHRT_MAX, num_points);
  }
  #endif /* LV_HAVE_ORC */
  
@@ -300,71 +313,74 @@ volk_16ic_magnitude_16i_u_orc(int16_t* magnitudeVector, const lv_16sc_t* complex
  #ifndef INCLUDED_volk_16ic_magnitude_16i_u_H
  #define INCLUDED_volk_16ic_magnitude_16i_u_H
  
-#include <volk/volk_common.h>
  #include <inttypes.h>
-#include <stdio.h>
  #include <math.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_16ic_magnitude_16i_u_avx2(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_magnitude_16i_u_avx2(int16_t* magnitudeVector,
+                                                  const lv_16sc_t* complexVector,
+                                                  unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  int16_t* magnitudeVectorPtr = magnitudeVector;
-
-  __m256 vScalar = _mm256_set1_ps(SHRT_MAX);
-  __m256 invScalar = _mm256_set1_ps(1.0f/SHRT_MAX);
-  __m256i int1, int2;
-  __m128i short1, short2;
-  __m256 cplxValue1, cplxValue2, result;
-  __m256i idx = _mm256_set_epi32(0,0,0,0,5,1,4,0);
-
-  for(;number < eighthPoints; number++){
-
-    int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
-    complexVectorPtr += 16;
-    short1 = _mm256_extracti128_si256(int1,0);
-    short2 = _mm256_extracti128_si256(int1,1);
-
-    int1 = _mm256_cvtepi16_epi32(short1);
-    int2 = _mm256_cvtepi16_epi32(short2);
-    cplxValue1 = _mm256_cvtepi32_ps(int1);
-    cplxValue2 = _mm256_cvtepi32_ps(int2);
-
-    cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
-    cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
-
-    cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
-    cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
-
-    result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
-
-    result = _mm256_sqrt_ps(result); // Square root the values
-
-    result = _mm256_mul_ps(result, vScalar); // Scale the results
-
-    int1 = _mm256_cvtps_epi32(result);
-    int1 = _mm256_packs_epi32(int1, int1);
-    int1 = _mm256_permutevar8x32_epi32(int1, idx); //permute to compensate for shuffling in hadd and packs
-    short1 = _mm256_extracti128_si256(int1, 0);
-    _mm_storeu_si128((__m128i*)magnitudeVectorPtr,short1);
-    magnitudeVectorPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  magnitudeVectorPtr = &magnitudeVector[number];
-  complexVectorPtr = (const int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
-    const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
-    const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
-    *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
-  }
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+    int16_t* magnitudeVectorPtr = magnitudeVector;
+
+    __m256 vScalar = _mm256_set1_ps(SHRT_MAX);
+    __m256 invScalar = _mm256_set1_ps(1.0f / SHRT_MAX);
+    __m256i int1, int2;
+    __m128i short1, short2;
+    __m256 cplxValue1, cplxValue2, result;
+    __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
+
+    for (; number < eighthPoints; number++) {
+
+        int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 16;
+        short1 = _mm256_extracti128_si256(int1, 0);
+        short2 = _mm256_extracti128_si256(int1, 1);
+
+        int1 = _mm256_cvtepi16_epi32(short1);
+        int2 = _mm256_cvtepi16_epi32(short2);
+        cplxValue1 = _mm256_cvtepi32_ps(int1);
+        cplxValue2 = _mm256_cvtepi32_ps(int2);
+
+        cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
+        cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
+
+        cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+        cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+        result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+        result = _mm256_sqrt_ps(result); // Square root the values
+
+        result = _mm256_mul_ps(result, vScalar); // Scale the results
+
+        int1 = _mm256_cvtps_epi32(result);
+        int1 = _mm256_packs_epi32(int1, int1);
+        int1 = _mm256_permutevar8x32_epi32(
+            int1, idx); // permute to compensate for shuffling in hadd and packs
+        short1 = _mm256_extracti128_si256(int1, 0);
+        _mm_storeu_si128((__m128i*)magnitudeVectorPtr, short1);
+        magnitudeVectorPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    magnitudeVectorPtr = &magnitudeVector[number];
+    complexVectorPtr = (const int16_t*)&complexVector[number];
+    for (; number < num_points; number++) {
+        const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
+        const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
+        const float val1Result =
+            sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
+        *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
@@ -372,24 +388,25 @@ volk_16ic_magnitude_16i_u_avx2(int16_t* magnitudeVector, const lv_16sc_t* comple
  #include <arm_neon.h>
  #include <volk/volk_neon_intrinsics.h>
  
-static inline void
-volk_16ic_magnitude_16i_neonv7(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_magnitude_16i_neonv7(int16_t* magnitudeVector,
+                                                  const lv_16sc_t* complexVector,
+                                                  unsigned int num_points)
  {
      unsigned int number = 0;
      unsigned int quarter_points = num_points / 4;
-    
+
      const float scalar = SHRT_MAX;
      const float inv_scalar = 1.0f / scalar;
-    
+
      int16_t* magnitudeVectorPtr = magnitudeVector;
      const lv_16sc_t* complexVectorPtr = complexVector;
-    
+
      float32x4_t mag_vec;
      float32x4x2_t c_vec;
-    
-    for(number = 0; number < quarter_points; number++) {
+
+    for (number = 0; number < quarter_points; number++) {
          const int16x4x2_t c16_vec = vld2_s16((int16_t*)complexVectorPtr);
-        __VOLK_PREFETCH(complexVectorPtr+4);
+        __VOLK_PREFETCH(complexVectorPtr + 4);
          c_vec.val[0] = vcvtq_f32_s32(vmovl_s16(c16_vec.val[0]));
          c_vec.val[1] = vcvtq_f32_s32(vmovl_s16(c16_vec.val[1]));
          // Scale to close to 0-1
@@ -406,15 +423,16 @@ volk_16ic_magnitude_16i_neonv7(int16_t* magnitudeVector, const lv_16sc_t* comple
          const int16x4_t mag16_vec = vmovn_s32(vcvtq_s32_f32(mag_vec));
          vst1_s16(magnitudeVectorPtr, mag16_vec);
          // Advance pointers
-        magnitudeVectorPtr+=4;
-        complexVectorPtr+=4;
+        magnitudeVectorPtr += 4;
+        complexVectorPtr += 4;
      }
-    
+
      // Deal with the rest
-    for(number = quarter_points * 4; number < num_points; number++) {
+    for (number = quarter_points * 4; number < num_points; number++) {
          const float real = lv_creal(*complexVectorPtr) * inv_scalar;
          const float imag = lv_cimag(*complexVectorPtr) * inv_scalar;
-        *magnitudeVectorPtr = (int16_t)rintf(sqrtf((real*real) + (imag*imag)) * scalar);
+        *magnitudeVectorPtr =
+            (int16_t)rintf(sqrtf((real * real) + (imag * imag)) * scalar);
          complexVectorPtr++;
          magnitudeVectorPtr++;
      }
diff --git a/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h b/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h

index 50d9341435a121097c2a513e64d1e85df90566e4..7425ec6a2b167d7b59adc46be71c588dcd8e0aa5 100644 (file)
--- a/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h
+++ b/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h
@@ -30,8 +30,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- *  void volk_16ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
- * \endcode
+ *  void volk_16ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const
+ * lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ \endcode
   *
   * \b Inputs
   * \li complexVector: The complex input vector of 16-bit shorts.
@@ -56,197 +56,214 @@
  #ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H
  #define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H
  
-#include <volk/volk_common.h>
  #include <inttypes.h>
  #include <stdio.h>
+#include <volk/volk_common.h>
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline
-void volk_16ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
-                                              const float scalar, unsigned int num_points)
+static inline void
+volk_16ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer,
+                                          float* qBuffer,
+                                          const lv_16sc_t* complexVector,
+                                          const float scalar,
+                                          unsigned int num_points)
  {
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
-
-  uint64_t number = 0;
-  const uint64_t eighthPoints = num_points / 8;
-  __m256 cplxValue1, cplxValue2, iValue, qValue;
-  __m256i cplxValueA, cplxValueB;
-  __m128i cplxValue128;
-
-  __m256 invScalar = _mm256_set1_ps(1.0/scalar);
-  int16_t* complexVectorPtr = (int16_t*)complexVector;
-  __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
-
-  for(;number < eighthPoints; number++){
-
-    cplxValueA = _mm256_load_si256((__m256i*) complexVectorPtr);
-    complexVectorPtr += 16;
-
-    //cvt
-    cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0);
-    cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
-    cplxValue1 = _mm256_cvtepi32_ps(cplxValueB);
-    cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1);
-    cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
-    cplxValue2 = _mm256_cvtepi32_ps(cplxValueB);
-
-    cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
-    cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
-
-    // Arrange in i1i2i3i4 format
-    iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-    iValue = _mm256_permutevar8x32_ps(iValue,idx);
-    // Arrange in q1q2q3q4 format
-    qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-    qValue = _mm256_permutevar8x32_ps(qValue,idx);
-
-    _mm256_store_ps(iBufferPtr, iValue);
-    _mm256_store_ps(qBufferPtr, qValue);
-
-    iBufferPtr += 8;
-    qBufferPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  complexVectorPtr = (int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
-    *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
-  }
+    float* iBufferPtr = iBuffer;
+    float* qBufferPtr = qBuffer;
+
+    uint64_t number = 0;
+    const uint64_t eighthPoints = num_points / 8;
+    __m256 cplxValue1, cplxValue2, iValue, qValue;
+    __m256i cplxValueA, cplxValueB;
+    __m128i cplxValue128;
+
+    __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
+    int16_t* complexVectorPtr = (int16_t*)complexVector;
+    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+    for (; number < eighthPoints; number++) {
+
+        cplxValueA = _mm256_load_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 16;
+
+        // cvt
+        cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0);
+        cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
+        cplxValue1 = _mm256_cvtepi32_ps(cplxValueB);
+        cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1);
+        cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
+        cplxValue2 = _mm256_cvtepi32_ps(cplxValueB);
+
+        cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
+        cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
+
+        // Arrange in i1i2i3i4 format
+        iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+        iValue = _mm256_permutevar8x32_ps(iValue, idx);
+        // Arrange in q1q2q3q4 format
+        qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
+        qValue = _mm256_permutevar8x32_ps(qValue, idx);
+
+        _mm256_store_ps(iBufferPtr, iValue);
+        _mm256_store_ps(qBufferPtr, qValue);
+
+        iBufferPtr += 8;
+        qBufferPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    complexVectorPtr = (int16_t*)&complexVector[number];
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+        *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline
-void volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
-                                              const float scalar, unsigned int num_points)
+static inline void
+volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer,
+                                         float* qBuffer,
+                                         const lv_16sc_t* complexVector,
+                                         const float scalar,
+                                         unsigned int num_points)
  {
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
+    float* iBufferPtr = iBuffer;
+    float* qBufferPtr = qBuffer;
  
-  uint64_t number = 0;
-  const uint64_t quarterPoints = num_points / 4;
-  __m128 cplxValue1, cplxValue2, iValue, qValue;
+    uint64_t number = 0;
+    const uint64_t quarterPoints = num_points / 4;
+    __m128 cplxValue1, cplxValue2, iValue, qValue;
  
-  __m128 invScalar = _mm_set_ps1(1.0/scalar);
-  int16_t* complexVectorPtr = (int16_t*)complexVector;
+    __m128 invScalar = _mm_set_ps1(1.0 / scalar);
+    int16_t* complexVectorPtr = (int16_t*)complexVector;
  
-  __VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
+    __VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
  
-  for(;number < quarterPoints; number++){
+    for (; number < quarterPoints; number++) {
  
-    floatBuffer[0] = (float)(complexVectorPtr[0]);
-    floatBuffer[1] = (float)(complexVectorPtr[1]);
-    floatBuffer[2] = (float)(complexVectorPtr[2]);
-    floatBuffer[3] = (float)(complexVectorPtr[3]);
+        floatBuffer[0] = (float)(complexVectorPtr[0]);
+        floatBuffer[1] = (float)(complexVectorPtr[1]);
+        floatBuffer[2] = (float)(complexVectorPtr[2]);
+        floatBuffer[3] = (float)(complexVectorPtr[3]);
  
-    floatBuffer[4] = (float)(complexVectorPtr[4]);
-    floatBuffer[5] = (float)(complexVectorPtr[5]);
-    floatBuffer[6] = (float)(complexVectorPtr[6]);
-    floatBuffer[7] = (float)(complexVectorPtr[7]);
+        floatBuffer[4] = (float)(complexVectorPtr[4]);
+        floatBuffer[5] = (float)(complexVectorPtr[5]);
+        floatBuffer[6] = (float)(complexVectorPtr[6]);
+        floatBuffer[7] = (float)(complexVectorPtr[7]);
  
-    cplxValue1 = _mm_load_ps(&floatBuffer[0]);
-    cplxValue2 = _mm_load_ps(&floatBuffer[4]);
+        cplxValue1 = _mm_load_ps(&floatBuffer[0]);
+        cplxValue2 = _mm_load_ps(&floatBuffer[4]);
  
-    complexVectorPtr += 8;
+        complexVectorPtr += 8;
  
-    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
-    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+        cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+        cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
  
-    // Arrange in i1i2i3i4 format
-    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-    // Arrange in q1q2q3q4 format
-    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+        // Arrange in i1i2i3i4 format
+        iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+        // Arrange in q1q2q3q4 format
+        qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
  
-    _mm_store_ps(iBufferPtr, iValue);
-    _mm_store_ps(qBufferPtr, qValue);
+        _mm_store_ps(iBufferPtr, iValue);
+        _mm_store_ps(qBufferPtr, qValue);
  
-    iBufferPtr += 4;
-    qBufferPtr += 4;
-  }
+        iBufferPtr += 4;
+        qBufferPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  complexVectorPtr = (int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
-    *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
-  }
+    number = quarterPoints * 4;
+    complexVectorPtr = (int16_t*)&complexVector[number];
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+        *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+    }
  }
  #endif /* LV_HAVE_SSE */
  
  #ifdef LV_HAVE_GENERIC
  
  static inline void
-volk_16ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
-                                           const float scalar, unsigned int num_points)
+volk_16ic_s32f_deinterleave_32f_x2_generic(float* iBuffer,
+                                           float* qBuffer,
+                                           const lv_16sc_t* complexVector,
+                                           const float scalar,
+                                           unsigned int num_points)
  {
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
-  unsigned int number;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
-    *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
-  }
+    const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+    float* iBufferPtr = iBuffer;
+    float* qBufferPtr = qBuffer;
+    unsigned int number;
+    for (number = 0; number < num_points; number++) {
+        *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+        *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
-static inline void
-volk_16ic_s32f_deinterleave_32f_x2_neon(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
-                                        const float scalar, unsigned int num_points)
+static inline void volk_16ic_s32f_deinterleave_32f_x2_neon(float* iBuffer,
+                                                           float* qBuffer,
+                                                           const lv_16sc_t* complexVector,
+                                                           const float scalar,
+                                                           unsigned int num_points)
  {
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
-  unsigned int eighth_points = num_points / 4;
-  unsigned int number;
-  float iScalar = 1.f/scalar;
-  float32x4_t invScalar;
-  invScalar = vld1q_dup_f32(&iScalar);
-
-  int16x4x2_t complexInput_s16;
-  int32x4x2_t complexInput_s32;
-  float32x4x2_t complexFloat;
-
-  for(number = 0; number < eighth_points; number++){
-    complexInput_s16 = vld2_s16(complexVectorPtr);
-    complexInput_s32.val[0] = vmovl_s16(complexInput_s16.val[0]);
-    complexInput_s32.val[1] = vmovl_s16(complexInput_s16.val[1]);
-    complexFloat.val[0] = vcvtq_f32_s32(complexInput_s32.val[0]);
-    complexFloat.val[1] = vcvtq_f32_s32(complexInput_s32.val[1]);
-    complexFloat.val[0] = vmulq_f32(complexFloat.val[0], invScalar);
-    complexFloat.val[1] = vmulq_f32(complexFloat.val[1], invScalar);
-    vst1q_f32(iBufferPtr, complexFloat.val[0]);
-    vst1q_f32(qBufferPtr, complexFloat.val[1]);
-    complexVectorPtr += 8;
-    iBufferPtr += 4;
-    qBufferPtr += 4;
-  }
-
-  for(number = eighth_points*4; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
-    *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
-  }
+    const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+    float* iBufferPtr = iBuffer;
+    float* qBufferPtr = qBuffer;
+    unsigned int eighth_points = num_points / 4;
+    unsigned int number;
+    float iScalar = 1.f / scalar;
+    float32x4_t invScalar;
+    invScalar = vld1q_dup_f32(&iScalar);
+
+    int16x4x2_t complexInput_s16;
+    int32x4x2_t complexInput_s32;
+    float32x4x2_t complexFloat;
+
+    for (number = 0; number < eighth_points; number++) {
+        complexInput_s16 = vld2_s16(complexVectorPtr);
+        complexInput_s32.val[0] = vmovl_s16(complexInput_s16.val[0]);
+        complexInput_s32.val[1] = vmovl_s16(complexInput_s16.val[1]);
+        complexFloat.val[0] = vcvtq_f32_s32(complexInput_s32.val[0]);
+        complexFloat.val[1] = vcvtq_f32_s32(complexInput_s32.val[1]);
+        complexFloat.val[0] = vmulq_f32(complexFloat.val[0], invScalar);
+        complexFloat.val[1] = vmulq_f32(complexFloat.val[1], invScalar);
+        vst1q_f32(iBufferPtr, complexFloat.val[0]);
+        vst1q_f32(qBufferPtr, complexFloat.val[1]);
+        complexVectorPtr += 8;
+        iBufferPtr += 4;
+        qBufferPtr += 4;
+    }
+
+    for (number = eighth_points * 4; number < num_points; number++) {
+        *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+        *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  #ifdef LV_HAVE_ORC
-extern void
-volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
-                                              const float scalar, unsigned int num_points);
+extern void volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(float* iBuffer,
+                                                          float* qBuffer,
+                                                          const lv_16sc_t* complexVector,
+                                                          const float scalar,
+                                                          unsigned int num_points);
  
  static inline void
-volk_16ic_s32f_deinterleave_32f_x2_u_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
-                                         const float scalar, unsigned int num_points)
+volk_16ic_s32f_deinterleave_32f_x2_u_orc(float* iBuffer,
+                                         float* qBuffer,
+                                         const lv_16sc_t* complexVector,
+                                         const float scalar,
+                                         unsigned int num_points)
  {
-  volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points);
+    volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(
+        iBuffer, qBuffer, complexVector, scalar, num_points);
  }
  #endif /* LV_HAVE_ORC */
  
@@ -257,66 +274,69 @@ volk_16ic_s32f_deinterleave_32f_x2_u_orc(float* iBuffer, float* qBuffer, const l
  #ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H
  #define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H
  
-#include <volk/volk_common.h>
  #include <inttypes.h>
  #include <stdio.h>
+#include <volk/volk_common.h>
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline
-void volk_16ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
-                                              const float scalar, unsigned int num_points)
+static inline void
+volk_16ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer,
+                                          float* qBuffer,
+                                          const lv_16sc_t* complexVector,
+                                          const float scalar,
+                                          unsigned int num_points)
  {
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
-
-  uint64_t number = 0;
-  const uint64_t eighthPoints = num_points / 8;
-  __m256 cplxValue1, cplxValue2, iValue, qValue;
-  __m256i cplxValueA, cplxValueB;
-  __m128i cplxValue128;
-
-  __m256 invScalar = _mm256_set1_ps(1.0/scalar);
-  int16_t* complexVectorPtr = (int16_t*)complexVector;
-  __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
-
-  for(;number < eighthPoints; number++){
-
-    cplxValueA = _mm256_loadu_si256((__m256i*) complexVectorPtr);
-    complexVectorPtr += 16;
-
-    //cvt
-    cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0);
-    cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
-    cplxValue1 = _mm256_cvtepi32_ps(cplxValueB);
-    cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1);
-    cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
-    cplxValue2 = _mm256_cvtepi32_ps(cplxValueB);
-
-    cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
-    cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
-
-    // Arrange in i1i2i3i4 format
-    iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-    iValue = _mm256_permutevar8x32_ps(iValue,idx);
-    // Arrange in q1q2q3q4 format
-    qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-    qValue = _mm256_permutevar8x32_ps(qValue,idx);
-
-    _mm256_storeu_ps(iBufferPtr, iValue);
-    _mm256_storeu_ps(qBufferPtr, qValue);
-
-    iBufferPtr += 8;
-    qBufferPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  complexVectorPtr = (int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
-    *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
-  }
+    float* iBufferPtr = iBuffer;
+    float* qBufferPtr = qBuffer;
+
+    uint64_t number = 0;
+    const uint64_t eighthPoints = num_points / 8;
+    __m256 cplxValue1, cplxValue2, iValue, qValue;
+    __m256i cplxValueA, cplxValueB;
+    __m128i cplxValue128;
+
+    __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
+    int16_t* complexVectorPtr = (int16_t*)complexVector;
+    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+    for (; number < eighthPoints; number++) {
+
+        cplxValueA = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 16;
+
+        // cvt
+        cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0);
+        cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
+        cplxValue1 = _mm256_cvtepi32_ps(cplxValueB);
+        cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1);
+        cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
+        cplxValue2 = _mm256_cvtepi32_ps(cplxValueB);
+
+        cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
+        cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
+
+        // Arrange in i1i2i3i4 format
+        iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+        iValue = _mm256_permutevar8x32_ps(iValue, idx);
+        // Arrange in q1q2q3q4 format
+        qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
+        qValue = _mm256_permutevar8x32_ps(qValue, idx);
+
+        _mm256_storeu_ps(iBufferPtr, iValue);
+        _mm256_storeu_ps(qBufferPtr, qValue);
+
+        iBufferPtr += 8;
+        qBufferPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    complexVectorPtr = (int16_t*)&complexVector[number];
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+        *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
diff --git a/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h b/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h

index 713e6a1f7600eae10fc25b0ad31a74ecc412eaeb..8b72d1ceb4f9e236b305a3e637cd05606f6ac5d8 100644 (file)
--- a/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h
+++ b/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h
@@ -31,8 +31,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- *  void volk_16ic_s32f_deinterleave_real_32f(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
- * \endcode
+ *  void volk_16ic_s32f_deinterleave_real_32f(float* iBuffer, const lv_16sc_t*
+ * complexVector, const float scalar, unsigned int num_points){ \endcode
   *
   * \b Inputs
   * \li complexVector: The complex input vector of 16-bit shorts.
@@ -56,55 +56,88 @@
  #ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H
  #define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H
  
-#include <volk/volk_common.h>
  #include <inttypes.h>
  #include <stdio.h>
+#include <volk/volk_common.h>
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
  static inline void
-volk_16ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer, const lv_16sc_t* complexVector,
-                                              const float scalar, unsigned int num_points)
+volk_16ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer,
+                                            const lv_16sc_t* complexVector,
+                                            const float scalar,
+                                            unsigned int num_points)
  {
-  float* iBufferPtr = iBuffer;
-
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-
-  __m256 iFloatValue;
-
-  const float iScalar= 1.0 / scalar;
-  __m256 invScalar = _mm256_set1_ps(iScalar);
-  __m256i complexVal, iIntVal;
-  __m128i complexVal128;
-  int8_t* complexVectorPtr = (int8_t*)complexVector;
-
-  __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
-
-  for(;number < eighthPoints; number++){
-    complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
-    complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
-    complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
-    complexVal128 = _mm256_extracti128_si256(complexVal, 0);
-
-    iIntVal = _mm256_cvtepi16_epi32(complexVal128);
-    iFloatValue = _mm256_cvtepi32_ps(iIntVal);
-
-    iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
-
-    _mm256_store_ps(iBufferPtr, iFloatValue);
-
-    iBufferPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
-    sixteenTComplexVectorPtr++;
-  }
-
+    float* iBufferPtr = iBuffer;
+
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    __m256 iFloatValue;
+
+    const float iScalar = 1.0 / scalar;
+    __m256 invScalar = _mm256_set1_ps(iScalar);
+    __m256i complexVal, iIntVal;
+    __m128i complexVal128;
+    int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+    __m256i moveMask = _mm256_set_epi8(0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       13,
+                                       12,
+                                       9,
+                                       8,
+                                       5,
+                                       4,
+                                       1,
+                                       0,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       13,
+                                       12,
+                                       9,
+                                       8,
+                                       5,
+                                       4,
+                                       1,
+                                       0);
+
+    for (; number < eighthPoints; number++) {
+        complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 32;
+        complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
+        complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
+        complexVal128 = _mm256_extracti128_si256(complexVal, 0);
+
+        iIntVal = _mm256_cvtepi16_epi32(complexVal128);
+        iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+
+        iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+
+        _mm256_store_ps(iBufferPtr, iFloatValue);
+
+        iBufferPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
+        sixteenTComplexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
@@ -112,44 +145,47 @@ volk_16ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer, const lv_16sc_t* com
  #include <smmintrin.h>
  
  static inline void
-volk_16ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_16sc_t* complexVector,
-                                              const float scalar, unsigned int num_points)
+volk_16ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer,
+                                              const lv_16sc_t* complexVector,
+                                              const float scalar,
+                                              unsigned int num_points)
  {
-  float* iBufferPtr = iBuffer;
-
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    float* iBufferPtr = iBuffer;
  
-  __m128 iFloatValue;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  const float iScalar= 1.0 / scalar;
-  __m128 invScalar = _mm_set_ps1(iScalar);
-  __m128i complexVal, iIntVal;
-  int8_t* complexVectorPtr = (int8_t*)complexVector;
+    __m128 iFloatValue;
  
-  __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+    const float iScalar = 1.0 / scalar;
+    __m128 invScalar = _mm_set_ps1(iScalar);
+    __m128i complexVal, iIntVal;
+    int8_t* complexVectorPtr = (int8_t*)complexVector;
  
-  for(;number < quarterPoints; number++){
-    complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
-    complexVal = _mm_shuffle_epi8(complexVal, moveMask);
+    __m128i moveMask = _mm_set_epi8(
+        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
  
-    iIntVal = _mm_cvtepi16_epi32(complexVal);
-    iFloatValue = _mm_cvtepi32_ps(iIntVal);
+    for (; number < quarterPoints; number++) {
+        complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
+        complexVectorPtr += 16;
+        complexVal = _mm_shuffle_epi8(complexVal, moveMask);
  
-    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+        iIntVal = _mm_cvtepi16_epi32(complexVal);
+        iFloatValue = _mm_cvtepi32_ps(iIntVal);
  
-    _mm_store_ps(iBufferPtr, iFloatValue);
+        iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
  
-    iBufferPtr += 4;
-  }
+        _mm_store_ps(iBufferPtr, iFloatValue);
  
-  number = quarterPoints * 4;
-  int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
-    sixteenTComplexVectorPtr++;
-  }
+        iBufferPtr += 4;
+    }
  
+    number = quarterPoints * 4;
+    int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
+        sixteenTComplexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_SSE4_1 */
  
@@ -157,59 +193,66 @@ volk_16ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_16sc_t* c
  #include <xmmintrin.h>
  
  static inline void
-volk_16ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, const lv_16sc_t* complexVector,
-                                           const float scalar, unsigned int num_points)
+volk_16ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer,
+                                           const lv_16sc_t* complexVector,
+                                           const float scalar,
+                                           unsigned int num_points)
  {
-  float* iBufferPtr = iBuffer;
+    float* iBufferPtr = iBuffer;
  
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-  __m128 iValue;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+    __m128 iValue;
  
-  const float iScalar = 1.0/scalar;
-  __m128 invScalar = _mm_set_ps1(iScalar);
-  int16_t* complexVectorPtr = (int16_t*)complexVector;
+    const float iScalar = 1.0 / scalar;
+    __m128 invScalar = _mm_set_ps1(iScalar);
+    int16_t* complexVectorPtr = (int16_t*)complexVector;
  
-  __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+    __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
  
-  for(;number < quarterPoints; number++){
-    floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2;
-    floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2;
-    floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2;
-    floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+    for (; number < quarterPoints; number++) {
+        floatBuffer[0] = (float)(*complexVectorPtr);
+        complexVectorPtr += 2;
+        floatBuffer[1] = (float)(*complexVectorPtr);
+        complexVectorPtr += 2;
+        floatBuffer[2] = (float)(*complexVectorPtr);
+        complexVectorPtr += 2;
+        floatBuffer[3] = (float)(*complexVectorPtr);
+        complexVectorPtr += 2;
  
-    iValue = _mm_load_ps(floatBuffer);
+        iValue = _mm_load_ps(floatBuffer);
  
-    iValue = _mm_mul_ps(iValue, invScalar);
+        iValue = _mm_mul_ps(iValue, invScalar);
  
-    _mm_store_ps(iBufferPtr, iValue);
+        _mm_store_ps(iBufferPtr, iValue);
  
-    iBufferPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  complexVectorPtr = (int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar;
-    complexVectorPtr++;
-  }
+        iBufferPtr += 4;
+    }
  
+    number = quarterPoints * 4;
+    complexVectorPtr = (int16_t*)&complexVector[number];
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_SSE */
  
  #ifdef LV_HAVE_GENERIC
  static inline void
-volk_16ic_s32f_deinterleave_real_32f_generic(float* iBuffer, const lv_16sc_t* complexVector,
-                                             const float scalar, unsigned int num_points)
+volk_16ic_s32f_deinterleave_real_32f_generic(float* iBuffer,
+                                             const lv_16sc_t* complexVector,
+                                             const float scalar,
+                                             unsigned int num_points)
  {
-  unsigned int number = 0;
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  float* iBufferPtr = iBuffer;
-  const float invScalar = 1.0 / scalar;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
-    complexVectorPtr++;
-  }
+    unsigned int number = 0;
+    const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+    float* iBufferPtr = iBuffer;
+    const float invScalar = 1.0 / scalar;
+    for (number = 0; number < num_points; number++) {
+        *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -219,55 +262,88 @@ volk_16ic_s32f_deinterleave_real_32f_generic(float* iBuffer, const lv_16sc_t* co
  #ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_u_H
  #define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_u_H
  
-#include <volk/volk_common.h>
  #include <inttypes.h>
  #include <stdio.h>
+#include <volk/volk_common.h>
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
  static inline void
-volk_16ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer, const lv_16sc_t* complexVector,
-                                              const float scalar, unsigned int num_points)
+volk_16ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer,
+                                            const lv_16sc_t* complexVector,
+                                            const float scalar,
+                                            unsigned int num_points)
  {
-  float* iBufferPtr = iBuffer;
-
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-
-  __m256 iFloatValue;
-
-  const float iScalar= 1.0 / scalar;
-  __m256 invScalar = _mm256_set1_ps(iScalar);
-  __m256i complexVal, iIntVal;
-  __m128i complexVal128;
-  int8_t* complexVectorPtr = (int8_t*)complexVector;
-
-  __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
-
-  for(;number < eighthPoints; number++){
-    complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
-    complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
-    complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
-    complexVal128 = _mm256_extracti128_si256(complexVal, 0);
-
-    iIntVal = _mm256_cvtepi16_epi32(complexVal128);
-    iFloatValue = _mm256_cvtepi32_ps(iIntVal);
-
-    iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
-
-    _mm256_storeu_ps(iBufferPtr, iFloatValue);
-
-    iBufferPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
-    sixteenTComplexVectorPtr++;
-  }
-
+    float* iBufferPtr = iBuffer;
+
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    __m256 iFloatValue;
+
+    const float iScalar = 1.0 / scalar;
+    __m256 invScalar = _mm256_set1_ps(iScalar);
+    __m256i complexVal, iIntVal;
+    __m128i complexVal128;
+    int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+    __m256i moveMask = _mm256_set_epi8(0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       13,
+                                       12,
+                                       9,
+                                       8,
+                                       5,
+                                       4,
+                                       1,
+                                       0,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       13,
+                                       12,
+                                       9,
+                                       8,
+                                       5,
+                                       4,
+                                       1,
+                                       0);
+
+    for (; number < eighthPoints; number++) {
+        complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 32;
+        complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
+        complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
+        complexVal128 = _mm256_extracti128_si256(complexVal, 0);
+
+        iIntVal = _mm256_cvtepi16_epi32(complexVal128);
+        iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+
+        iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+
+        _mm256_storeu_ps(iBufferPtr, iFloatValue);
+
+        iBufferPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
+        sixteenTComplexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
diff --git a/kernels/volk/volk_16ic_s32f_magnitude_32f.h b/kernels/volk/volk_16ic_s32f_magnitude_32f.h

index bb0459cd430d37e4c86e940817c064bc37bb748d..c3e36055e26eafa15f59d1fcb9cdee065e72e5b7 100644 (file)
--- a/kernels/volk/volk_16ic_s32f_magnitude_32f.h
+++ b/kernels/volk/volk_16ic_s32f_magnitude_32f.h
@@ -30,8 +30,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_16ic_s32f_magnitude_32f(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points)
- * \endcode
+ * void volk_16ic_s32f_magnitude_32f(float* magnitudeVector, const lv_16sc_t*
+ * complexVector, const float scalar, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li complexVector: The complex input vector of complex 16-bit shorts.
@@ -55,67 +55,68 @@
  #ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a_H
  #define INCLUDED_volk_16ic_s32f_magnitude_32f_a_H
  
-#include <volk/volk_common.h>
  #include <inttypes.h>
-#include <stdio.h>
  #include <math.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_16ic_s32f_magnitude_32f_a_avx2(float* magnitudeVector, const lv_16sc_t* complexVector,
-                                    const float scalar, unsigned int num_points)
+static inline void volk_16ic_s32f_magnitude_32f_a_avx2(float* magnitudeVector,
+                                                       const lv_16sc_t* complexVector,
+                                                       const float scalar,
+                                                       unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
  
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
+    __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
  
-  __m256 invScalar = _mm256_set1_ps(1.0/scalar);
+    __m256 cplxValue1, cplxValue2, result;
+    __m256i int1, int2;
+    __m128i short1, short2;
+    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
  
-  __m256 cplxValue1, cplxValue2, result;
-  __m256i int1, int2;
-  __m128i short1, short2;
-  __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
+    for (; number < eighthPoints; number++) {
  
-  for(;number < eighthPoints; number++){
-    
-    int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
-    complexVectorPtr += 16;
-    short1 = _mm256_extracti128_si256(int1,0);
-    short2 = _mm256_extracti128_si256(int1,1);
+        int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 16;
+        short1 = _mm256_extracti128_si256(int1, 0);
+        short2 = _mm256_extracti128_si256(int1, 1);
  
-    int1 = _mm256_cvtepi16_epi32(short1);
-    int2 = _mm256_cvtepi16_epi32(short2);
-    cplxValue1 = _mm256_cvtepi32_ps(int1);
-    cplxValue2 = _mm256_cvtepi32_ps(int2);
+        int1 = _mm256_cvtepi16_epi32(short1);
+        int2 = _mm256_cvtepi16_epi32(short2);
+        cplxValue1 = _mm256_cvtepi32_ps(int1);
+        cplxValue2 = _mm256_cvtepi32_ps(int2);
  
-    cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
-    cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
+        cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
+        cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
  
-    cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
-    cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+        cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+        cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
  
-    result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
-    result = _mm256_permutevar8x32_ps(result, idx);
+        result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+        result = _mm256_permutevar8x32_ps(result, idx);
  
-    result = _mm256_sqrt_ps(result); // Square root the values
+        result = _mm256_sqrt_ps(result); // Square root the values
  
-    _mm256_store_ps(magnitudeVectorPtr, result);
+        _mm256_store_ps(magnitudeVectorPtr, result);
  
-    magnitudeVectorPtr += 8;
-  }
+        magnitudeVectorPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  magnitudeVectorPtr = &magnitudeVector[number];
-  complexVectorPtr = (const int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    float val1Real = (float)(*complexVectorPtr++) / scalar;
-    float val1Imag = (float)(*complexVectorPtr++) / scalar;
-    *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
-  }
+    number = eighthPoints * 8;
+    magnitudeVectorPtr = &magnitudeVector[number];
+    complexVectorPtr = (const int16_t*)&complexVector[number];
+    for (; number < num_points; number++) {
+        float val1Real = (float)(*complexVectorPtr++) / scalar;
+        float val1Imag = (float)(*complexVectorPtr++) / scalar;
+        *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
@@ -123,127 +124,129 @@ volk_16ic_s32f_magnitude_32f_a_avx2(float* magnitudeVector, const lv_16sc_t* com
  #ifdef LV_HAVE_SSE3
  #include <pmmintrin.h>
  
-static inline void
-volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector, const lv_16sc_t* complexVector,
-                                    const float scalar, unsigned int num_points)
+static inline void volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector,
+                                                       const lv_16sc_t* complexVector,
+                                                       const float scalar,
+                                                       unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
+    const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
  
-  __m128 invScalar = _mm_set_ps1(1.0/scalar);
+    __m128 invScalar = _mm_set_ps1(1.0 / scalar);
  
-  __m128 cplxValue1, cplxValue2, result;
+    __m128 cplxValue1, cplxValue2, result;
  
-  __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
+    __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
  
-  for(;number < quarterPoints; number++){
+    for (; number < quarterPoints; number++) {
  
-    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
-    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
-    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
-    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+        inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+        inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+        inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+        inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
  
-    inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
-    inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
-    inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
-    inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+        inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+        inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+        inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+        inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
  
-    cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
-    cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+        cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+        cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
  
-    complexVectorPtr += 8;
+        complexVectorPtr += 8;
  
-    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
-    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+        cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+        cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
  
-    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
-    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+        cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+        cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
  
-    result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+        result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
  
-    result = _mm_sqrt_ps(result); // Square root the values
+        result = _mm_sqrt_ps(result); // Square root the values
  
-    _mm_store_ps(magnitudeVectorPtr, result);
+        _mm_store_ps(magnitudeVectorPtr, result);
  
-    magnitudeVectorPtr += 4;
-  }
+        magnitudeVectorPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  magnitudeVectorPtr = &magnitudeVector[number];
-  complexVectorPtr = (const int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    float val1Real = (float)(*complexVectorPtr++) / scalar;
-    float val1Imag = (float)(*complexVectorPtr++) / scalar;
-    *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
-  }
+    number = quarterPoints * 4;
+    magnitudeVectorPtr = &magnitudeVector[number];
+    complexVectorPtr = (const int16_t*)&complexVector[number];
+    for (; number < num_points; number++) {
+        float val1Real = (float)(*complexVectorPtr++) / scalar;
+        float val1Imag = (float)(*complexVectorPtr++) / scalar;
+        *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+    }
  }
  #endif /* LV_HAVE_SSE3 */
  
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, const lv_16sc_t* complexVector,
-                                   const float scalar, unsigned int num_points)
+static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector,
+                                                      const lv_16sc_t* complexVector,
+                                                      const float scalar,
+                                                      unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
+    const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
  
-  const float iScalar = 1.0 / scalar;
-  __m128 invScalar = _mm_set_ps1(iScalar);
+    const float iScalar = 1.0 / scalar;
+    __m128 invScalar = _mm_set_ps1(iScalar);
  
-  __m128 cplxValue1, cplxValue2, result, re, im;
+    __m128 cplxValue1, cplxValue2, result, re, im;
  
-  __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
+    __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
  
-  for(;number < quarterPoints; number++){
-    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
-    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
-    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
-    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+    for (; number < quarterPoints; number++) {
+        inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+        inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+        inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+        inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
  
-    inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
-    inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
-    inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
-    inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+        inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+        inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+        inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+        inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
  
-    cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
-    cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+        cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+        cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
  
-    re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88);
-    im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd);
+        re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88);
+        im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd);
  
-    complexVectorPtr += 8;
+        complexVectorPtr += 8;
  
-    cplxValue1 = _mm_mul_ps(re, invScalar);
-    cplxValue2 = _mm_mul_ps(im, invScalar);
+        cplxValue1 = _mm_mul_ps(re, invScalar);
+        cplxValue2 = _mm_mul_ps(im, invScalar);
  
-    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
-    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+        cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+        cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
  
-    result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+        result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
  
-    result = _mm_sqrt_ps(result); // Square root the values
+        result = _mm_sqrt_ps(result); // Square root the values
  
-    _mm_store_ps(magnitudeVectorPtr, result);
+        _mm_store_ps(magnitudeVectorPtr, result);
  
-    magnitudeVectorPtr += 4;
-  }
+        magnitudeVectorPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  magnitudeVectorPtr = &magnitudeVector[number];
-  complexVectorPtr = (const int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    float val1Real = (float)(*complexVectorPtr++) * iScalar;
-    float val1Imag = (float)(*complexVectorPtr++) * iScalar;
-    *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
-  }
+    number = quarterPoints * 4;
+    magnitudeVectorPtr = &magnitudeVector[number];
+    complexVectorPtr = (const int16_t*)&complexVector[number];
+    for (; number < num_points; number++) {
+        float val1Real = (float)(*complexVectorPtr++) * iScalar;
+        float val1Imag = (float)(*complexVectorPtr++) * iScalar;
+        *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+    }
  }
  
  
@@ -251,33 +254,37 @@ volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, const lv_16sc_t* comp
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_16ic_s32f_magnitude_32f_generic(float* magnitudeVector, const lv_16sc_t* complexVector,
-                                     const float scalar, unsigned int num_points)
+static inline void volk_16ic_s32f_magnitude_32f_generic(float* magnitudeVector,
+                                                        const lv_16sc_t* complexVector,
+                                                        const float scalar,
+                                                        unsigned int num_points)
  {
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
-  unsigned int number = 0;
-  const float invScalar = 1.0 / scalar;
-  for(number = 0; number < num_points; number++){
-    float real = ( (float) (*complexVectorPtr++)) * invScalar;
-    float imag = ( (float) (*complexVectorPtr++)) * invScalar;
-    *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
-  }
+    const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+    unsigned int number = 0;
+    const float invScalar = 1.0 / scalar;
+    for (number = 0; number < num_points; number++) {
+        float real = ((float)(*complexVectorPtr++)) * invScalar;
+        float imag = ((float)(*complexVectorPtr++)) * invScalar;
+        *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  #ifdef LV_HAVE_ORC_DISABLED
  
-extern void
-volk_16ic_s32f_magnitude_32f_a_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector,
-                                        const float scalar, unsigned int num_points);
+extern void volk_16ic_s32f_magnitude_32f_a_orc_impl(float* magnitudeVector,
+                                                    const lv_16sc_t* complexVector,
+                                                    const float scalar,
+                                                    unsigned int num_points);
  
-static inline void
-volk_16ic_s32f_magnitude_32f_u_orc(float* magnitudeVector, const lv_16sc_t* complexVector,
-                                   const float scalar, unsigned int num_points)
+static inline void volk_16ic_s32f_magnitude_32f_u_orc(float* magnitudeVector,
+                                                      const lv_16sc_t* complexVector,
+                                                      const float scalar,
+                                                      unsigned int num_points)
  {
-  volk_16ic_s32f_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, scalar, num_points);
+    volk_16ic_s32f_magnitude_32f_a_orc_impl(
+        magnitudeVector, complexVector, scalar, num_points);
  }
  #endif /* LV_HAVE_ORC */
  
@@ -287,69 +294,69 @@ volk_16ic_s32f_magnitude_32f_u_orc(float* magnitudeVector, const lv_16sc_t* comp
  #ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_u_H
  #define INCLUDED_volk_16ic_s32f_magnitude_32f_u_H
  
-#include <volk/volk_common.h>
  #include <inttypes.h>
-#include <stdio.h>
  #include <math.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_16ic_s32f_magnitude_32f_u_avx2(float* magnitudeVector, const lv_16sc_t* complexVector,
-                                    const float scalar, unsigned int num_points)
+static inline void volk_16ic_s32f_magnitude_32f_u_avx2(float* magnitudeVector,
+                                                       const lv_16sc_t* complexVector,
+                                                       const float scalar,
+                                                       unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
  
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
+    __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
  
-  __m256 invScalar = _mm256_set1_ps(1.0/scalar);
+    __m256 cplxValue1, cplxValue2, result;
+    __m256i int1, int2;
+    __m128i short1, short2;
+    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
  
-  __m256 cplxValue1, cplxValue2, result;
-  __m256i int1, int2;
-  __m128i short1, short2;
-  __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
+    for (; number < eighthPoints; number++) {
  
-  for(;number < eighthPoints; number++){
-    
-    int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
-    complexVectorPtr += 16;
-    short1 = _mm256_extracti128_si256(int1,0);
-    short2 = _mm256_extracti128_si256(int1,1);
+        int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 16;
+        short1 = _mm256_extracti128_si256(int1, 0);
+        short2 = _mm256_extracti128_si256(int1, 1);
  
-    int1 = _mm256_cvtepi16_epi32(short1);
-    int2 = _mm256_cvtepi16_epi32(short2);
-    cplxValue1 = _mm256_cvtepi32_ps(int1);
-    cplxValue2 = _mm256_cvtepi32_ps(int2);
+        int1 = _mm256_cvtepi16_epi32(short1);
+        int2 = _mm256_cvtepi16_epi32(short2);
+        cplxValue1 = _mm256_cvtepi32_ps(int1);
+        cplxValue2 = _mm256_cvtepi32_ps(int2);
  
-    cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
-    cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
+        cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
+        cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
  
-    cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
-    cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+        cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+        cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
  
-    result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
-    result = _mm256_permutevar8x32_ps(result, idx);
+        result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+        result = _mm256_permutevar8x32_ps(result, idx);
  
-    result = _mm256_sqrt_ps(result); // Square root the values
+        result = _mm256_sqrt_ps(result); // Square root the values
  
-    _mm256_storeu_ps(magnitudeVectorPtr, result);
+        _mm256_storeu_ps(magnitudeVectorPtr, result);
  
-    magnitudeVectorPtr += 8;
-  }
+        magnitudeVectorPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  magnitudeVectorPtr = &magnitudeVector[number];
-  complexVectorPtr = (const int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    float val1Real = (float)(*complexVectorPtr++) / scalar;
-    float val1Imag = (float)(*complexVectorPtr++) / scalar;
-    *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
-  }
+    number = eighthPoints * 8;
+    magnitudeVectorPtr = &magnitudeVector[number];
+    complexVectorPtr = (const int16_t*)&complexVector[number];
+    for (; number < num_points; number++) {
+        float val1Real = (float)(*complexVectorPtr++) / scalar;
+        float val1Imag = (float)(*complexVectorPtr++) / scalar;
+        *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
  #endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_u_H */
-
diff --git a/kernels/volk/volk_16ic_x2_dot_prod_16ic.h b/kernels/volk/volk_16ic_x2_dot_prod_16ic.h

index ae10cff9758f5bab9058c195d3eea11c094f1d5c..a1a0e8c957c1fac5878affe0a98e8909dd3becfe 100644 (file)
--- a/kernels/volk/volk_16ic_x2_dot_prod_16ic.h
+++ b/kernels/volk/volk_16ic_x2_dot_prod_16ic.h
@@ -25,18 +25,20 @@
   *
   * \b Overview
   *
- * Multiplies two input complex vectors (16-bit integer each component) and accumulates them,
- * storing the result. Results are saturated so never go beyond the limits of the data type.
+ * Multiplies two input complex vectors (16-bit integer each component) and accumulates
+ * them, storing the result. Results are saturated so never go beyond the limits of the
+ * data type.
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_16ic_x2_dot_prod_16ic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points);
- * \endcode
+ * void volk_16ic_x2_dot_prod_16ic(lv_16sc_t* result, const lv_16sc_t* in_a, const
+ * lv_16sc_t* in_b, unsigned int num_points); \endcode
   *
   * \b Inputs
   * \li in_a:          One of the vectors to be multiplied and accumulated.
   * \li in_b:          The other vector to be multiplied and accumulated.
- * \li num_points:    Number of complex values to be multiplied together, accumulated and stored into \p result
+ * \li num_points:    Number of complex values to be multiplied together, accumulated and
+ * stored into \p result
   *
   * \b Outputs
   * \li result:        Value of the accumulated result.
@@ -46,22 +48,25 @@
  #ifndef INCLUDED_volk_16ic_x2_dot_prod_16ic_H
  #define INCLUDED_volk_16ic_x2_dot_prod_16ic_H
  
+#include <volk/saturation_arithmetic.h>
  #include <volk/volk_common.h>
  #include <volk/volk_complex.h>
-#include <volk/saturation_arithmetic.h>
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void volk_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+static inline void volk_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result,
+                                                      const lv_16sc_t* in_a,
+                                                      const lv_16sc_t* in_b,
+                                                      unsigned int num_points)
  {
      result[0] = lv_cmake((int16_t)0, (int16_t)0);
      unsigned int n;
-    for (n = 0; n < num_points; n++)
-        {
-            lv_16sc_t tmp = in_a[n] * in_b[n];
-            result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp) ));
-        }
+    for (n = 0; n < num_points; n++) {
+        lv_16sc_t tmp = in_a[n] * in_b[n];
+        result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)),
+                             sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp)));
+    }
  }
  
  #endif /*LV_HAVE_GENERIC*/
@@ -70,7 +75,10 @@ static inline void volk_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result, const l
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out,
+                                                     const lv_16sc_t* in_a,
+                                                     const lv_16sc_t* in_b,
+                                                     unsigned int num_points)
  {
      lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
  
@@ -81,62 +89,67 @@ static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, const lv_16
      const lv_16sc_t* _in_b = in_b;
      lv_16sc_t* _out = out;
  
-    if (sse_iters > 0)
-        {
-            __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc;
-            __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
+    if (sse_iters > 0) {
+        __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
+            realcacc, imagcacc;
+        __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
  
-            realcacc = _mm_setzero_si128();
-            imagcacc = _mm_setzero_si128();
+        realcacc = _mm_setzero_si128();
+        imagcacc = _mm_setzero_si128();
  
-            mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
-            mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
+        mask_imag = _mm_set_epi8(
+            0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
+        mask_real = _mm_set_epi8(
+            0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
  
-            for(number = 0; number < sse_iters; number++)
-                {
-                    // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
-                    a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
-                    __VOLK_PREFETCH(_in_a + 8);
-                    b = _mm_load_si128((__m128i*)_in_b);
-                    __VOLK_PREFETCH(_in_b + 8);
-                    c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
+        for (number = 0; number < sse_iters; number++) {
+            // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
+            a = _mm_load_si128(
+                (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+            __VOLK_PREFETCH(_in_a + 8);
+            b = _mm_load_si128((__m128i*)_in_b);
+            __VOLK_PREFETCH(_in_b + 8);
+            c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
  
-                    c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
-                    real = _mm_subs_epi16(c, c_sr);
+            c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in
+                                         // zeros, and store the results in dst.
+            real = _mm_subs_epi16(c, c_sr);
  
-                    b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
-                    a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
+            b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
+            a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
  
-                    imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
-                    imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+            imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+            imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
  
-                    imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic!
+            imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic!
  
-                    realcacc = _mm_adds_epi16(realcacc, real);
-                    imagcacc = _mm_adds_epi16(imagcacc, imag);
+            realcacc = _mm_adds_epi16(realcacc, real);
+            imagcacc = _mm_adds_epi16(imagcacc, imag);
  
-                    _in_a += 4;
-                    _in_b += 4;
-                }
+            _in_a += 4;
+            _in_b += 4;
+        }
  
-            realcacc = _mm_and_si128(realcacc, mask_real);
-            imagcacc = _mm_and_si128(imagcacc, mask_imag);
+        realcacc = _mm_and_si128(realcacc, mask_real);
+        imagcacc = _mm_and_si128(imagcacc, mask_imag);
  
-            a = _mm_or_si128(realcacc, imagcacc);
+        a = _mm_or_si128(realcacc, imagcacc);
  
-            _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector
+        _mm_store_si128((__m128i*)dotProductVector,
+                        a); // Store the results back into the dot product vector
  
-            for (number = 0; number < 4; ++number)
-                {
-                    dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
-                }
+        for (number = 0; number < 4; ++number) {
+            dotProduct = lv_cmake(
+                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
+                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
          }
+    }
  
-    for (number = 0; number < (num_points % 4); ++number)
-        {
-            lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
-            dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
-        }
+    for (number = 0; number < (num_points % 4); ++number) {
+        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
+        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
+                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
+    }
  
      *_out = dotProduct;
  }
@@ -147,7 +160,10 @@ static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, const lv_16
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out,
+                                                     const lv_16sc_t* in_a,
+                                                     const lv_16sc_t* in_b,
+                                                     unsigned int num_points)
  {
      lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
  
@@ -158,62 +174,67 @@ static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, const lv_16
      lv_16sc_t* _out = out;
      unsigned int number;
  
-    if (sse_iters > 0)
-        {
-            __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result;
-            __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
+    if (sse_iters > 0) {
+        __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
+            realcacc, imagcacc, result;
+        __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
  
-            realcacc = _mm_setzero_si128();
-            imagcacc = _mm_setzero_si128();
+        realcacc = _mm_setzero_si128();
+        imagcacc = _mm_setzero_si128();
  
-            mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
-            mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
+        mask_imag = _mm_set_epi8(
+            0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
+        mask_real = _mm_set_epi8(
+            0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
  
-            for(number = 0; number < sse_iters; number++)
-                {
-                    // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
-                    a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
-                    __VOLK_PREFETCH(_in_a + 8);
-                    b = _mm_loadu_si128((__m128i*)_in_b);
-                    __VOLK_PREFETCH(_in_b + 8);
-                    c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
+        for (number = 0; number < sse_iters; number++) {
+            // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
+            a = _mm_loadu_si128(
+                (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+            __VOLK_PREFETCH(_in_a + 8);
+            b = _mm_loadu_si128((__m128i*)_in_b);
+            __VOLK_PREFETCH(_in_b + 8);
+            c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
  
-                    c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
-                    real = _mm_subs_epi16(c, c_sr);
+            c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in
+                                         // zeros, and store the results in dst.
+            real = _mm_subs_epi16(c, c_sr);
  
-                    b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
-                    a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
+            b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
+            a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
  
-                    imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
-                    imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+            imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+            imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
  
-                    imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic!
+            imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic!
  
-                    realcacc = _mm_adds_epi16(realcacc, real);
-                    imagcacc = _mm_adds_epi16(imagcacc, imag);
+            realcacc = _mm_adds_epi16(realcacc, real);
+            imagcacc = _mm_adds_epi16(imagcacc, imag);
  
-                    _in_a += 4;
-                    _in_b += 4;
-                }
+            _in_a += 4;
+            _in_b += 4;
+        }
  
-            realcacc = _mm_and_si128(realcacc, mask_real);
-            imagcacc = _mm_and_si128(imagcacc, mask_imag);
+        realcacc = _mm_and_si128(realcacc, mask_real);
+        imagcacc = _mm_and_si128(imagcacc, mask_imag);
  
-            result = _mm_or_si128(realcacc, imagcacc);
+        result = _mm_or_si128(realcacc, imagcacc);
  
-            _mm_storeu_si128((__m128i*)dotProductVector, result); // Store the results back into the dot product vector
+        _mm_storeu_si128((__m128i*)dotProductVector,
+                         result); // Store the results back into the dot product vector
  
-            for (number = 0; number < 4; ++number)
-                {
-                    dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
-                }
+        for (number = 0; number < 4; ++number) {
+            dotProduct = lv_cmake(
+                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
+                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
          }
+    }
  
-    for (number = 0; number < (num_points % 4); ++number)
-        {
-            lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
-            dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
-        }
+    for (number = 0; number < (num_points % 4); ++number) {
+        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
+        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
+                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
+    }
  
      *_out = dotProduct;
  }
@@ -223,7 +244,10 @@ static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, const lv_16
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void volk_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+static inline void volk_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out,
+                                                     const lv_16sc_t* in_a,
+                                                     const lv_16sc_t* in_b,
+                                                     unsigned int num_points)
  {
      lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
  
@@ -234,62 +258,126 @@ static inline void volk_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, const lv_16
      lv_16sc_t* _out = out;
      unsigned int number;
  
-    if (avx_iters > 0)
-        {
-            __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result;
-            __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];
-
-            realcacc = _mm256_setzero_si256();
-            imagcacc = _mm256_setzero_si256();
-
-            mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
-            mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
-
-            for(number = 0; number < avx_iters; number++)
-                {
-                    a = _mm256_loadu_si256((__m256i*)_in_a);
-                    __VOLK_PREFETCH(_in_a + 16);
-                    b = _mm256_loadu_si256((__m256i*)_in_b);
-                    __VOLK_PREFETCH(_in_b + 16);
-                    c = _mm256_mullo_epi16(a, b);
-
-                    c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
-                    real = _mm256_subs_epi16(c, c_sr);
-
-                    b_sl = _mm256_slli_si256(b, 2);
-                    a_sl = _mm256_slli_si256(a, 2);
-
-                    imag1 = _mm256_mullo_epi16(a, b_sl);
-                    imag2 = _mm256_mullo_epi16(b, a_sl);
-
-                    imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic!
-
-                    realcacc = _mm256_adds_epi16(realcacc, real);
-                    imagcacc = _mm256_adds_epi16(imagcacc, imag);
-
-                    _in_a += 8;
-                    _in_b += 8;
-                }
+    if (avx_iters > 0) {
+        __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
+            realcacc, imagcacc, result;
+        __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];
+
+        realcacc = _mm256_setzero_si256();
+        imagcacc = _mm256_setzero_si256();
+
+        mask_imag = _mm256_set_epi8(0xFF,
+                                    0xFF,
+                                    0,
+                                    0,
+                                    0xFF,
+                                    0xFF,
+                                    0,
+                                    0,
+                                    0xFF,
+                                    0xFF,
+                                    0,
+                                    0,
+                                    0xFF,
+                                    0xFF,
+                                    0,
+                                    0,
+                                    0xFF,
+                                    0xFF,
+                                    0,
+                                    0,
+                                    0xFF,
+                                    0xFF,
+                                    0,
+                                    0,
+                                    0xFF,
+                                    0xFF,
+                                    0,
+                                    0,
+                                    0xFF,
+                                    0xFF,
+                                    0,
+                                    0);
+        mask_real = _mm256_set_epi8(0,
+                                    0,
+                                    0xFF,
+                                    0xFF,
+                                    0,
+                                    0,
+                                    0xFF,
+                                    0xFF,
+                                    0,
+                                    0,
+                                    0xFF,
+                                    0xFF,
+                                    0,
+                                    0,
+                                    0xFF,
+                                    0xFF,
+                                    0,
+                                    0,
+                                    0xFF,
+                                    0xFF,
+                                    0,
+                                    0,
+                                    0xFF,
+                                    0xFF,
+                                    0,
+                                    0,
+                                    0xFF,
+                                    0xFF,
+                                    0,
+                                    0,
+                                    0xFF,
+                                    0xFF);
+
+        for (number = 0; number < avx_iters; number++) {
+            a = _mm256_loadu_si256((__m256i*)_in_a);
+            __VOLK_PREFETCH(_in_a + 16);
+            b = _mm256_loadu_si256((__m256i*)_in_b);
+            __VOLK_PREFETCH(_in_b + 16);
+            c = _mm256_mullo_epi16(a, b);
+
+            c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting
+                                            // in zeros, and store the results in dst.
+            real = _mm256_subs_epi16(c, c_sr);
+
+            b_sl = _mm256_slli_si256(b, 2);
+            a_sl = _mm256_slli_si256(a, 2);
+
+            imag1 = _mm256_mullo_epi16(a, b_sl);
+            imag2 = _mm256_mullo_epi16(b, a_sl);
+
+            imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic!
+
+            realcacc = _mm256_adds_epi16(realcacc, real);
+            imagcacc = _mm256_adds_epi16(imagcacc, imag);
+
+            _in_a += 8;
+            _in_b += 8;
+        }
  
-            realcacc = _mm256_and_si256(realcacc, mask_real);
-            imagcacc = _mm256_and_si256(imagcacc, mask_imag);
+        realcacc = _mm256_and_si256(realcacc, mask_real);
+        imagcacc = _mm256_and_si256(imagcacc, mask_imag);
  
-            result = _mm256_or_si256(realcacc, imagcacc);
+        result = _mm256_or_si256(realcacc, imagcacc);
  
-            _mm256_storeu_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector
-            _mm256_zeroupper();
+        _mm256_storeu_si256((__m256i*)dotProductVector,
+                            result); // Store the results back into the dot product vector
+        _mm256_zeroupper();
  
-            for (number = 0; number < 8; ++number)
-                {
-                    dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
-                }
+        for (number = 0; number < 8; ++number) {
+            dotProduct = lv_cmake(
+                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
+                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
          }
+    }
  
-    for (number = 0; number < (num_points % 8); ++number)
-        {
-            lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
-            dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
-        }
+    for (number = 0; number < (num_points % 8); ++number) {
+        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
+        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
+                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
+    }
  
      *_out = dotProduct;
  }
@@ -299,7 +387,10 @@ static inline void volk_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, const lv_16
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void volk_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+static inline void volk_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out,
+                                                     const lv_16sc_t* in_a,
+                                                     const lv_16sc_t* in_b,
+                                                     unsigned int num_points)
  {
      lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
  
@@ -310,62 +401,126 @@ static inline void volk_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, const lv_16
      lv_16sc_t* _out = out;
      unsigned int number;
  
-    if (avx_iters > 0)
-        {
-            __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result;
-            __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];
-
-            realcacc = _mm256_setzero_si256();
-            imagcacc = _mm256_setzero_si256();
-
-            mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
-            mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
-
-            for(number = 0; number < avx_iters; number++)
-                {
-                    a = _mm256_load_si256((__m256i*)_in_a);
-                    __VOLK_PREFETCH(_in_a + 16);
-                    b = _mm256_load_si256((__m256i*)_in_b);
-                    __VOLK_PREFETCH(_in_b + 16);
-                    c = _mm256_mullo_epi16(a, b);
-
-                    c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
-                    real = _mm256_subs_epi16(c, c_sr);
-
-                    b_sl = _mm256_slli_si256(b, 2);
-                    a_sl = _mm256_slli_si256(a, 2);
-
-                    imag1 = _mm256_mullo_epi16(a, b_sl);
-                    imag2 = _mm256_mullo_epi16(b, a_sl);
-
-                    imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic!
-
-                    realcacc = _mm256_adds_epi16(realcacc, real);
-                    imagcacc = _mm256_adds_epi16(imagcacc, imag);
-
-                    _in_a += 8;
-                    _in_b += 8;
-                }
+    if (avx_iters > 0) {
+        __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
+            realcacc, imagcacc, result;
+        __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];
+
+        realcacc = _mm256_setzero_si256();
+        imagcacc = _mm256_setzero_si256();
+
+        mask_imag = _mm256_set_epi8(0xFF,
+                                    0xFF,
+                                    0,
+                                    0,
+                                    0xFF,
+                                    0xFF,
+                                    0,
+                                    0,
+                                    0xFF,
+                                    0xFF,
+                                    0,
+                                    0,
+                                    0xFF,
+                                    0xFF,
+                                    0,
+                                    0,
+                                    0xFF,
+                                    0xFF,
+                                    0,
+                                    0,
+                                    0xFF,
+                                    0xFF,
+                                    0,
+                                    0,
+                                    0xFF,
+                                    0xFF,
+                                    0,
+                                    0,
+                                    0xFF,
+                                    0xFF,
+                                    0,
+                                    0);
+        mask_real = _mm256_set_epi8(0,
+                                    0,
+                                    0xFF,
+                                    0xFF,
+                                    0,
+                                    0,
+                                    0xFF,
+                                    0xFF,
+                                    0,
+                                    0,
+                                    0xFF,
+                                    0xFF,
+                                    0,
+                                    0,
+                                    0xFF,
+                                    0xFF,
+                                    0,
+                                    0,
+                                    0xFF,
+                                    0xFF,
+                                    0,
+                                    0,
+                                    0xFF,
+                                    0xFF,
+                                    0,
+                                    0,
+                                    0xFF,
+                                    0xFF,
+                                    0,
+                                    0,
+                                    0xFF,
+                                    0xFF);
+
+        for (number = 0; number < avx_iters; number++) {
+            a = _mm256_load_si256((__m256i*)_in_a);
+            __VOLK_PREFETCH(_in_a + 16);
+            b = _mm256_load_si256((__m256i*)_in_b);
+            __VOLK_PREFETCH(_in_b + 16);
+            c = _mm256_mullo_epi16(a, b);
+
+            c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting
+                                            // in zeros, and store the results in dst.
+            real = _mm256_subs_epi16(c, c_sr);
+
+            b_sl = _mm256_slli_si256(b, 2);
+            a_sl = _mm256_slli_si256(a, 2);
+
+            imag1 = _mm256_mullo_epi16(a, b_sl);
+            imag2 = _mm256_mullo_epi16(b, a_sl);
+
+            imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic!
+
+            realcacc = _mm256_adds_epi16(realcacc, real);
+            imagcacc = _mm256_adds_epi16(imagcacc, imag);
+
+            _in_a += 8;
+            _in_b += 8;
+        }
  
-            realcacc = _mm256_and_si256(realcacc, mask_real);
-            imagcacc = _mm256_and_si256(imagcacc, mask_imag);
+        realcacc = _mm256_and_si256(realcacc, mask_real);
+        imagcacc = _mm256_and_si256(imagcacc, mask_imag);
  
-            result = _mm256_or_si256(realcacc, imagcacc);
+        result = _mm256_or_si256(realcacc, imagcacc);
  
-            _mm256_store_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector
-            _mm256_zeroupper();
+        _mm256_store_si256((__m256i*)dotProductVector,
+                           result); // Store the results back into the dot product vector
+        _mm256_zeroupper();
  
-            for (number = 0; number < 8; ++number)
-                {
-                    dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
-                }
+        for (number = 0; number < 8; ++number) {
+            dotProduct = lv_cmake(
+                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
+                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
          }
+    }
  
-    for (number = 0; number < (num_points % 8); ++number)
-        {
-            lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
-            dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
-        }
+    for (number = 0; number < (num_points % 8); ++number) {
+        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
+        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
+                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
+    }
  
      *_out = dotProduct;
  }
@@ -375,69 +530,70 @@ static inline void volk_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, const lv_16
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out,
+                                                   const lv_16sc_t* in_a,
+                                                   const lv_16sc_t* in_b,
+                                                   unsigned int num_points)
  {
      unsigned int quarter_points = num_points / 4;
      unsigned int number;
  
-    lv_16sc_t* a_ptr = (lv_16sc_t*) in_a;
-    lv_16sc_t* b_ptr = (lv_16sc_t*) in_b;
+    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
+    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
      *out = lv_cmake((int16_t)0, (int16_t)0);
  
-    if (quarter_points > 0)
-        {
-            // for 2-lane vectors, 1st lane holds the real part,
-            // 2nd lane holds the imaginary part
-            int16x4x2_t a_val, b_val, c_val, accumulator;
-            int16x4x2_t tmp_real, tmp_imag;
-            __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
-            accumulator.val[0] = vdup_n_s16(0);
-            accumulator.val[1] = vdup_n_s16(0);
-            lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
-
-            for(number = 0; number < quarter_points; ++number)
-                {
-                    a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
-                    b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-                    __VOLK_PREFETCH(a_ptr + 8);
-                    __VOLK_PREFETCH(b_ptr + 8);
-
-                    // multiply the real*real and imag*imag to get real result
-                    // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
-                    tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
-                    // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
-                    tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);
-
-                    // Multiply cross terms to get the imaginary result
-                    // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
-                    tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
-                    // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
-                    tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
-
-                    c_val.val[0] = vqsub_s16(tmp_real.val[0], tmp_real.val[1]);
-                    c_val.val[1] = vqadd_s16(tmp_imag.val[0], tmp_imag.val[1]);
-
-                    accumulator.val[0] = vqadd_s16(accumulator.val[0], c_val.val[0]);
-                    accumulator.val[1] = vqadd_s16(accumulator.val[1], c_val.val[1]);
-
-                    a_ptr += 4;
-                    b_ptr += 4;
-                }
-
-            vst2_s16((int16_t*)accum_result, accumulator);
-            for (number = 0; number < 4; ++number)
-                {
-                    dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(accum_result[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(accum_result[number])));
-                }
-
-            *out = dotProduct;
+    if (quarter_points > 0) {
+        // for 2-lane vectors, 1st lane holds the real part,
+        // 2nd lane holds the imaginary part
+        int16x4x2_t a_val, b_val, c_val, accumulator;
+        int16x4x2_t tmp_real, tmp_imag;
+        __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
+        accumulator.val[0] = vdup_n_s16(0);
+        accumulator.val[1] = vdup_n_s16(0);
+        lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
+
+        for (number = 0; number < quarter_points; ++number) {
+            a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+            b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+            __VOLK_PREFETCH(a_ptr + 8);
+            __VOLK_PREFETCH(b_ptr + 8);
+
+            // multiply the real*real and imag*imag to get real result
+            // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
+            tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
+            // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
+            tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);
+
+            // Multiply cross terms to get the imaginary result
+            // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
+            tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
+            // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
+            tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
+
+            c_val.val[0] = vqsub_s16(tmp_real.val[0], tmp_real.val[1]);
+            c_val.val[1] = vqadd_s16(tmp_imag.val[0], tmp_imag.val[1]);
+
+            accumulator.val[0] = vqadd_s16(accumulator.val[0], c_val.val[0]);
+            accumulator.val[1] = vqadd_s16(accumulator.val[1], c_val.val[1]);
+
+            a_ptr += 4;
+            b_ptr += 4;
          }
  
-    // tail case
-    for(number = quarter_points * 4; number < num_points; ++number)
-        {
-            *out += (*a_ptr++) * (*b_ptr++);
+        vst2_s16((int16_t*)accum_result, accumulator);
+        for (number = 0; number < 4; ++number) {
+            dotProduct = lv_cmake(
+                sat_adds16i(lv_creal(dotProduct), lv_creal(accum_result[number])),
+                sat_adds16i(lv_cimag(dotProduct), lv_cimag(accum_result[number])));
          }
+
+        *out = dotProduct;
+    }
+
+    // tail case
+    for (number = quarter_points * 4; number < num_points; ++number) {
+        *out += (*a_ptr++) * (*b_ptr++);
+    }
  }
  
  #endif /* LV_HAVE_NEON */
@@ -446,13 +602,16 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const lv_16sc
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out,
+                                                       const lv_16sc_t* in_a,
+                                                       const lv_16sc_t* in_b,
+                                                       unsigned int num_points)
  {
      unsigned int quarter_points = num_points / 4;
      unsigned int number;
  
-    lv_16sc_t* a_ptr = (lv_16sc_t*) in_a;
-    lv_16sc_t* b_ptr = (lv_16sc_t*) in_b;
+    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
+    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
      // for 2-lane vectors, 1st lane holds the real part,
      // 2nd lane holds the imaginary part
      int16x4x2_t a_val, b_val, accumulator;
@@ -461,35 +620,33 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, const lv_
      accumulator.val[0] = vdup_n_s16(0);
      accumulator.val[1] = vdup_n_s16(0);
  
-    for(number = 0; number < quarter_points; ++number)
-        {
-            a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
-            b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-            __VOLK_PREFETCH(a_ptr + 8);
-            __VOLK_PREFETCH(b_ptr + 8);
+    for (number = 0; number < quarter_points; ++number) {
+        a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+        b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+        __VOLK_PREFETCH(a_ptr + 8);
+        __VOLK_PREFETCH(b_ptr + 8);
  
-            tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
-            tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
+        tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
+        tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
  
-            // use multiply accumulate/subtract to get result
-            tmp.val[0] = vmls_s16(tmp.val[0], a_val.val[1], b_val.val[1]);
-            tmp.val[1] = vmla_s16(tmp.val[1], a_val.val[0], b_val.val[1]);
+        // use multiply accumulate/subtract to get result
+        tmp.val[0] = vmls_s16(tmp.val[0], a_val.val[1], b_val.val[1]);
+        tmp.val[1] = vmla_s16(tmp.val[1], a_val.val[0], b_val.val[1]);
  
-            accumulator.val[0] = vqadd_s16(accumulator.val[0], tmp.val[0]);
-            accumulator.val[1] = vqadd_s16(accumulator.val[1], tmp.val[1]);
+        accumulator.val[0] = vqadd_s16(accumulator.val[0], tmp.val[0]);
+        accumulator.val[1] = vqadd_s16(accumulator.val[1], tmp.val[1]);
  
-            a_ptr += 4;
-            b_ptr += 4;
-        }
+        a_ptr += 4;
+        b_ptr += 4;
+    }
  
      vst2_s16((int16_t*)accum_result, accumulator);
      *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
  
      // tail case
-    for(number = quarter_points * 4; number < num_points; ++number)
-        {
-            *out += (*a_ptr++) * (*b_ptr++);
-        }
+    for (number = quarter_points * 4; number < num_points; ++number) {
+        *out += (*a_ptr++) * (*b_ptr++);
+    }
  }
  
  #endif /* LV_HAVE_NEON */
@@ -498,13 +655,16 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, const lv_
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out,
+                                                          const lv_16sc_t* in_a,
+                                                          const lv_16sc_t* in_b,
+                                                          unsigned int num_points)
  {
      unsigned int quarter_points = num_points / 4;
      unsigned int number;
  
-    lv_16sc_t* a_ptr = (lv_16sc_t*) in_a;
-    lv_16sc_t* b_ptr = (lv_16sc_t*) in_b;
+    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
+    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
      // for 2-lane vectors, 1st lane holds the real part,
      // 2nd lane holds the imaginary part
      int16x4x2_t a_val, b_val, accumulator1, accumulator2;
@@ -515,22 +675,21 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out, const
      accumulator2.val[0] = vdup_n_s16(0);
      accumulator2.val[1] = vdup_n_s16(0);
  
-    for(number = 0; number < quarter_points; ++number)
-        {
-            a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
-            b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-            __VOLK_PREFETCH(a_ptr + 8);
-            __VOLK_PREFETCH(b_ptr + 8);
+    for (number = 0; number < quarter_points; ++number) {
+        a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+        b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+        __VOLK_PREFETCH(a_ptr + 8);
+        __VOLK_PREFETCH(b_ptr + 8);
  
-            // use 2 accumulators to remove inter-instruction data dependencies
-            accumulator1.val[0] = vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]);
-            accumulator2.val[0] = vmls_s16(accumulator2.val[0], a_val.val[1], b_val.val[1]);
-            accumulator1.val[1] = vmla_s16(accumulator1.val[1], a_val.val[0], b_val.val[1]);
-            accumulator2.val[1] = vmla_s16(accumulator2.val[1], a_val.val[1], b_val.val[0]);
+        // use 2 accumulators to remove inter-instruction data dependencies
+        accumulator1.val[0] = vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]);
+        accumulator2.val[0] = vmls_s16(accumulator2.val[0], a_val.val[1], b_val.val[1]);
+        accumulator1.val[1] = vmla_s16(accumulator1.val[1], a_val.val[0], b_val.val[1]);
+        accumulator2.val[1] = vmla_s16(accumulator2.val[1], a_val.val[1], b_val.val[0]);
  
-            a_ptr += 4;
-            b_ptr += 4;
-        }
+        a_ptr += 4;
+        b_ptr += 4;
+    }
  
      accumulator1.val[0] = vqadd_s16(accumulator1.val[0], accumulator2.val[0]);
      accumulator1.val[1] = vqadd_s16(accumulator1.val[1], accumulator2.val[1]);
@@ -539,10 +698,9 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out, const
      *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
  
      // tail case
-    for(number = quarter_points * 4; number < num_points; ++number)
-        {
-            *out += (*a_ptr++) * (*b_ptr++);
-        }
+    for (number = quarter_points * 4; number < num_points; ++number) {
+        *out += (*a_ptr++) * (*b_ptr++);
+    }
  }
  
  #endif /* LV_HAVE_NEON */
diff --git a/kernels/volk/volk_16ic_x2_multiply_16ic.h b/kernels/volk/volk_16ic_x2_multiply_16ic.h

index 20d6a7febd49e91d577df6e7630a3fa2d393d505..2bf835dcc0c9fd72deb55435a5e44e18b0b1846c 100644 (file)
--- a/kernels/volk/volk_16ic_x2_multiply_16ic.h
+++ b/kernels/volk/volk_16ic_x2_multiply_16ic.h
@@ -25,18 +25,19 @@
   *
   * \b Overview
   *
- * Multiplies two input complex vectors, point-by-point, storing the result in the third vector.
- * WARNING: Saturation is not checked.
+ * Multiplies two input complex vectors, point-by-point, storing the result in the third
+ * vector. WARNING: Saturation is not checked.
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_16ic_x2_multiply_16ic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points);
- * \endcode
+ * void volk_16ic_x2_multiply_16ic(lv_16sc_t* result, const lv_16sc_t* in_a, const
+ * lv_16sc_t* in_b, unsigned int num_points); \endcode
   *
   * \b Inputs
   * \li in_a: One of the vectors to be multiplied.
   * \li in_b: The other vector to be multiplied.
- * \li num_points: The number of complex data points to be multiplied from both input vectors.
+ * \li num_points: The number of complex data points to be multiplied from both input
+ * vectors.
   *
   * \b Outputs
   * \li result: The vector where the results will be stored.
@@ -51,13 +52,15 @@
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+static inline void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t* result,
+                                                      const lv_16sc_t* in_a,
+                                                      const lv_16sc_t* in_b,
+                                                      unsigned int num_points)
  {
      unsigned int n;
-    for (n = 0; n < num_points; n++)
-        {
-            result[n] = in_a[n] * in_b[n];
-        }
+    for (n = 0; n < num_points; n++) {
+        result[n] = in_a[n] * in_b[n];
+    }
  }
  
  #endif /*LV_HAVE_GENERIC*/
@@ -66,51 +69,58 @@ static inline void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t* result, const l
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+static inline void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out,
+                                                     const lv_16sc_t* in_a,
+                                                     const lv_16sc_t* in_b,
+                                                     unsigned int num_points)
  {
      const unsigned int sse_iters = num_points / 4;
-    __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, result;
+    __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
+        result;
  
-    mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
-    mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
+    mask_imag = _mm_set_epi8(
+        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
+    mask_real = _mm_set_epi8(
+        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
  
      const lv_16sc_t* _in_a = in_a;
      const lv_16sc_t* _in_b = in_b;
      lv_16sc_t* _out = out;
      unsigned int number;
  
-    for(number = 0; number < sse_iters; number++)
-        {
-            a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
-            b = _mm_load_si128((__m128i*)_in_b);
-            c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
+    for (number = 0; number < sse_iters; number++) {
+        a = _mm_load_si128(
+            (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+        b = _mm_load_si128((__m128i*)_in_b);
+        c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
  
-            c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
-            real = _mm_subs_epi16 (c, c_sr);
-            real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0,  a3.r*b3.r- a3.i*b3.i
+        c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in
+                                     // zeros, and store the results in dst.
+        real = _mm_subs_epi16(c, c_sr);
+        real = _mm_and_si128(real,
+                             mask_real); // a3.r*b3.r-a3.i*b3.i , 0,  a3.r*b3.r- a3.i*b3.i
  
-            b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
-            a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
+        b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
+        a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
  
-            imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
-            imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+        imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+        imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
  
-            imag = _mm_adds_epi16(imag1, imag2);
-            imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
+        imag = _mm_adds_epi16(imag1, imag2);
+        imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
  
-            result = _mm_or_si128 (real, imag);
+        result = _mm_or_si128(real, imag);
  
-            _mm_store_si128((__m128i*)_out, result);
+        _mm_store_si128((__m128i*)_out, result);
  
-            _in_a += 4;
-            _in_b += 4;
-            _out += 4;
-        }
+        _in_a += 4;
+        _in_b += 4;
+        _out += 4;
+    }
  
-    for (number = sse_iters * 4; number < num_points; ++number)
-        {
-            *_out++ = (*_in_a++) * (*_in_b++);
-        }
+    for (number = sse_iters * 4; number < num_points; ++number) {
+        *_out++ = (*_in_a++) * (*_in_b++);
+    }
  }
  #endif /* LV_HAVE_SSE2 */
  
@@ -118,51 +128,58 @@ static inline void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, const lv_16
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+static inline void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out,
+                                                     const lv_16sc_t* in_a,
+                                                     const lv_16sc_t* in_b,
+                                                     unsigned int num_points)
  {
      const unsigned int sse_iters = num_points / 4;
-    __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result;
+    __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
+        result;
  
-    mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
-    mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
+    mask_imag = _mm_set_epi8(
+        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
+    mask_real = _mm_set_epi8(
+        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
  
      const lv_16sc_t* _in_a = in_a;
      const lv_16sc_t* _in_b = in_b;
      lv_16sc_t* _out = out;
      unsigned int number;
  
-    for(number = 0; number < sse_iters; number++)
-        {
-            a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
-            b = _mm_loadu_si128((__m128i*)_in_b);
-            c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
+    for (number = 0; number < sse_iters; number++) {
+        a = _mm_loadu_si128(
+            (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+        b = _mm_loadu_si128((__m128i*)_in_b);
+        c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
  
-            c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
-            real = _mm_subs_epi16 (c, c_sr);
-            real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0,  a3.r*b3.r- a3.i*b3.i
+        c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in
+                                     // zeros, and store the results in dst.
+        real = _mm_subs_epi16(c, c_sr);
+        real = _mm_and_si128(real,
+                             mask_real); // a3.r*b3.r-a3.i*b3.i , 0,  a3.r*b3.r- a3.i*b3.i
  
-            b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
-            a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
+        b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
+        a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
  
-            imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
-            imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+        imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+        imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
  
-            imag = _mm_adds_epi16(imag1, imag2);
-            imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
+        imag = _mm_adds_epi16(imag1, imag2);
+        imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
  
-            result = _mm_or_si128 (real, imag);
+        result = _mm_or_si128(real, imag);
  
-            _mm_storeu_si128((__m128i*)_out, result);
+        _mm_storeu_si128((__m128i*)_out, result);
  
-            _in_a += 4;
-            _in_b += 4;
-            _out += 4;
-        }
+        _in_a += 4;
+        _in_b += 4;
+        _out += 4;
+    }
  
-    for (number = sse_iters * 4; number < num_points; ++number)
-        {
-            *_out++ = (*_in_a++) * (*_in_b++);
-        }
+    for (number = sse_iters * 4; number < num_points; ++number) {
+        *_out++ = (*_in_a++) * (*_in_b++);
+    }
  }
  #endif /* LV_HAVE_SSE2 */
  
@@ -170,7 +187,10 @@ static inline void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, const lv_16
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out,
+                                                     const lv_16sc_t* in_a,
+                                                     const lv_16sc_t* in_b,
+                                                     unsigned int num_points)
  {
      unsigned int number = 0;
      const unsigned int avx2_points = num_points / 8;
@@ -179,44 +199,108 @@ static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, const lv_16
      const lv_16sc_t* _in_b = in_b;
      lv_16sc_t* _out = out;
  
-    __m256i a, b, c, c_sr,  real, imag, imag1, imag2, b_sl, a_sl, result;
-
-    const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
-    const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
-
-    for(;number < avx2_points; number++)
-        {
-            a = _mm256_loadu_si256((__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi
-            b = _mm256_loadu_si256((__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di
-            c = _mm256_mullo_epi16(a, b);
-
-            c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
-            real = _mm256_subs_epi16(c, c_sr);
-            real = _mm256_and_si256(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0,  a3.r*b3.r- a3.i*b3.i
-
-            b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
-            a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
-
-            imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
-            imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
-
-            imag = _mm256_adds_epi16(imag1, imag2);
-            imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
-
-            result = _mm256_or_si256(real, imag);
-
-            _mm256_storeu_si256((__m256i*)_out, result);
-
-            _in_a += 8;
-            _in_b += 8;
-            _out += 8;
-        }
+    __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;
+
+    const __m256i mask_imag = _mm256_set_epi8(0xFF,
+                                              0xFF,
+                                              0,
+                                              0,
+                                              0xFF,
+                                              0xFF,
+                                              0,
+                                              0,
+                                              0xFF,
+                                              0xFF,
+                                              0,
+                                              0,
+                                              0xFF,
+                                              0xFF,
+                                              0,
+                                              0,
+                                              0xFF,
+                                              0xFF,
+                                              0,
+                                              0,
+                                              0xFF,
+                                              0xFF,
+                                              0,
+                                              0,
+                                              0xFF,
+                                              0xFF,
+                                              0,
+                                              0,
+                                              0xFF,
+                                              0xFF,
+                                              0,
+                                              0);
+    const __m256i mask_real = _mm256_set_epi8(0,
+                                              0,
+                                              0xFF,
+                                              0xFF,
+                                              0,
+                                              0,
+                                              0xFF,
+                                              0xFF,
+                                              0,
+                                              0,
+                                              0xFF,
+                                              0xFF,
+                                              0,
+                                              0,
+                                              0xFF,
+                                              0xFF,
+                                              0,
+                                              0,
+                                              0xFF,
+                                              0xFF,
+                                              0,
+                                              0,
+                                              0xFF,
+                                              0xFF,
+                                              0,
+                                              0,
+                                              0xFF,
+                                              0xFF,
+                                              0,
+                                              0,
+                                              0xFF,
+                                              0xFF);
+
+    for (; number < avx2_points; number++) {
+        a = _mm256_loadu_si256(
+            (__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi
+        b = _mm256_loadu_si256(
+            (__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di
+        c = _mm256_mullo_epi16(a, b);
+
+        c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in
+                                        // zeros, and store the results in dst.
+        real = _mm256_subs_epi16(c, c_sr);
+        real = _mm256_and_si256(
+            real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0,  a3.r*b3.r- a3.i*b3.i
+
+        b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
+        a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
+
+        imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+        imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+
+        imag = _mm256_adds_epi16(imag1, imag2);
+        imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
+
+        result = _mm256_or_si256(real, imag);
+
+        _mm256_storeu_si256((__m256i*)_out, result);
+
+        _in_a += 8;
+        _in_b += 8;
+        _out += 8;
+    }
      _mm256_zeroupper();
      number = avx2_points * 8;
-    for(;number < num_points; number++)
-        {
-            *_out++ = (*_in_a++) * (*_in_b++);
-        }
+    for (; number < num_points; number++) {
+        *_out++ = (*_in_a++) * (*_in_b++);
+    }
  }
  #endif /* LV_HAVE_AVX2  */
  
@@ -224,7 +308,10 @@ static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, const lv_16
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out,
+                                                     const lv_16sc_t* in_a,
+                                                     const lv_16sc_t* in_b,
+                                                     unsigned int num_points)
  {
      unsigned int number = 0;
      const unsigned int avx2_points = num_points / 8;
@@ -233,44 +320,108 @@ static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, const lv_16
      const lv_16sc_t* _in_b = in_b;
      lv_16sc_t* _out = out;
  
-    __m256i a, b, c, c_sr,  real, imag, imag1, imag2, b_sl, a_sl, result;
-
-    const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
-    const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
-
-    for(;number < avx2_points; number++)
-        {
-            a = _mm256_load_si256((__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi
-            b = _mm256_load_si256((__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di
-            c = _mm256_mullo_epi16(a, b);
-
-            c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
-            real = _mm256_subs_epi16(c, c_sr);
-            real = _mm256_and_si256(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0,  a3.r*b3.r- a3.i*b3.i
-
-            b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
-            a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
-
-            imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
-            imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
-
-            imag = _mm256_adds_epi16(imag1, imag2);
-            imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
-
-            result = _mm256_or_si256(real, imag);
-
-            _mm256_store_si256((__m256i*)_out, result);
-
-            _in_a += 8;
-            _in_b += 8;
-            _out += 8;
-        }
+    __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;
+
+    const __m256i mask_imag = _mm256_set_epi8(0xFF,
+                                              0xFF,
+                                              0,
+                                              0,
+                                              0xFF,
+                                              0xFF,
+                                              0,
+                                              0,
+                                              0xFF,
+                                              0xFF,
+                                              0,
+                                              0,
+                                              0xFF,
+                                              0xFF,
+                                              0,
+                                              0,
+                                              0xFF,
+                                              0xFF,
+                                              0,
+                                              0,
+                                              0xFF,
+                                              0xFF,
+                                              0,
+                                              0,
+                                              0xFF,
+                                              0xFF,
+                                              0,
+                                              0,
+                                              0xFF,
+                                              0xFF,
+                                              0,
+                                              0);
+    const __m256i mask_real = _mm256_set_epi8(0,
+                                              0,
+                                              0xFF,
+                                              0xFF,
+                                              0,
+                                              0,
+                                              0xFF,
+                                              0xFF,
+                                              0,
+                                              0,
+                                              0xFF,
+                                              0xFF,
+                                              0,
+                                              0,
+                                              0xFF,
+                                              0xFF,
+                                              0,
+                                              0,
+                                              0xFF,
+                                              0xFF,
+                                              0,
+                                              0,
+                                              0xFF,
+                                              0xFF,
+                                              0,
+                                              0,
+                                              0xFF,
+                                              0xFF,
+                                              0,
+                                              0,
+                                              0xFF,
+                                              0xFF);
+
+    for (; number < avx2_points; number++) {
+        a = _mm256_load_si256(
+            (__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi
+        b = _mm256_load_si256(
+            (__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di
+        c = _mm256_mullo_epi16(a, b);
+
+        c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in
+                                        // zeros, and store the results in dst.
+        real = _mm256_subs_epi16(c, c_sr);
+        real = _mm256_and_si256(
+            real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0,  a3.r*b3.r- a3.i*b3.i
+
+        b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
+        a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
+
+        imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+        imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+
+        imag = _mm256_adds_epi16(imag1, imag2);
+        imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
+
+        result = _mm256_or_si256(real, imag);
+
+        _mm256_store_si256((__m256i*)_out, result);
+
+        _in_a += 8;
+        _in_b += 8;
+        _out += 8;
+    }
      _mm256_zeroupper();
      number = avx2_points * 8;
-    for(;number < num_points; number++)
-        {
-            *_out++ = (*_in_a++) * (*_in_b++);
-        }
+    for (; number < num_points; number++) {
+        *_out++ = (*_in_a++) * (*_in_b++);
+    }
  }
  #endif /* LV_HAVE_AVX2  */
  
@@ -278,48 +429,49 @@ static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, const lv_16
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out,
+                                                   const lv_16sc_t* in_a,
+                                                   const lv_16sc_t* in_b,
+                                                   unsigned int num_points)
  {
-    lv_16sc_t *a_ptr = (lv_16sc_t*) in_a;
-    lv_16sc_t *b_ptr = (lv_16sc_t*) in_b;
+    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
+    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
      unsigned int quarter_points = num_points / 4;
      int16x4x2_t a_val, b_val, c_val;
      int16x4x2_t tmp_real, tmp_imag;
      unsigned int number = 0;
  
-    for(number = 0; number < quarter_points; ++number)
-        {
-            a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
-            b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-            __VOLK_PREFETCH(a_ptr + 4);
-            __VOLK_PREFETCH(b_ptr + 4);
-
-            // multiply the real*real and imag*imag to get real result
-            // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
-            tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
-            // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
-            tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);
-
-            // Multiply cross terms to get the imaginary result
-            // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
-            tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
-            // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
-            tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
-
-            // store the results
-            c_val.val[0] = vsub_s16(tmp_real.val[0], tmp_real.val[1]);
-            c_val.val[1] = vadd_s16(tmp_imag.val[0], tmp_imag.val[1]);
-            vst2_s16((int16_t*)out, c_val);
-
-            a_ptr += 4;
-            b_ptr += 4;
-            out += 4;
-        }
-
-    for(number = quarter_points * 4; number < num_points; number++)
-        {
-            *out++ = (*a_ptr++) * (*b_ptr++);
-        }
+    for (number = 0; number < quarter_points; ++number) {
+        a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+        b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+        __VOLK_PREFETCH(a_ptr + 4);
+        __VOLK_PREFETCH(b_ptr + 4);
+
+        // multiply the real*real and imag*imag to get real result
+        // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
+        tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
+        // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
+        tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);
+
+        // Multiply cross terms to get the imaginary result
+        // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
+        tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
+        // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
+        tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
+
+        // store the results
+        c_val.val[0] = vsub_s16(tmp_real.val[0], tmp_real.val[1]);
+        c_val.val[1] = vadd_s16(tmp_imag.val[0], tmp_imag.val[1]);
+        vst2_s16((int16_t*)out, c_val);
+
+        a_ptr += 4;
+        b_ptr += 4;
+        out += 4;
+    }
+
+    for (number = quarter_points * 4; number < num_points; number++) {
+        *out++ = (*a_ptr++) * (*b_ptr++);
+    }
  }
  #endif /* LV_HAVE_NEON */
  
diff --git a/kernels/volk/volk_16u_byteswap.h b/kernels/volk/volk_16u_byteswap.h

index eaa972f17511618513b49e55b760a58b29dab32a..221dcdbfd825d472680d49d60eb8521026dd517b 100644 (file)
--- a/kernels/volk/volk_16u_byteswap.h
+++ b/kernels/volk/volk_16u_byteswap.h
@@ -58,74 +58,80 @@
  
  #if LV_HAVE_AVX2
  #include <immintrin.h>
-static inline void volk_16u_byteswap_a_avx2(uint16_t* intsToSwap, unsigned int num_points){
-  unsigned int number;
+static inline void volk_16u_byteswap_a_avx2(uint16_t* intsToSwap, unsigned int num_points)
+{
+    unsigned int number;
  
-  const unsigned int nPerSet   = 16;
-  const uint64_t     nSets   = num_points / nPerSet;
+    const unsigned int nPerSet = 16;
+    const uint64_t nSets = num_points / nPerSet;
  
-  uint16_t* inputPtr = (uint16_t*) intsToSwap;
+    uint16_t* inputPtr = (uint16_t*)intsToSwap;
  
-  const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30};
+    const uint8_t shuffleVector[32] = { 1,  0,  3,  2,  5,  4,  7,  6,  9,  8,  11,
+                                        10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
+                                        23, 22, 25, 24, 27, 26, 29, 28, 31, 30 };
  
-  const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector[0]);
+    const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
  
-  for(number = 0; number < nSets; number++) {
-    // Load the 32t values, increment inputPtr later since we're doing it in-place.
-    const __m256i input  = _mm256_load_si256((__m256i*)inputPtr);
-    const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
+    for (number = 0; number < nSets; number++) {
+        // Load the 32t values, increment inputPtr later since we're doing it in-place.
+        const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
+        const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
  
-    // Store the results
-    _mm256_store_si256((__m256i*)inputPtr, output);
-    inputPtr += nPerSet;
-  }
+        // Store the results
+        _mm256_store_si256((__m256i*)inputPtr, output);
+        inputPtr += nPerSet;
+    }
  
-  _mm256_zeroupper();
+    _mm256_zeroupper();
  
-  // Byteswap any remaining points:
-  for(number = nPerSet * nSets; number < num_points; number++) {
-    uint16_t outputVal = *inputPtr;
-    outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
-    *inputPtr = outputVal;
-    inputPtr++;
-  }
+    // Byteswap any remaining points:
+    for (number = nPerSet * nSets; number < num_points; number++) {
+        uint16_t outputVal = *inputPtr;
+        outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
+        *inputPtr = outputVal;
+        inputPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
  
  #if LV_HAVE_AVX2
  #include <immintrin.h>
-static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int num_points){
-  unsigned int number;
+static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int num_points)
+{
+    unsigned int number;
  
-  const unsigned int nPerSet   = 16;
-  const uint64_t     nSets   = num_points / nPerSet;
+    const unsigned int nPerSet = 16;
+    const uint64_t nSets = num_points / nPerSet;
  
-  uint16_t* inputPtr = (uint16_t*) intsToSwap;
+    uint16_t* inputPtr = (uint16_t*)intsToSwap;
  
-  const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30};
+    const uint8_t shuffleVector[32] = { 1,  0,  3,  2,  5,  4,  7,  6,  9,  8,  11,
+                                        10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
+                                        23, 22, 25, 24, 27, 26, 29, 28, 31, 30 };
  
-  const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector[0]);
+    const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
  
-  for (number = 0; number < nSets; number++) {
-    // Load the 32t values, increment inputPtr later since we're doing it in-place.
-    const __m256i input  = _mm256_loadu_si256((__m256i*)inputPtr);
-    const __m256i output = _mm256_shuffle_epi8(input,myShuffle);
+    for (number = 0; number < nSets; number++) {
+        // Load the 32t values, increment inputPtr later since we're doing it in-place.
+        const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
+        const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
  
-    // Store the results
-    _mm256_storeu_si256((__m256i*)inputPtr, output);
-    inputPtr += nPerSet;
-  }
+        // Store the results
+        _mm256_storeu_si256((__m256i*)inputPtr, output);
+        inputPtr += nPerSet;
+    }
  
-  _mm256_zeroupper();
+    _mm256_zeroupper();
  
-  // Byteswap any remaining points:
-  for(number = nPerSet * nSets; number < num_points; number++) {
-    uint16_t outputVal = *inputPtr;
-    outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
-    *inputPtr = outputVal;
-    inputPtr++;
-  }
+    // Byteswap any remaining points:
+    for (number = nPerSet * nSets; number < num_points; number++) {
+        uint16_t outputVal = *inputPtr;
+        outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
+        *inputPtr = outputVal;
+        inputPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
@@ -133,47 +139,50 @@ static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int n
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points){
-  unsigned int number = 0;
-  uint16_t* inputPtr = intsToSwap;
-  __m128i input, left, right, output;
-
-  const unsigned int eighthPoints = num_points / 8;
-  for(;number < eighthPoints; number++){
-    // Load the 16t values, increment inputPtr later since we're doing it in-place.
-    input = _mm_loadu_si128((__m128i*)inputPtr);
-    // Do the two shifts
-    left = _mm_slli_epi16(input, 8);
-    right = _mm_srli_epi16(input, 8);
-    // Or the left and right halves together
-    output = _mm_or_si128(left, right);
-    // Store the results
-    _mm_storeu_si128((__m128i*)inputPtr, output);
-    inputPtr += 8;
-  }
-
-  // Byteswap any remaining points:
-  number = eighthPoints*8;
-  for(; number < num_points; number++){
-    uint16_t outputVal = *inputPtr;
-    outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
-    *inputPtr = outputVal;
-    inputPtr++;
-  }
+static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points)
+{
+    unsigned int number = 0;
+    uint16_t* inputPtr = intsToSwap;
+    __m128i input, left, right, output;
+
+    const unsigned int eighthPoints = num_points / 8;
+    for (; number < eighthPoints; number++) {
+        // Load the 16t values, increment inputPtr later since we're doing it in-place.
+        input = _mm_loadu_si128((__m128i*)inputPtr);
+        // Do the two shifts
+        left = _mm_slli_epi16(input, 8);
+        right = _mm_srli_epi16(input, 8);
+        // Or the left and right halves together
+        output = _mm_or_si128(left, right);
+        // Store the results
+        _mm_storeu_si128((__m128i*)inputPtr, output);
+        inputPtr += 8;
+    }
+
+    // Byteswap any remaining points:
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        uint16_t outputVal = *inputPtr;
+        outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
+        *inputPtr = outputVal;
+        inputPtr++;
+    }
  }
  #endif /* LV_HAVE_SSE2 */
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap, unsigned int num_points){
-  unsigned int point;
-  uint16_t* inputPtr = intsToSwap;
-  for(point = 0; point < num_points; point++){
-    uint16_t output = *inputPtr;
-    output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
-    *inputPtr = output;
-    inputPtr++;
-  }
+static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap,
+                                             unsigned int num_points)
+{
+    unsigned int point;
+    uint16_t* inputPtr = intsToSwap;
+    for (point = 0; point < num_points; point++) {
+        uint16_t output = *inputPtr;
+        output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
+        *inputPtr = output;
+        inputPtr++;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -187,129 +196,136 @@ static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap, unsigned int
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int num_points){
-  unsigned int number = 0;
-  uint16_t* inputPtr = intsToSwap;
-  __m128i input, left, right, output;
-
-  const unsigned int eighthPoints = num_points / 8;
-  for(;number < eighthPoints; number++){
-    // Load the 16t values, increment inputPtr later since we're doing it in-place.
-    input = _mm_load_si128((__m128i*)inputPtr);
-    // Do the two shifts
-    left = _mm_slli_epi16(input, 8);
-    right = _mm_srli_epi16(input, 8);
-    // Or the left and right halves together
-    output = _mm_or_si128(left, right);
-    // Store the results
-    _mm_store_si128((__m128i*)inputPtr, output);
-    inputPtr += 8;
-  }
-
-
-  // Byteswap any remaining points:
-  number = eighthPoints*8;
-  for(; number < num_points; number++){
-    uint16_t outputVal = *inputPtr;
-    outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
-    *inputPtr = outputVal;
-    inputPtr++;
-  }
+static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int num_points)
+{
+    unsigned int number = 0;
+    uint16_t* inputPtr = intsToSwap;
+    __m128i input, left, right, output;
+
+    const unsigned int eighthPoints = num_points / 8;
+    for (; number < eighthPoints; number++) {
+        // Load the 16t values, increment inputPtr later since we're doing it in-place.
+        input = _mm_load_si128((__m128i*)inputPtr);
+        // Do the two shifts
+        left = _mm_slli_epi16(input, 8);
+        right = _mm_srli_epi16(input, 8);
+        // Or the left and right halves together
+        output = _mm_or_si128(left, right);
+        // Store the results
+        _mm_store_si128((__m128i*)inputPtr, output);
+        inputPtr += 8;
+    }
+
+
+    // Byteswap any remaining points:
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        uint16_t outputVal = *inputPtr;
+        outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
+        *inputPtr = outputVal;
+        inputPtr++;
+    }
  }
  #endif /* LV_HAVE_SSE2 */
  
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num_points){
-  unsigned int number;
-  unsigned int eighth_points = num_points / 8;
-  uint16x8_t input, output;
-  uint16_t* inputPtr = intsToSwap;
-
-  for(number = 0; number < eighth_points; number++) {
-    input = vld1q_u16(inputPtr);
-    output = vsriq_n_u16(output, input, 8);
-    output = vsliq_n_u16(output, input, 8);
-    vst1q_u16(inputPtr, output);
-    inputPtr += 8;
-  }
-
-  for(number = eighth_points * 8; number < num_points; number++){
-    uint16_t output = *inputPtr;
-    output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
-    *inputPtr = output;
-    inputPtr++;
-  }
+static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num_points)
+{
+    unsigned int number;
+    unsigned int eighth_points = num_points / 8;
+    uint16x8_t input, output;
+    uint16_t* inputPtr = intsToSwap;
+
+    for (number = 0; number < eighth_points; number++) {
+        input = vld1q_u16(inputPtr);
+        output = vsriq_n_u16(output, input, 8);
+        output = vsliq_n_u16(output, input, 8);
+        vst1q_u16(inputPtr, output);
+        inputPtr += 8;
+    }
+
+    for (number = eighth_points * 8; number < num_points; number++) {
+        uint16_t output = *inputPtr;
+        output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
+        *inputPtr = output;
+        inputPtr++;
+    }
  }
  #endif /* LV_HAVE_NEON */
  
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void volk_16u_byteswap_neon_table(uint16_t* intsToSwap, unsigned int num_points){
-  uint16_t* inputPtr = intsToSwap;
-  unsigned int number = 0;
-  unsigned int n16points = num_points / 16;
-
-  uint8x8x4_t input_table;
-  uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
-  uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
-
-  /* these magic numbers are used as byte-indices in the LUT.
-     they are pre-computed to save time. A simple C program
-     can calculate them; for example for lookup01:
-    uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1};
-    for(ii=0; ii < 8; ++ii) {
-        index += ((uint64_t)(*(chars+ii))) << (ii*8);
+static inline void volk_16u_byteswap_neon_table(uint16_t* intsToSwap,
+                                                unsigned int num_points)
+{
+    uint16_t* inputPtr = intsToSwap;
+    unsigned int number = 0;
+    unsigned int n16points = num_points / 16;
+
+    uint8x8x4_t input_table;
+    uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
+    uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
+
+    /* these magic numbers are used as byte-indices in the LUT.
+       they are pre-computed to save time. A simple C program
+       can calculate them; for example for lookup01:
+      uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1};
+      for(ii=0; ii < 8; ++ii) {
+          index += ((uint64_t)(*(chars+ii))) << (ii*8);
+      }
+    */
+    int_lookup01 = vcreate_u8(1232017111498883080);
+    int_lookup23 = vcreate_u8(1376697457175036426);
+    int_lookup45 = vcreate_u8(1521377802851189772);
+    int_lookup67 = vcreate_u8(1666058148527343118);
+
+    for (number = 0; number < n16points; ++number) {
+        input_table = vld4_u8((uint8_t*)inputPtr);
+        swapped_int01 = vtbl4_u8(input_table, int_lookup01);
+        swapped_int23 = vtbl4_u8(input_table, int_lookup23);
+        swapped_int45 = vtbl4_u8(input_table, int_lookup45);
+        swapped_int67 = vtbl4_u8(input_table, int_lookup67);
+        vst1_u8((uint8_t*)inputPtr, swapped_int01);
+        vst1_u8((uint8_t*)(inputPtr + 4), swapped_int23);
+        vst1_u8((uint8_t*)(inputPtr + 8), swapped_int45);
+        vst1_u8((uint8_t*)(inputPtr + 12), swapped_int67);
+
+        inputPtr += 16;
+    }
+
+    for (number = n16points * 16; number < num_points; ++number) {
+        uint16_t output = *inputPtr;
+        output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
+        *inputPtr = output;
+        inputPtr++;
      }
-  */
-  int_lookup01 = vcreate_u8(1232017111498883080);
-  int_lookup23 = vcreate_u8(1376697457175036426);
-  int_lookup45 = vcreate_u8(1521377802851189772);
-  int_lookup67 = vcreate_u8(1666058148527343118);
-
-  for(number = 0; number < n16points; ++number){
-    input_table = vld4_u8((uint8_t*) inputPtr);
-    swapped_int01 = vtbl4_u8(input_table, int_lookup01);
-    swapped_int23 = vtbl4_u8(input_table, int_lookup23);
-    swapped_int45 = vtbl4_u8(input_table, int_lookup45);
-    swapped_int67 = vtbl4_u8(input_table, int_lookup67);
-    vst1_u8((uint8_t*)inputPtr, swapped_int01);
-    vst1_u8((uint8_t*)(inputPtr+4), swapped_int23);
-    vst1_u8((uint8_t*)(inputPtr+8), swapped_int45);
-    vst1_u8((uint8_t*)(inputPtr+12), swapped_int67);
-
-    inputPtr += 16;
-  }
-
-  for(number = n16points * 16; number < num_points; ++number){
-    uint16_t output = *inputPtr;
-    output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
-    *inputPtr = output;
-    inputPtr++;
-  }
  }
  #endif /* LV_HAVE_NEON */
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void volk_16u_byteswap_a_generic(uint16_t* intsToSwap, unsigned int num_points){
-  unsigned int point;
-  uint16_t* inputPtr = intsToSwap;
-  for(point = 0; point < num_points; point++){
-    uint16_t output = *inputPtr;
-    output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
-    *inputPtr = output;
-    inputPtr++;
-  }
+static inline void volk_16u_byteswap_a_generic(uint16_t* intsToSwap,
+                                               unsigned int num_points)
+{
+    unsigned int point;
+    uint16_t* inputPtr = intsToSwap;
+    for (point = 0; point < num_points; point++) {
+        uint16_t output = *inputPtr;
+        output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
+        *inputPtr = output;
+        inputPtr++;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  #ifdef LV_HAVE_ORC
  
  extern void volk_16u_byteswap_a_orc_impl(uint16_t* intsToSwap, unsigned int num_points);
-static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int num_points){
+static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int num_points)
+{
      volk_16u_byteswap_a_orc_impl(intsToSwap, num_points);
  }
  #endif /* LV_HAVE_ORC */
diff --git a/kernels/volk/volk_16u_byteswappuppet_16u.h b/kernels/volk/volk_16u_byteswappuppet_16u.h

index d3c8c5d007b697877d14629b46ececf1452a33be..8cb13187b0e76e739c87593afc0b8087bf67422c 100644 (file)
--- a/kernels/volk/volk_16u_byteswappuppet_16u.h
+++ b/kernels/volk/volk_16u_byteswappuppet_16u.h
@@ -3,69 +3,83 @@
  
  
  #include <stdint.h>
-#include <volk/volk_16u_byteswap.h>
  #include <string.h>
+#include <volk/volk_16u_byteswap.h>
  
  #ifdef LV_HAVE_GENERIC
-static inline void volk_16u_byteswappuppet_16u_generic(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){
+static inline void volk_16u_byteswappuppet_16u_generic(uint16_t* output,
+                                                       uint16_t* intsToSwap,
+                                                       unsigned int num_points)
+{
  
      volk_16u_byteswap_generic((uint16_t*)intsToSwap, num_points);
      memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
-
  }
  #endif
  
  #ifdef LV_HAVE_NEON
-static inline void volk_16u_byteswappuppet_16u_neon(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){
+static inline void volk_16u_byteswappuppet_16u_neon(uint16_t* output,
+                                                    uint16_t* intsToSwap,
+                                                    unsigned int num_points)
+{
  
      volk_16u_byteswap_neon((uint16_t*)intsToSwap, num_points);
      memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
-
  }
  #endif
  
  #ifdef LV_HAVE_NEON
-static inline void volk_16u_byteswappuppet_16u_neon_table(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){
+static inline void volk_16u_byteswappuppet_16u_neon_table(uint16_t* output,
+                                                          uint16_t* intsToSwap,
+                                                          unsigned int num_points)
+{
  
      volk_16u_byteswap_neon_table((uint16_t*)intsToSwap, num_points);
      memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
-
  }
  #endif
  
  #ifdef LV_HAVE_SSE2
-static inline void volk_16u_byteswappuppet_16u_u_sse2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){
+static inline void volk_16u_byteswappuppet_16u_u_sse2(uint16_t* output,
+                                                      uint16_t* intsToSwap,
+                                                      unsigned int num_points)
+{
  
      volk_16u_byteswap_u_sse2((uint16_t*)intsToSwap, num_points);
      memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
-
  }
  #endif
  
  #ifdef LV_HAVE_SSE2
-static inline void volk_16u_byteswappuppet_16u_a_sse2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){
+static inline void volk_16u_byteswappuppet_16u_a_sse2(uint16_t* output,
+                                                      uint16_t* intsToSwap,
+                                                      unsigned int num_points)
+{
  
      volk_16u_byteswap_a_sse2((uint16_t*)intsToSwap, num_points);
      memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
-
  }
  #endif
  
  #ifdef LV_HAVE_AVX2
-static inline void volk_16u_byteswappuppet_16u_u_avx2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){
+static inline void volk_16u_byteswappuppet_16u_u_avx2(uint16_t* output,
+                                                      uint16_t* intsToSwap,
+                                                      unsigned int num_points)
+{
  
      volk_16u_byteswap_u_avx2((uint16_t*)intsToSwap, num_points);
      memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
-
  }
  #endif
  
  #ifdef LV_HAVE_AVX2
-static inline void volk_16u_byteswappuppet_16u_a_avx2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){
+static inline void volk_16u_byteswappuppet_16u_a_avx2(uint16_t* output,
+                                                      uint16_t* intsToSwap,
+                                                      unsigned int num_points)
+{
  
      volk_16u_byteswap_a_avx2((uint16_t*)intsToSwap, num_points);
      memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
-
  }
  #endif
  
diff --git a/kernels/volk/volk_32f_64f_add_64f.h b/kernels/volk/volk_32f_64f_add_64f.h

index 770c27ef49e5738abb53a6426b69727eaca68ec2..d00ada5d29d65b218307fbc113436b03d0145157 100644 (file)
--- a/kernels/volk/volk_32f_64f_add_64f.h
+++ b/kernels/volk/volk_32f_64f_add_64f.h
@@ -77,18 +77,19 @@
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void volk_32f_64f_add_64f_generic(double *cVector,
-                                                const float *aVector,
-                                                const double *bVector,
-                                                unsigned int num_points) {
-  double *cPtr = cVector;
-  const float *aPtr = aVector;
-  const double *bPtr = bVector;
-  unsigned int number = 0;
-
-  for (number = 0; number < num_points; number++) {
-    *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
-  }
+static inline void volk_32f_64f_add_64f_generic(double* cVector,
+                                                const float* aVector,
+                                                const double* bVector,
+                                                unsigned int num_points)
+{
+    double* cPtr = cVector;
+    const float* aPtr = aVector;
+    const double* bPtr = bVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
+    }
  }
  
  #endif /* LV_HAVE_GENERIC */
@@ -96,42 +97,43 @@ static inline void volk_32f_64f_add_64f_generic(double *cVector,
  #ifdef LV_HAVE_NEONV8
  #include <arm_neon.h>
  
-static inline void volk_32f_64f_add_64f_neon(double *cVector,
-                                             const float *aVector,
-                                             const double *bVector,
-                                             unsigned int num_points) {
-  unsigned int number = 0;
-  const unsigned int half_points = num_points / 2;
-
-  double *cPtr = cVector;
-  const float *aPtr = aVector;
-  const double *bPtr = bVector;
-
-  float64x2_t aVal, bVal, cVal;
-  float32x2_t aVal1;
-  for (number = 0; number < half_points; number++) {
-    // Load in to NEON registers
-    aVal1 = vld1_f32(aPtr);
-    bVal = vld1q_f64(bPtr);
-    __VOLK_PREFETCH(aPtr + 2);
-    __VOLK_PREFETCH(bPtr + 2);
-    aPtr += 2; // q uses quadwords, 4 floats per vadd
-    bPtr += 2;
-
-    // Vector conversion
-    aVal = vcvt_f64_f32(aVal1);
-    // vector add
-    cVal = vaddq_f64(aVal, bVal);
-    // Store the results back into the C container
-    vst1q_f64(cPtr, cVal);
-
-    cPtr += 2;
-  }
-
-  number = half_points * 2; // should be = num_points
-  for (; number < num_points; number++) {
-    *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
-  }
+static inline void volk_32f_64f_add_64f_neon(double* cVector,
+                                             const float* aVector,
+                                             const double* bVector,
+                                             unsigned int num_points)
+{
+    unsigned int number = 0;
+    const unsigned int half_points = num_points / 2;
+
+    double* cPtr = cVector;
+    const float* aPtr = aVector;
+    const double* bPtr = bVector;
+
+    float64x2_t aVal, bVal, cVal;
+    float32x2_t aVal1;
+    for (number = 0; number < half_points; number++) {
+        // Load in to NEON registers
+        aVal1 = vld1_f32(aPtr);
+        bVal = vld1q_f64(bPtr);
+        __VOLK_PREFETCH(aPtr + 2);
+        __VOLK_PREFETCH(bPtr + 2);
+        aPtr += 2; // q uses quadwords, 4 floats per vadd
+        bPtr += 2;
+
+        // Vector conversion
+        aVal = vcvt_f64_f32(aVal1);
+        // vector add
+        cVal = vaddq_f64(aVal, bVal);
+        // Store the results back into the C container
+        vst1q_f64(cPtr, cVal);
+
+        cPtr += 2;
+    }
+
+    number = half_points * 2; // should be = num_points
+    for (; number < num_points; number++) {
+        *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
+    }
  }
  
  #endif /* LV_HAVE_NEONV8 */
@@ -141,49 +143,50 @@ static inline void volk_32f_64f_add_64f_neon(double *cVector,
  #include <immintrin.h>
  #include <xmmintrin.h>
  
-static inline void volk_32f_64f_add_64f_u_avx(double *cVector,
-                                              const float *aVector,
-                                              const double *bVector,
-                                              unsigned int num_points) {
-  unsigned int number = 0;
-  const unsigned int eighth_points = num_points / 8;
-
-  double *cPtr = cVector;
-  const float *aPtr = aVector;
-  const double *bPtr = bVector;
-
-  __m256 aVal;
-  __m128 aVal1, aVal2;
-  __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
-  for (; number < eighth_points; number++) {
-
-    aVal = _mm256_loadu_ps(aPtr);
-    bVal1 = _mm256_loadu_pd(bPtr);
-    bVal2 = _mm256_loadu_pd(bPtr + 4);
-
-    aVal1 = _mm256_extractf128_ps(aVal, 0);
-    aVal2 = _mm256_extractf128_ps(aVal, 1);
-
-    aDbl1 = _mm256_cvtps_pd(aVal1);
-    aDbl2 = _mm256_cvtps_pd(aVal2);
-
-    cVal1 = _mm256_add_pd(aDbl1, bVal1);
-    cVal2 = _mm256_add_pd(aDbl2, bVal2);
-
-    _mm256_storeu_pd(cPtr,
-                     cVal1); // Store the results back into the C container
-    _mm256_storeu_pd(cPtr + 4,
-                     cVal2); // Store the results back into the C container
-
-    aPtr += 8;
-    bPtr += 8;
-    cPtr += 8;
-  }
-
-  number = eighth_points * 8;
-  for (; number < num_points; number++) {
-    *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
-  }
+static inline void volk_32f_64f_add_64f_u_avx(double* cVector,
+                                              const float* aVector,
+                                              const double* bVector,
+                                              unsigned int num_points)
+{
+    unsigned int number = 0;
+    const unsigned int eighth_points = num_points / 8;
+
+    double* cPtr = cVector;
+    const float* aPtr = aVector;
+    const double* bPtr = bVector;
+
+    __m256 aVal;
+    __m128 aVal1, aVal2;
+    __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
+    for (; number < eighth_points; number++) {
+
+        aVal = _mm256_loadu_ps(aPtr);
+        bVal1 = _mm256_loadu_pd(bPtr);
+        bVal2 = _mm256_loadu_pd(bPtr + 4);
+
+        aVal1 = _mm256_extractf128_ps(aVal, 0);
+        aVal2 = _mm256_extractf128_ps(aVal, 1);
+
+        aDbl1 = _mm256_cvtps_pd(aVal1);
+        aDbl2 = _mm256_cvtps_pd(aVal2);
+
+        cVal1 = _mm256_add_pd(aDbl1, bVal1);
+        cVal2 = _mm256_add_pd(aDbl2, bVal2);
+
+        _mm256_storeu_pd(cPtr,
+                         cVal1); // Store the results back into the C container
+        _mm256_storeu_pd(cPtr + 4,
+                         cVal2); // Store the results back into the C container
+
+        aPtr += 8;
+        bPtr += 8;
+        cPtr += 8;
+    }
+
+    number = eighth_points * 8;
+    for (; number < num_points; number++) {
+        *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX */
@@ -193,48 +196,49 @@ static inline void volk_32f_64f_add_64f_u_avx(double *cVector,
  #include <immintrin.h>
  #include <xmmintrin.h>
  
-static inline void volk_32f_64f_add_64f_a_avx(double *cVector,
-                                              const float *aVector,
-                                              const double *bVector,
-                                              unsigned int num_points) {
-  unsigned int number = 0;
-  const unsigned int eighth_points = num_points / 8;
-
-  double *cPtr = cVector;
-  const float *aPtr = aVector;
-  const double *bPtr = bVector;
-
-  __m256 aVal;
-  __m128 aVal1, aVal2;
-  __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
-  for (; number < eighth_points; number++) {
-
-    aVal = _mm256_load_ps(aPtr);
-    bVal1 = _mm256_load_pd(bPtr);
-    bVal2 = _mm256_load_pd(bPtr + 4);
-
-    aVal1 = _mm256_extractf128_ps(aVal, 0);
-    aVal2 = _mm256_extractf128_ps(aVal, 1);
-
-    aDbl1 = _mm256_cvtps_pd(aVal1);
-    aDbl2 = _mm256_cvtps_pd(aVal2);
-
-    cVal1 = _mm256_add_pd(aDbl1, bVal1);
-    cVal2 = _mm256_add_pd(aDbl2, bVal2);
-
-    _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container
-    _mm256_store_pd(cPtr + 4,
-                    cVal2); // Store the results back into the C container
-
-    aPtr += 8;
-    bPtr += 8;
-    cPtr += 8;
-  }
-
-  number = eighth_points * 8;
-  for (; number < num_points; number++) {
-    *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
-  }
+static inline void volk_32f_64f_add_64f_a_avx(double* cVector,
+                                              const float* aVector,
+                                              const double* bVector,
+                                              unsigned int num_points)
+{
+    unsigned int number = 0;
+    const unsigned int eighth_points = num_points / 8;
+
+    double* cPtr = cVector;
+    const float* aPtr = aVector;
+    const double* bPtr = bVector;
+
+    __m256 aVal;
+    __m128 aVal1, aVal2;
+    __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
+    for (; number < eighth_points; number++) {
+
+        aVal = _mm256_load_ps(aPtr);
+        bVal1 = _mm256_load_pd(bPtr);
+        bVal2 = _mm256_load_pd(bPtr + 4);
+
+        aVal1 = _mm256_extractf128_ps(aVal, 0);
+        aVal2 = _mm256_extractf128_ps(aVal, 1);
+
+        aDbl1 = _mm256_cvtps_pd(aVal1);
+        aDbl2 = _mm256_cvtps_pd(aVal2);
+
+        cVal1 = _mm256_add_pd(aDbl1, bVal1);
+        cVal2 = _mm256_add_pd(aDbl2, bVal2);
+
+        _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container
+        _mm256_store_pd(cPtr + 4,
+                        cVal2); // Store the results back into the C container
+
+        aPtr += 8;
+        bPtr += 8;
+        cPtr += 8;
+    }
+
+    number = eighth_points * 8;
+    for (; number < num_points; number++) {
+        *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX */
diff --git a/kernels/volk/volk_32f_64f_multiply_64f.h b/kernels/volk/volk_32f_64f_multiply_64f.h

index 50f08a17390bf695e973718e57b130e78b0077bf..10398505106b6d0a79874d857b738735eb65f38f 100644 (file)
--- a/kernels/volk/volk_32f_64f_multiply_64f.h
+++ b/kernels/volk/volk_32f_64f_multiply_64f.h
@@ -31,8 +31,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32f_64f_multiply_64f(double* cVector, const double* aVector, const double* bVector, unsigned int num_points)
- * \endcode
+ * void volk_32f_64f_multiply_64f(double* cVector, const double* aVector, const double*
+ * bVector, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li aVector: First input vector.
@@ -76,18 +76,19 @@
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32f_64f_multiply_64f_generic(double *cVector, const float *aVector,
-                                 const double *bVector, unsigned int num_points)
+static inline void volk_32f_64f_multiply_64f_generic(double* cVector,
+                                                     const float* aVector,
+                                                     const double* bVector,
+                                                     unsigned int num_points)
  {
-  double *cPtr = cVector;
-  const float *aPtr = aVector;
-  const double *bPtr = bVector;
-  unsigned int number = 0;
-
-  for (number = 0; number < num_points; number++) {
-    *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
-  }
+    double* cPtr = cVector;
+    const float* aPtr = aVector;
+    const double* bPtr = bVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
+    }
  }
  
  #endif /* LV_HAVE_GENERIC */
@@ -102,47 +103,48 @@ volk_32f_64f_multiply_64f_generic(double *cVector, const float *aVector,
  #include <immintrin.h>
  #include <xmmintrin.h>
  
-static inline void
-volk_32f_64f_multiply_64f_u_avx(double *cVector, const float *aVector,
-                               const double *bVector, unsigned int num_points)
+static inline void volk_32f_64f_multiply_64f_u_avx(double* cVector,
+                                                   const float* aVector,
+                                                   const double* bVector,
+                                                   unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighth_points = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighth_points = num_points / 8;
  
-  double *cPtr = cVector;
-  const float *aPtr = aVector;
-  const double *bPtr = bVector;
+    double* cPtr = cVector;
+    const float* aPtr = aVector;
+    const double* bPtr = bVector;
  
-  __m256 aVal;
-  __m128 aVal1, aVal2;
-  __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
-  for (; number < eighth_points; number++) {
+    __m256 aVal;
+    __m128 aVal1, aVal2;
+    __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
+    for (; number < eighth_points; number++) {
  
-    aVal = _mm256_loadu_ps(aPtr);
-    bVal1 = _mm256_loadu_pd(bPtr);
-    bVal2 = _mm256_loadu_pd(bPtr+4);
+        aVal = _mm256_loadu_ps(aPtr);
+        bVal1 = _mm256_loadu_pd(bPtr);
+        bVal2 = _mm256_loadu_pd(bPtr + 4);
  
-    aVal1 = _mm256_extractf128_ps(aVal, 0);
-    aVal2 = _mm256_extractf128_ps(aVal, 1);
+        aVal1 = _mm256_extractf128_ps(aVal, 0);
+        aVal2 = _mm256_extractf128_ps(aVal, 1);
  
-    aDbl1 = _mm256_cvtps_pd(aVal1);
-    aDbl2 = _mm256_cvtps_pd(aVal2);
+        aDbl1 = _mm256_cvtps_pd(aVal1);
+        aDbl2 = _mm256_cvtps_pd(aVal2);
  
-    cVal1 = _mm256_mul_pd(aDbl1, bVal1);
-    cVal2 = _mm256_mul_pd(aDbl2, bVal2);
+        cVal1 = _mm256_mul_pd(aDbl1, bVal1);
+        cVal2 = _mm256_mul_pd(aDbl2, bVal2);
  
-    _mm256_storeu_pd(cPtr, cVal1); // Store the results back into the C container
-    _mm256_storeu_pd(cPtr+4, cVal2); // Store the results back into the C container
+        _mm256_storeu_pd(cPtr, cVal1);     // Store the results back into the C container
+        _mm256_storeu_pd(cPtr + 4, cVal2); // Store the results back into the C container
  
-    aPtr += 8;
-    bPtr += 8;
-    cPtr += 8;
-  }
+        aPtr += 8;
+        bPtr += 8;
+        cPtr += 8;
+    }
  
-  number = eighth_points * 8;
-  for (; number < num_points; number++) {
-    *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
-  }
+    number = eighth_points * 8;
+    for (; number < num_points; number++) {
+        *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX */
@@ -153,51 +155,51 @@ volk_32f_64f_multiply_64f_u_avx(double *cVector, const float *aVector,
  #include <immintrin.h>
  #include <xmmintrin.h>
  
-static inline void
-volk_32f_64f_multiply_64f_a_avx(double *cVector, const float *aVector,
-                                const double *bVector, unsigned int num_points)
+static inline void volk_32f_64f_multiply_64f_a_avx(double* cVector,
+                                                   const float* aVector,
+                                                   const double* bVector,
+                                                   unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighth_points = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighth_points = num_points / 8;
  
-  double *cPtr = cVector;
-  const float *aPtr = aVector;
-  const double *bPtr = bVector;
+    double* cPtr = cVector;
+    const float* aPtr = aVector;
+    const double* bPtr = bVector;
  
-  __m256 aVal;
-  __m128 aVal1, aVal2;
-  __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
-  for (; number < eighth_points; number++) {
+    __m256 aVal;
+    __m128 aVal1, aVal2;
+    __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
+    for (; number < eighth_points; number++) {
  
-    aVal = _mm256_load_ps(aPtr);
-    bVal1 = _mm256_load_pd(bPtr);
-    bVal2 = _mm256_load_pd(bPtr+4);
+        aVal = _mm256_load_ps(aPtr);
+        bVal1 = _mm256_load_pd(bPtr);
+        bVal2 = _mm256_load_pd(bPtr + 4);
  
-    aVal1 = _mm256_extractf128_ps(aVal, 0);
-    aVal2 = _mm256_extractf128_ps(aVal, 1);
+        aVal1 = _mm256_extractf128_ps(aVal, 0);
+        aVal2 = _mm256_extractf128_ps(aVal, 1);
  
-    aDbl1 = _mm256_cvtps_pd(aVal1);
-    aDbl2 = _mm256_cvtps_pd(aVal2);
+        aDbl1 = _mm256_cvtps_pd(aVal1);
+        aDbl2 = _mm256_cvtps_pd(aVal2);
  
-    cVal1 = _mm256_mul_pd(aDbl1, bVal1);
-    cVal2 = _mm256_mul_pd(aDbl2, bVal2);
+        cVal1 = _mm256_mul_pd(aDbl1, bVal1);
+        cVal2 = _mm256_mul_pd(aDbl2, bVal2);
  
-    _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container
-    _mm256_store_pd(cPtr+4, cVal2); // Store the results back into the C container
+        _mm256_store_pd(cPtr, cVal1);     // Store the results back into the C container
+        _mm256_store_pd(cPtr + 4, cVal2); // Store the results back into the C container
  
-    aPtr += 8;
-    bPtr += 8;
-    cPtr += 8;
-  }
+        aPtr += 8;
+        bPtr += 8;
+        cPtr += 8;
+    }
  
-  number = eighth_points * 8;
-  for (; number < num_points; number++) {
-    *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
-  }
+    number = eighth_points * 8;
+    for (; number < num_points; number++) {
+        *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX */
  
  
-
  #endif /* INCLUDED_volk_32f_64f_multiply_64f_u_H */
diff --git a/kernels/volk/volk_32f_8u_polarbutterfly_32f.h b/kernels/volk/volk_32f_8u_polarbutterfly_32f.h

index 4aba6c4d49db178a66103f37dc217700745bd926..2198b33275f750b5a5cbf1908d4584c9a52384a2 100644 (file)
--- a/kernels/volk/volk_32f_8u_polarbutterfly_32f.h
+++ b/kernels/volk/volk_32f_8u_polarbutterfly_32f.h
@@ -51,14 +51,17 @@
   * int frame_exp = 10;
   * int frame_size = 0x01 << frame_exp;
   *
- * float* llrs = (float*) volk_malloc(sizeof(float) * frame_size * (frame_exp + 1), volk_get_alignment());
- * unsigned char* u = (unsigned char) volk_malloc(sizeof(unsigned char) * frame_size * (frame_exp + 1), volk_get_alignment());
+ * float* llrs = (float*) volk_malloc(sizeof(float) * frame_size * (frame_exp + 1),
+ * volk_get_alignment()); unsigned char* u = (unsigned char) volk_malloc(sizeof(unsigned
+ * char) * frame_size * (frame_exp + 1), volk_get_alignment());
   *
- *  {some_function_to_write_encoded_bits_to_float_llrs(llrs + frame_size * frame_exp, data)};
+ *  {some_function_to_write_encoded_bits_to_float_llrs(llrs + frame_size * frame_exp,
+ * data)};
   *
   * unsigned int u_num;
   * for(u_num = 0; u_num < frame_size; u_num++){
- *     volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_size, frame_exp, 0, u_num, u_num);
+ *     volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_size, frame_exp, 0, u_num,
+ * u_num);
   *     // next line could first search for frozen bit value and then do bit decision.
   *     u[u_num] = llrs[u_num] > 0 ? 0 : 1;
   * }
@@ -73,130 +76,131 @@
  #include <math.h>
  #include <volk/volk_8u_x2_encodeframepolar_8u.h>
  
-static inline float
-llr_odd(const float la, const float lb)
+static inline float llr_odd(const float la, const float lb)
  {
-  const float ala = fabsf(la);
-  const float alb = fabsf(lb);
-  return copysignf(1.0f, la) * copysignf(1.0f, lb) * (ala > alb ? alb : ala);
+    const float ala = fabsf(la);
+    const float alb = fabsf(lb);
+    return copysignf(1.0f, la) * copysignf(1.0f, lb) * (ala > alb ? alb : ala);
  }
  
-static inline void
-llr_odd_stages(float* llrs, int min_stage, const int depth, const int frame_size, const int row)
+static inline void llr_odd_stages(
+    float* llrs, int min_stage, const int depth, const int frame_size, const int row)
  {
-  int loop_stage = depth - 1;
-  float* dst_llr_ptr;
-  float* src_llr_ptr;
-  int stage_size = 0x01 << loop_stage;
-
-  int el;
-  while(min_stage <= loop_stage){
-    dst_llr_ptr = llrs + loop_stage * frame_size + row;
-    src_llr_ptr = dst_llr_ptr + frame_size;
-    for(el = 0; el < stage_size; el++){
-      *dst_llr_ptr++ = llr_odd(*src_llr_ptr, *(src_llr_ptr + 1));
-      src_llr_ptr += 2;
+    int loop_stage = depth - 1;
+    float* dst_llr_ptr;
+    float* src_llr_ptr;
+    int stage_size = 0x01 << loop_stage;
+
+    int el;
+    while (min_stage <= loop_stage) {
+        dst_llr_ptr = llrs + loop_stage * frame_size + row;
+        src_llr_ptr = dst_llr_ptr + frame_size;
+        for (el = 0; el < stage_size; el++) {
+            *dst_llr_ptr++ = llr_odd(*src_llr_ptr, *(src_llr_ptr + 1));
+            src_llr_ptr += 2;
+        }
+
+        --loop_stage;
+        stage_size >>= 1;
      }
-
-    --loop_stage;
-    stage_size >>= 1;
-  }
  }
  
-static inline float
-llr_even(const float la, const float lb, const unsigned char f)
+static inline float llr_even(const float la, const float lb, const unsigned char f)
  {
-  switch(f){
+    switch (f) {
      case 0:
-      return lb + la;
+        return lb + la;
      default:
-      return lb - la;
-  }
+        return lb - la;
+    }
  }
  
  static inline void
  even_u_values(unsigned char* u_even, const unsigned char* u, const int u_num)
  {
-  u++;
-  int i;
-  for(i = 1; i < u_num; i += 2){
-    *u_even++ = *u;
-    u += 2;
-  }
+    u++;
+    int i;
+    for (i = 1; i < u_num; i += 2) {
+        *u_even++ = *u;
+        u += 2;
+    }
  }
  
  static inline void
  odd_xor_even_values(unsigned char* u_xor, const unsigned char* u, const int u_num)
  {
-  int i;
-  for(i = 1; i < u_num; i += 2){
-    *u_xor++ = *u ^ *(u + 1);
-    u += 2;
-  }
+    int i;
+    for (i = 1; i < u_num; i += 2) {
+        *u_xor++ = *u ^ *(u + 1);
+        u += 2;
+    }
  }
  
-static inline int
-calculate_max_stage_depth_for_row(const int frame_exp, const int row)
+static inline int calculate_max_stage_depth_for_row(const int frame_exp, const int row)
  {
-  int max_stage_depth = 0;
-  int half_stage_size = 0x01;
-  int stage_size = half_stage_size << 1;
-  while(max_stage_depth < (frame_exp - 1)){ // last stage holds received values.
-    if(!(row % stage_size < half_stage_size)){
-      break;
+    int max_stage_depth = 0;
+    int half_stage_size = 0x01;
+    int stage_size = half_stage_size << 1;
+    while (max_stage_depth < (frame_exp - 1)) { // last stage holds received values.
+        if (!(row % stage_size < half_stage_size)) {
+            break;
+        }
+        half_stage_size <<= 1;
+        stage_size <<= 1;
+        max_stage_depth++;
      }
-    half_stage_size <<= 1;
-    stage_size <<= 1;
-    max_stage_depth++;
-  }
-  return max_stage_depth;
+    return max_stage_depth;
  }
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32f_8u_polarbutterfly_32f_generic(float* llrs, unsigned char* u,
-    const int frame_exp,
-    const int stage, const int u_num, const int row)
+static inline void volk_32f_8u_polarbutterfly_32f_generic(float* llrs,
+                                                          unsigned char* u,
+                                                          const int frame_exp,
+                                                          const int stage,
+                                                          const int u_num,
+                                                          const int row)
  {
-  const int frame_size = 0x01 << frame_exp;
-  const int next_stage = stage + 1;
+    const int frame_size = 0x01 << frame_exp;
+    const int next_stage = stage + 1;
  
-  const int half_stage_size = 0x01 << stage;
-  const int stage_size = half_stage_size << 1;
+    const int half_stage_size = 0x01 << stage;
+    const int stage_size = half_stage_size << 1;
  
-  const bool is_upper_stage_half = row % stage_size < half_stage_size;
+    const bool is_upper_stage_half = row % stage_size < half_stage_size;
  
-//      // this is a natural bit order impl
-  float* next_llrs = llrs + frame_size;// LLRs are stored in a consecutive array.
-  float* call_row_llr = llrs + row;
+    //      // this is a natural bit order impl
+    float* next_llrs = llrs + frame_size; // LLRs are stored in a consecutive array.
+    float* call_row_llr = llrs + row;
  
-  const int section = row - (row % stage_size);
-  const int jump_size = ((row % half_stage_size) << 1) % stage_size;
+    const int section = row - (row % stage_size);
+    const int jump_size = ((row % half_stage_size) << 1) % stage_size;
  
-  const int next_upper_row = section + jump_size;
-  const int next_lower_row = next_upper_row + 1;
+    const int next_upper_row = section + jump_size;
+    const int next_lower_row = next_upper_row + 1;
  
-  const float* upper_right_llr_ptr = next_llrs + next_upper_row;
-  const float* lower_right_llr_ptr = next_llrs + next_lower_row;
+    const float* upper_right_llr_ptr = next_llrs + next_upper_row;
+    const float* lower_right_llr_ptr = next_llrs + next_lower_row;
  
-  if(!is_upper_stage_half){
-    const int u_pos = u_num >> stage;
-    const unsigned char f = u[u_pos - 1];
-    *call_row_llr = llr_even(*upper_right_llr_ptr, *lower_right_llr_ptr, f);
-    return;
-  }
+    if (!is_upper_stage_half) {
+        const int u_pos = u_num >> stage;
+        const unsigned char f = u[u_pos - 1];
+        *call_row_llr = llr_even(*upper_right_llr_ptr, *lower_right_llr_ptr, f);
+        return;
+    }
  
-  if(frame_exp > next_stage){
-    unsigned char* u_half = u + frame_size;
-    odd_xor_even_values(u_half, u, u_num);
-    volk_32f_8u_polarbutterfly_32f_generic(next_llrs, u_half, frame_exp, next_stage, u_num, next_upper_row);
+    if (frame_exp > next_stage) {
+        unsigned char* u_half = u + frame_size;
+        odd_xor_even_values(u_half, u, u_num);
+        volk_32f_8u_polarbutterfly_32f_generic(
+            next_llrs, u_half, frame_exp, next_stage, u_num, next_upper_row);
  
-    even_u_values(u_half, u, u_num);
-    volk_32f_8u_polarbutterfly_32f_generic(next_llrs, u_half, frame_exp, next_stage, u_num, next_lower_row);
-  }
+        even_u_values(u_half, u, u_num);
+        volk_32f_8u_polarbutterfly_32f_generic(
+            next_llrs, u_half, frame_exp, next_stage, u_num, next_lower_row);
+    }
  
-  *call_row_llr = llr_odd(*upper_right_llr_ptr, *lower_right_llr_ptr);
+    *call_row_llr = llr_odd(*upper_right_llr_ptr, *lower_right_llr_ptr);
  }
  
  #endif /* LV_HAVE_GENERIC */
@@ -206,99 +210,99 @@ volk_32f_8u_polarbutterfly_32f_generic(float* llrs, unsigned char* u,
  #include <immintrin.h>
  #include <volk/volk_avx_intrinsics.h>
  
-static inline void
-volk_32f_8u_polarbutterfly_32f_u_avx(float* llrs, unsigned char* u,
-    const int frame_exp,
-    const int stage, const int u_num, const int row)
+static inline void volk_32f_8u_polarbutterfly_32f_u_avx(float* llrs,
+                                                        unsigned char* u,
+                                                        const int frame_exp,
+                                                        const int stage,
+                                                        const int u_num,
+                                                        const int row)
  {
-  const int frame_size = 0x01 << frame_exp;
-  if(row % 2){ // for odd rows just do the only necessary calculation and return.
-    const float* next_llrs = llrs + frame_size + row;
-    *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
-    return;
-  }
-
-  const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
-  if(max_stage_depth < 3){ // vectorized version needs larger vectors.
-    volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
-    return;
-  }
-
-  int loop_stage = max_stage_depth;
-  int stage_size = 0x01 << loop_stage;
-
-  float* src_llr_ptr;
-  float* dst_llr_ptr;
-
-  __m256 src0, src1, dst;
-
-  if(row){ // not necessary for ZERO row. == first bit to be decoded.
-    // first do bit combination for all stages
-    // effectively encode some decoded bits again.
-    unsigned char* u_target = u + frame_size;
-    unsigned char* u_temp = u + 2* frame_size;
-    memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);
-
-    if(stage_size > 15){
-      _mm256_zeroupper();
-      volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size);
+    const int frame_size = 0x01 << frame_exp;
+    if (row % 2) { // for odd rows just do the only necessary calculation and return.
+        const float* next_llrs = llrs + frame_size + row;
+        *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
+        return;
      }
-    else{
-      volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size);
+
+    const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
+    if (max_stage_depth < 3) { // vectorized version needs larger vectors.
+        volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
+        return;
      }
  
-    src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
-    dst_llr_ptr = llrs + max_stage_depth * frame_size + row;
+    int loop_stage = max_stage_depth;
+    int stage_size = 0x01 << loop_stage;
  
-    __m128i fbits;
+    float* src_llr_ptr;
+    float* dst_llr_ptr;
  
-    int p;
-    for(p = 0; p < stage_size; p += 8){
-      _mm256_zeroupper();
-      fbits = _mm_loadu_si128((__m128i*) u_target);
-      u_target += 8;
+    __m256 src0, src1, dst;
  
-      src0 = _mm256_loadu_ps(src_llr_ptr);
-      src1 = _mm256_loadu_ps(src_llr_ptr + 8);
-      src_llr_ptr += 16;
+    if (row) { // not necessary for ZERO row. == first bit to be decoded.
+        // first do bit combination for all stages
+        // effectively encode some decoded bits again.
+        unsigned char* u_target = u + frame_size;
+        unsigned char* u_temp = u + 2 * frame_size;
+        memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);
  
-      dst = _mm256_polar_fsign_add_llrs(src0, src1, fbits);
+        if (stage_size > 15) {
+            _mm256_zeroupper();
+            volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size);
+        } else {
+            volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size);
+        }
  
-      _mm256_storeu_ps(dst_llr_ptr, dst);
-      dst_llr_ptr += 8;
-    }
+        src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
+        dst_llr_ptr = llrs + max_stage_depth * frame_size + row;
  
-    --loop_stage;
-    stage_size >>= 1;
-  }
+        __m128i fbits;
  
-  const int min_stage = stage > 2 ? stage : 2;
+        int p;
+        for (p = 0; p < stage_size; p += 8) {
+            _mm256_zeroupper();
+            fbits = _mm_loadu_si128((__m128i*)u_target);
+            u_target += 8;
  
-  _mm256_zeroall(); // Important to clear cache!
+            src0 = _mm256_loadu_ps(src_llr_ptr);
+            src1 = _mm256_loadu_ps(src_llr_ptr + 8);
+            src_llr_ptr += 16;
  
-  int el;
-  while(min_stage < loop_stage){
-    dst_llr_ptr = llrs + loop_stage * frame_size + row;
-    src_llr_ptr = dst_llr_ptr + frame_size;
-    for(el = 0; el < stage_size; el += 8){
-      src0 = _mm256_loadu_ps(src_llr_ptr);
-      src_llr_ptr += 8;
-      src1 = _mm256_loadu_ps(src_llr_ptr);
-      src_llr_ptr += 8;
+            dst = _mm256_polar_fsign_add_llrs(src0, src1, fbits);
  
-      dst = _mm256_polar_minsum_llrs(src0, src1);
+            _mm256_storeu_ps(dst_llr_ptr, dst);
+            dst_llr_ptr += 8;
+        }
  
-      _mm256_storeu_ps(dst_llr_ptr, dst);
-      dst_llr_ptr += 8;
+        --loop_stage;
+        stage_size >>= 1;
      }
  
-    --loop_stage;
-    stage_size >>= 1;
+    const int min_stage = stage > 2 ? stage : 2;
+
+    _mm256_zeroall(); // Important to clear cache!
  
-  }
+    int el;
+    while (min_stage < loop_stage) {
+        dst_llr_ptr = llrs + loop_stage * frame_size + row;
+        src_llr_ptr = dst_llr_ptr + frame_size;
+        for (el = 0; el < stage_size; el += 8) {
+            src0 = _mm256_loadu_ps(src_llr_ptr);
+            src_llr_ptr += 8;
+            src1 = _mm256_loadu_ps(src_llr_ptr);
+            src_llr_ptr += 8;
  
-  // for stages < 3 vectors are too small!.
-  llr_odd_stages(llrs, stage, loop_stage + 1,frame_size, row);
+            dst = _mm256_polar_minsum_llrs(src0, src1);
+
+            _mm256_storeu_ps(dst_llr_ptr, dst);
+            dst_llr_ptr += 8;
+        }
+
+        --loop_stage;
+        stage_size >>= 1;
+    }
+
+    // for stages < 3 vectors are too small!.
+    llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row);
  }
  
  #endif /* LV_HAVE_AVX */
@@ -307,99 +311,99 @@ volk_32f_8u_polarbutterfly_32f_u_avx(float* llrs, unsigned char* u,
  #include <immintrin.h>
  #include <volk/volk_avx2_intrinsics.h>
  
-static inline void
-volk_32f_8u_polarbutterfly_32f_u_avx2(float* llrs, unsigned char* u,
-    const int frame_exp,
-    const int stage, const int u_num, const int row)
+static inline void volk_32f_8u_polarbutterfly_32f_u_avx2(float* llrs,
+                                                         unsigned char* u,
+                                                         const int frame_exp,
+                                                         const int stage,
+                                                         const int u_num,
+                                                         const int row)
  {
-  const int frame_size = 0x01 << frame_exp;
-  if(row % 2){ // for odd rows just do the only necessary calculation and return.
-    const float* next_llrs = llrs + frame_size + row;
-    *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
-    return;
-  }
-
-  const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
-  if(max_stage_depth < 3){ // vectorized version needs larger vectors.
-    volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
-    return;
-  }
-
-  int loop_stage = max_stage_depth;
-  int stage_size = 0x01 << loop_stage;
-
-  float* src_llr_ptr;
-  float* dst_llr_ptr;
-
-  __m256 src0, src1, dst;
-
-  if(row){ // not necessary for ZERO row. == first bit to be decoded.
-    // first do bit combination for all stages
-    // effectively encode some decoded bits again.
-    unsigned char* u_target = u + frame_size;
-    unsigned char* u_temp = u + 2* frame_size;
-    memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);
-
-    if(stage_size > 15){
-      _mm256_zeroupper();
-      volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size);
+    const int frame_size = 0x01 << frame_exp;
+    if (row % 2) { // for odd rows just do the only necessary calculation and return.
+        const float* next_llrs = llrs + frame_size + row;
+        *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
+        return;
      }
-    else{
-      volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size);
+
+    const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
+    if (max_stage_depth < 3) { // vectorized version needs larger vectors.
+        volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
+        return;
      }
  
-    src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
-    dst_llr_ptr = llrs + max_stage_depth * frame_size + row;
+    int loop_stage = max_stage_depth;
+    int stage_size = 0x01 << loop_stage;
  
-    __m128i fbits;
+    float* src_llr_ptr;
+    float* dst_llr_ptr;
  
-    int p;
-    for(p = 0; p < stage_size; p += 8){
-      _mm256_zeroupper();
-      fbits = _mm_loadu_si128((__m128i*) u_target);
-      u_target += 8;
+    __m256 src0, src1, dst;
  
-      src0 = _mm256_loadu_ps(src_llr_ptr);
-      src1 = _mm256_loadu_ps(src_llr_ptr + 8);
-      src_llr_ptr += 16;
+    if (row) { // not necessary for ZERO row. == first bit to be decoded.
+        // first do bit combination for all stages
+        // effectively encode some decoded bits again.
+        unsigned char* u_target = u + frame_size;
+        unsigned char* u_temp = u + 2 * frame_size;
+        memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);
  
-      dst = _mm256_polar_fsign_add_llrs_avx2(src0, src1, fbits);
+        if (stage_size > 15) {
+            _mm256_zeroupper();
+            volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size);
+        } else {
+            volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size);
+        }
  
-      _mm256_storeu_ps(dst_llr_ptr, dst);
-      dst_llr_ptr += 8;
-    }
+        src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
+        dst_llr_ptr = llrs + max_stage_depth * frame_size + row;
  
-    --loop_stage;
-    stage_size >>= 1;
-  }
+        __m128i fbits;
  
-  const int min_stage = stage > 2 ? stage : 2;
+        int p;
+        for (p = 0; p < stage_size; p += 8) {
+            _mm256_zeroupper();
+            fbits = _mm_loadu_si128((__m128i*)u_target);
+            u_target += 8;
  
-  _mm256_zeroall(); // Important to clear cache!
+            src0 = _mm256_loadu_ps(src_llr_ptr);
+            src1 = _mm256_loadu_ps(src_llr_ptr + 8);
+            src_llr_ptr += 16;
  
-  int el;
-  while(min_stage < loop_stage){
-    dst_llr_ptr = llrs + loop_stage * frame_size + row;
-    src_llr_ptr = dst_llr_ptr + frame_size;
-    for(el = 0; el < stage_size; el += 8){
-      src0 = _mm256_loadu_ps(src_llr_ptr);
-      src_llr_ptr += 8;
-      src1 = _mm256_loadu_ps(src_llr_ptr);
-      src_llr_ptr += 8;
+            dst = _mm256_polar_fsign_add_llrs_avx2(src0, src1, fbits);
  
-      dst = _mm256_polar_minsum_llrs(src0, src1);
+            _mm256_storeu_ps(dst_llr_ptr, dst);
+            dst_llr_ptr += 8;
+        }
  
-      _mm256_storeu_ps(dst_llr_ptr, dst);
-      dst_llr_ptr += 8;
+        --loop_stage;
+        stage_size >>= 1;
      }
  
-    --loop_stage;
-    stage_size >>= 1;
+    const int min_stage = stage > 2 ? stage : 2;
+
+    _mm256_zeroall(); // Important to clear cache!
+
+    int el;
+    while (min_stage < loop_stage) {
+        dst_llr_ptr = llrs + loop_stage * frame_size + row;
+        src_llr_ptr = dst_llr_ptr + frame_size;
+        for (el = 0; el < stage_size; el += 8) {
+            src0 = _mm256_loadu_ps(src_llr_ptr);
+            src_llr_ptr += 8;
+            src1 = _mm256_loadu_ps(src_llr_ptr);
+            src_llr_ptr += 8;
  
-  }
+            dst = _mm256_polar_minsum_llrs(src0, src1);
+
+            _mm256_storeu_ps(dst_llr_ptr, dst);
+            dst_llr_ptr += 8;
+        }
+
+        --loop_stage;
+        stage_size >>= 1;
+    }
  
-  // for stages < 3 vectors are too small!.
-  llr_odd_stages(llrs, stage, loop_stage + 1,frame_size, row);
+    // for stages < 3 vectors are too small!.
+    llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row);
  }
  
  #endif /* LV_HAVE_AVX2 */
diff --git a/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h b/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h

index fa40a86877050a9f681181fdc3c17a1d284ba28b..6f97dd1fcfc7532fb803bc8b8f3f3e3145fe024a 100644 (file)
--- a/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h
+++ b/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h
@@ -33,124 +33,129 @@
  #include <volk/volk_8u_x3_encodepolarpuppet_8u.h>
  
  
-static inline void
-sanitize_bytes(unsigned char* u, const int elements)
+static inline void sanitize_bytes(unsigned char* u, const int elements)
  {
-  int i;
-  unsigned char* u_ptr = u;
-  for(i = 0; i < elements; i++){
-    *u_ptr = (*u_ptr & 0x01);
-    u_ptr++;
-  }
+    int i;
+    unsigned char* u_ptr = u;
+    for (i = 0; i < elements; i++) {
+        *u_ptr = (*u_ptr & 0x01);
+        u_ptr++;
+    }
  }
  
-static inline void
-clean_up_intermediate_values(float* llrs, unsigned char* u, const int frame_size, const int elements)
+static inline void clean_up_intermediate_values(float* llrs,
+                                                unsigned char* u,
+                                                const int frame_size,
+                                                const int elements)
  {
-  memset(u + frame_size, 0, sizeof(unsigned char) * (elements - frame_size));
-  memset(llrs + frame_size, 0, sizeof(float) * (elements - frame_size));
+    memset(u + frame_size, 0, sizeof(unsigned char) * (elements - frame_size));
+    memset(llrs + frame_size, 0, sizeof(float) * (elements - frame_size));
  }
  
  static inline void
  generate_error_free_input_vector(float* llrs, unsigned char* u, const int frame_size)
  {
-  memset(u, 0, frame_size);
-  unsigned char* target = u + frame_size;
-  volk_8u_x2_encodeframepolar_8u_generic(target, u + 2 * frame_size, frame_size);
-  float* ft = llrs;
-  int i;
-  for(i = 0; i < frame_size; i++){
-    *ft = (-2 * ((float) *target++)) + 1.0f;
-    ft++;
-  }
+    memset(u, 0, frame_size);
+    unsigned char* target = u + frame_size;
+    volk_8u_x2_encodeframepolar_8u_generic(target, u + 2 * frame_size, frame_size);
+    float* ft = llrs;
+    int i;
+    for (i = 0; i < frame_size; i++) {
+        *ft = (-2 * ((float)*target++)) + 1.0f;
+        ft++;
+    }
  }
  
  static inline void
  print_llr_tree(const float* llrs, const int frame_size, const int frame_exp)
  {
-  int s, e;
-  for(s = 0; s < frame_size; s++){
-    for(e = 0; e < frame_exp + 1; e++){
-      printf("%+4.2f ", llrs[e * frame_size + s]);
-    }
-    printf("\n");
-    if((s + 1) % 8 == 0){
-      printf("\n");
+    int s, e;
+    for (s = 0; s < frame_size; s++) {
+        for (e = 0; e < frame_exp + 1; e++) {
+            printf("%+4.2f ", llrs[e * frame_size + s]);
+        }
+        printf("\n");
+        if ((s + 1) % 8 == 0) {
+            printf("\n");
+        }
      }
-  }
  }
  
-static inline int
-maximum_frame_size(const int elements)
+static inline int maximum_frame_size(const int elements)
  {
-  unsigned int frame_size = next_lower_power_of_two(elements);
-  unsigned int frame_exp = log2_of_power_of_2(frame_size);
-  return next_lower_power_of_two(frame_size / frame_exp);
+    unsigned int frame_size = next_lower_power_of_two(elements);
+    unsigned int frame_exp = log2_of_power_of_2(frame_size);
+    return next_lower_power_of_two(frame_size / frame_exp);
  }
  
  #ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_8u_polarbutterflypuppet_32f_generic(float* llrs, const float* input, unsigned char* u, const int elements)
+static inline void volk_32f_8u_polarbutterflypuppet_32f_generic(float* llrs,
+                                                                const float* input,
+                                                                unsigned char* u,
+                                                                const int elements)
  {
-  unsigned int frame_size = maximum_frame_size(elements);
-  unsigned int frame_exp = log2_of_power_of_2(frame_size);
+    unsigned int frame_size = maximum_frame_size(elements);
+    unsigned int frame_exp = log2_of_power_of_2(frame_size);
  
-  sanitize_bytes(u, elements);
-  clean_up_intermediate_values(llrs, u, frame_size, elements);
-  generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size);
+    sanitize_bytes(u, elements);
+    clean_up_intermediate_values(llrs, u, frame_size, elements);
+    generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size);
  
-  unsigned int u_num = 0;
-  for(; u_num < frame_size; u_num++){
-    volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, 0, u_num, u_num);
-    u[u_num] = llrs[u_num] > 0 ? 0 : 1;
-  }
+    unsigned int u_num = 0;
+    for (; u_num < frame_size; u_num++) {
+        volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, 0, u_num, u_num);
+        u[u_num] = llrs[u_num] > 0 ? 0 : 1;
+    }
  
-  clean_up_intermediate_values(llrs, u, frame_size, elements);
+    clean_up_intermediate_values(llrs, u, frame_size, elements);
  }
  #endif /* LV_HAVE_GENERIC */
  
  #ifdef LV_HAVE_AVX
-static inline void
-volk_32f_8u_polarbutterflypuppet_32f_u_avx(float* llrs, const float* input, unsigned char* u, const int elements)
+static inline void volk_32f_8u_polarbutterflypuppet_32f_u_avx(float* llrs,
+                                                              const float* input,
+                                                              unsigned char* u,
+                                                              const int elements)
  {
-  unsigned int frame_size = maximum_frame_size(elements);
-  unsigned int frame_exp = log2_of_power_of_2(frame_size);
+    unsigned int frame_size = maximum_frame_size(elements);
+    unsigned int frame_exp = log2_of_power_of_2(frame_size);
  
-  sanitize_bytes(u, elements);
-  clean_up_intermediate_values(llrs, u, frame_size, elements);
-  generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size);
+    sanitize_bytes(u, elements);
+    clean_up_intermediate_values(llrs, u, frame_size, elements);
+    generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size);
  
-  unsigned int u_num = 0;
-  for(; u_num < frame_size; u_num++){
-    volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_exp, 0, u_num, u_num);
-    u[u_num] = llrs[u_num] > 0 ? 0 : 1;
-  }
+    unsigned int u_num = 0;
+    for (; u_num < frame_size; u_num++) {
+        volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_exp, 0, u_num, u_num);
+        u[u_num] = llrs[u_num] > 0 ? 0 : 1;
+    }
  
-  clean_up_intermediate_values(llrs, u, frame_size, elements);
+    clean_up_intermediate_values(llrs, u, frame_size, elements);
  }
  #endif /* LV_HAVE_AVX */
  
  #ifdef LV_HAVE_AVX2
-static inline void
-volk_32f_8u_polarbutterflypuppet_32f_u_avx2(float* llrs, const float* input, unsigned char* u, const int elements)
+static inline void volk_32f_8u_polarbutterflypuppet_32f_u_avx2(float* llrs,
+                                                               const float* input,
+                                                               unsigned char* u,
+                                                               const int elements)
  {
-  unsigned int frame_size = maximum_frame_size(elements);
-  unsigned int frame_exp = log2_of_power_of_2(frame_size);
+    unsigned int frame_size = maximum_frame_size(elements);
+    unsigned int frame_exp = log2_of_power_of_2(frame_size);
  
-  sanitize_bytes(u, elements);
-  clean_up_intermediate_values(llrs, u, frame_size, elements);
-  generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size);
+    sanitize_bytes(u, elements);
+    clean_up_intermediate_values(llrs, u, frame_size, elements);
+    generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size);
  
-  unsigned int u_num = 0;
-  for(; u_num < frame_size; u_num++){
-    volk_32f_8u_polarbutterfly_32f_u_avx2(llrs, u, frame_exp, 0, u_num, u_num);
-    u[u_num] = llrs[u_num] > 0 ? 0 : 1;
-  }
+    unsigned int u_num = 0;
+    for (; u_num < frame_size; u_num++) {
+        volk_32f_8u_polarbutterfly_32f_u_avx2(llrs, u, frame_exp, 0, u_num, u_num);
+        u[u_num] = llrs[u_num] > 0 ? 0 : 1;
+    }
  
-  clean_up_intermediate_values(llrs, u, frame_size, elements);
+    clean_up_intermediate_values(llrs, u, frame_size, elements);
  }
  #endif /* LV_HAVE_AVX2 */
  
  
-
  #endif /* VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLYPUPPET_32F_H_ */
diff --git a/kernels/volk/volk_32f_accumulator_s32f.h b/kernels/volk/volk_32f_accumulator_s32f.h

index f6219c89d7d9e82d47c1ddf732cb8b090af755d0..9a78f58b5cb4a4f408e2e3ff5790489ec4221e28 100644 (file)
--- a/kernels/volk/volk_32f_accumulator_s32f.h
+++ b/kernels/volk/volk_32f_accumulator_s32f.h
@@ -29,8 +29,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32f_accumulator_s32f(float* result, const float* inputBuffer, unsigned int num_points)
- * \endcode
+ * void volk_32f_accumulator_s32f(float* result, const float* inputBuffer, unsigned int
+ * num_points) \endcode
   *
   * \b Inputs
   * \li inputBuffer The buffer of data to be accumulated
@@ -63,47 +63,48 @@
  #ifndef INCLUDED_volk_32f_accumulator_s32f_a_H
  #define INCLUDED_volk_32f_accumulator_s32f_a_H
  
-#include <volk/volk_common.h>
  #include <inttypes.h>
+#include <volk/volk_common.h>
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32f_accumulator_s32f_a_avx(float* result, const float* inputBuffer, unsigned int num_points)
+static inline void volk_32f_accumulator_s32f_a_avx(float* result,
+                                                   const float* inputBuffer,
+                                                   unsigned int num_points)
  {
-  float returnValue = 0;
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-
-  const float* aPtr = inputBuffer;
-  __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
-
-  __m256 accumulator = _mm256_setzero_ps();
-  __m256 aVal = _mm256_setzero_ps();
-
-  for(;number < eighthPoints; number++){
-    aVal = _mm256_load_ps(aPtr);
-    accumulator = _mm256_add_ps(accumulator, aVal);
-    aPtr += 8;
-  }
-
-  _mm256_store_ps(tempBuffer, accumulator);
-
-  returnValue = tempBuffer[0];
-  returnValue += tempBuffer[1];
-  returnValue += tempBuffer[2];
-  returnValue += tempBuffer[3];
-  returnValue += tempBuffer[4];
-  returnValue += tempBuffer[5];
-  returnValue += tempBuffer[6];
-  returnValue += tempBuffer[7];
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    returnValue += (*aPtr++);
-  }
-  *result = returnValue;
+    float returnValue = 0;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    const float* aPtr = inputBuffer;
+    __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
+
+    __m256 accumulator = _mm256_setzero_ps();
+    __m256 aVal = _mm256_setzero_ps();
+
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_load_ps(aPtr);
+        accumulator = _mm256_add_ps(accumulator, aVal);
+        aPtr += 8;
+    }
+
+    _mm256_store_ps(tempBuffer, accumulator);
+
+    returnValue = tempBuffer[0];
+    returnValue += tempBuffer[1];
+    returnValue += tempBuffer[2];
+    returnValue += tempBuffer[3];
+    returnValue += tempBuffer[4];
+    returnValue += tempBuffer[5];
+    returnValue += tempBuffer[6];
+    returnValue += tempBuffer[7];
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        returnValue += (*aPtr++);
+    }
+    *result = returnValue;
  }
  #endif /* LV_HAVE_AVX */
  
@@ -111,41 +112,42 @@ volk_32f_accumulator_s32f_a_avx(float* result, const float* inputBuffer, unsigne
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32f_accumulator_s32f_u_avx(float* result, const float* inputBuffer, unsigned int num_points)
+static inline void volk_32f_accumulator_s32f_u_avx(float* result,
+                                                   const float* inputBuffer,
+                                                   unsigned int num_points)
  {
-  float returnValue = 0;
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-
-  const float* aPtr = inputBuffer;
-  __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
-
-  __m256 accumulator = _mm256_setzero_ps();
-  __m256 aVal = _mm256_setzero_ps();
-
-  for(;number < eighthPoints; number++){
-    aVal = _mm256_loadu_ps(aPtr);
-    accumulator = _mm256_add_ps(accumulator, aVal);
-    aPtr += 8;
-  }
-
-  _mm256_store_ps(tempBuffer, accumulator);
-
-  returnValue = tempBuffer[0];
-  returnValue += tempBuffer[1];
-  returnValue += tempBuffer[2];
-  returnValue += tempBuffer[3];
-  returnValue += tempBuffer[4];
-  returnValue += tempBuffer[5];
-  returnValue += tempBuffer[6];
-  returnValue += tempBuffer[7];
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    returnValue += (*aPtr++);
-  }
-  *result = returnValue;
+    float returnValue = 0;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    const float* aPtr = inputBuffer;
+    __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
+
+    __m256 accumulator = _mm256_setzero_ps();
+    __m256 aVal = _mm256_setzero_ps();
+
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_loadu_ps(aPtr);
+        accumulator = _mm256_add_ps(accumulator, aVal);
+        aPtr += 8;
+    }
+
+    _mm256_store_ps(tempBuffer, accumulator);
+
+    returnValue = tempBuffer[0];
+    returnValue += tempBuffer[1];
+    returnValue += tempBuffer[2];
+    returnValue += tempBuffer[3];
+    returnValue += tempBuffer[4];
+    returnValue += tempBuffer[5];
+    returnValue += tempBuffer[6];
+    returnValue += tempBuffer[7];
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        returnValue += (*aPtr++);
+    }
+    *result = returnValue;
  }
  #endif /* LV_HAVE_AVX */
  
@@ -153,37 +155,38 @@ volk_32f_accumulator_s32f_u_avx(float* result, const float* inputBuffer, unsigne
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_32f_accumulator_s32f_a_sse(float* result, const float* inputBuffer, unsigned int num_points)
+static inline void volk_32f_accumulator_s32f_a_sse(float* result,
+                                                   const float* inputBuffer,
+                                                   unsigned int num_points)
  {
-  float returnValue = 0;
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* aPtr = inputBuffer;
-  __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
-
-  __m128 accumulator = _mm_setzero_ps();
-  __m128 aVal = _mm_setzero_ps();
-
-  for(;number < quarterPoints; number++){
-    aVal = _mm_load_ps(aPtr);
-    accumulator = _mm_add_ps(accumulator, aVal);
-    aPtr += 4;
-  }
-
-  _mm_store_ps(tempBuffer,accumulator);
-
-  returnValue = tempBuffer[0];
-  returnValue += tempBuffer[1];
-  returnValue += tempBuffer[2];
-  returnValue += tempBuffer[3];
-
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    returnValue += (*aPtr++);
-  }
-  *result = returnValue;
+    float returnValue = 0;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    const float* aPtr = inputBuffer;
+    __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
+
+    __m128 accumulator = _mm_setzero_ps();
+    __m128 aVal = _mm_setzero_ps();
+
+    for (; number < quarterPoints; number++) {
+        aVal = _mm_load_ps(aPtr);
+        accumulator = _mm_add_ps(accumulator, aVal);
+        aPtr += 4;
+    }
+
+    _mm_store_ps(tempBuffer, accumulator);
+
+    returnValue = tempBuffer[0];
+    returnValue += tempBuffer[1];
+    returnValue += tempBuffer[2];
+    returnValue += tempBuffer[3];
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        returnValue += (*aPtr++);
+    }
+    *result = returnValue;
  }
  #endif /* LV_HAVE_SSE */
  
@@ -191,52 +194,54 @@ volk_32f_accumulator_s32f_a_sse(float* result, const float* inputBuffer, unsigne
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_32f_accumulator_s32f_u_sse(float* result, const float* inputBuffer, unsigned int num_points)
+static inline void volk_32f_accumulator_s32f_u_sse(float* result,
+                                                   const float* inputBuffer,
+                                                   unsigned int num_points)
  {
-  float returnValue = 0;
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* aPtr = inputBuffer;
-  __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
-
-  __m128 accumulator = _mm_setzero_ps();
-  __m128 aVal = _mm_setzero_ps();
-
-  for(;number < quarterPoints; number++){
-    aVal = _mm_load_ps(aPtr);
-    accumulator = _mm_add_ps(accumulator, aVal);
-    aPtr += 4;
-  }
-
-  _mm_store_ps(tempBuffer,accumulator);
-
-  returnValue = tempBuffer[0];
-  returnValue += tempBuffer[1];
-  returnValue += tempBuffer[2];
-  returnValue += tempBuffer[3];
-
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    returnValue += (*aPtr++);
-  }
-  *result = returnValue;
+    float returnValue = 0;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    const float* aPtr = inputBuffer;
+    __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
+
+    __m128 accumulator = _mm_setzero_ps();
+    __m128 aVal = _mm_setzero_ps();
+
+    for (; number < quarterPoints; number++) {
+        aVal = _mm_load_ps(aPtr);
+        accumulator = _mm_add_ps(accumulator, aVal);
+        aPtr += 4;
+    }
+
+    _mm_store_ps(tempBuffer, accumulator);
+
+    returnValue = tempBuffer[0];
+    returnValue += tempBuffer[1];
+    returnValue += tempBuffer[2];
+    returnValue += tempBuffer[3];
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        returnValue += (*aPtr++);
+    }
+    *result = returnValue;
  }
  #endif /* LV_HAVE_SSE */
  
  #ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_accumulator_s32f_generic(float* result, const float* inputBuffer, unsigned int num_points)
+static inline void volk_32f_accumulator_s32f_generic(float* result,
+                                                     const float* inputBuffer,
+                                                     unsigned int num_points)
  {
-  const float* aPtr = inputBuffer;
-  unsigned int number = 0;
-  float returnValue = 0;
-
-  for(;number < num_points; number++){
-    returnValue += (*aPtr++);
-  }
-  *result = returnValue;
+    const float* aPtr = inputBuffer;
+    unsigned int number = 0;
+    float returnValue = 0;
+
+    for (; number < num_points; number++) {
+        returnValue += (*aPtr++);
+    }
+    *result = returnValue;
  }
  #endif /* LV_HAVE_GENERIC */
  
diff --git a/kernels/volk/volk_32f_acos_32f.h b/kernels/volk/volk_32f_acos_32f.h

index 5c14c2f628ab152bb284db12c7adb39a97a2952b..92918cab52fe6528b94e208c5e7e5b2a8ef5db1a 100644 (file)
--- a/kernels/volk/volk_32f_acos_32f.h
+++ b/kernels/volk/volk_32f_acos_32f.h
@@ -67,11 +67,12 @@
   * \endcode
   */
  
-#include <stdio.h>
-#include <math.h>
  #include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
  
-/* This is the number of terms of Taylor series to evaluate, increase this for more accuracy*/
+/* This is the number of terms of Taylor series to evaluate, increase this for more
+ * accuracy*/
  #define ACOS_TERMS 2
  
  #ifndef INCLUDED_volk_32f_acos_32f_a_H
@@ -80,62 +81,68 @@
  #if LV_HAVE_AVX2 && LV_HAVE_FMA
  #include <immintrin.h>
  
-static inline void
-volk_32f_acos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_acos_32f_a_avx2_fma(float* bVector,
+                                                const float* aVector,
+                                                unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int eighthPoints = num_points / 8;
-  int i, j;
-
-  __m256 aVal, d, pi, pio2, x, y, z, arccosine;
-  __m256 fzeroes, fones, ftwos, ffours, condition;
-
-  pi = _mm256_set1_ps(3.14159265358979323846);
-  pio2 = _mm256_set1_ps(3.14159265358979323846/2);
-  fzeroes = _mm256_setzero_ps();
-  fones = _mm256_set1_ps(1.0);
-  ftwos = _mm256_set1_ps(2.0);
-  ffours = _mm256_set1_ps(4.0);
-
-  for(;number < eighthPoints; number++){
-    aVal = _mm256_load_ps(aPtr);
-    d = aVal;
-    aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal);
-    z = aVal;
-    condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
-    z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
-    condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
-    x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
-    for(i = 0; i < 2; i++)
-      x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x,fones)));
-    x = _mm256_div_ps(fones, x);
-    y = fzeroes;
-    for(j = ACOS_TERMS - 1; j >=0 ; j--)
-      y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
-
-    y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
-    condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
-
-    y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
-    arccosine = y;
-    condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
-    arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
-    condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
-    arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
-
-    _mm256_store_ps(bPtr, arccosine);
-    aPtr += 8;
-    bPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *bPtr++ = acos(*aPtr++);
-  }
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int eighthPoints = num_points / 8;
+    int i, j;
+
+    __m256 aVal, d, pi, pio2, x, y, z, arccosine;
+    __m256 fzeroes, fones, ftwos, ffours, condition;
+
+    pi = _mm256_set1_ps(3.14159265358979323846);
+    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+    fzeroes = _mm256_setzero_ps();
+    fones = _mm256_set1_ps(1.0);
+    ftwos = _mm256_set1_ps(2.0);
+    ffours = _mm256_set1_ps(4.0);
+
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_load_ps(aPtr);
+        d = aVal;
+        aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
+                                                          _mm256_sub_ps(fones, aVal))),
+                             aVal);
+        z = aVal;
+        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+        x = _mm256_add_ps(
+            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+        for (i = 0; i < 2; i++)
+            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
+        x = _mm256_div_ps(fones, x);
+        y = fzeroes;
+        for (j = ACOS_TERMS - 1; j >= 0; j--)
+            y = _mm256_fmadd_ps(
+                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+
+        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
+        arccosine = y;
+        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+        arccosine = _mm256_sub_ps(
+            arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
+        condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
+        arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
+
+        _mm256_store_ps(bPtr, arccosine);
+        aPtr += 8;
+        bPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *bPtr++ = acos(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
@@ -147,59 +154,66 @@ volk_32f_acos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int
  static inline void
  volk_32f_acos_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int eighthPoints = num_points / 8;
-  int i, j;
-
-  __m256 aVal, d, pi, pio2, x, y, z, arccosine;
-  __m256 fzeroes, fones, ftwos, ffours, condition;
-
-  pi = _mm256_set1_ps(3.14159265358979323846);
-  pio2 = _mm256_set1_ps(3.14159265358979323846/2);
-  fzeroes = _mm256_setzero_ps();
-  fones = _mm256_set1_ps(1.0);
-  ftwos = _mm256_set1_ps(2.0);
-  ffours = _mm256_set1_ps(4.0);
-
-  for(;number < eighthPoints; number++){
-    aVal = _mm256_load_ps(aPtr);
-    d = aVal;
-    aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal);
-    z = aVal;
-    condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
-    z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
-    condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
-    x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
-    for(i = 0; i < 2; i++)
-      x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
-    x = _mm256_div_ps(fones, x);
-    y = fzeroes;
-    for(j = ACOS_TERMS - 1; j >=0 ; j--)
-      y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
-
-    y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
-    condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
-
-    y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
-    arccosine = y;
-    condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
-    arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
-    condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
-    arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
-
-    _mm256_store_ps(bPtr, arccosine);
-    aPtr += 8;
-    bPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *bPtr++ = acos(*aPtr++);
-  }
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int eighthPoints = num_points / 8;
+    int i, j;
+
+    __m256 aVal, d, pi, pio2, x, y, z, arccosine;
+    __m256 fzeroes, fones, ftwos, ffours, condition;
+
+    pi = _mm256_set1_ps(3.14159265358979323846);
+    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+    fzeroes = _mm256_setzero_ps();
+    fones = _mm256_set1_ps(1.0);
+    ftwos = _mm256_set1_ps(2.0);
+    ffours = _mm256_set1_ps(4.0);
+
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_load_ps(aPtr);
+        d = aVal;
+        aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
+                                                          _mm256_sub_ps(fones, aVal))),
+                             aVal);
+        z = aVal;
+        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+        x = _mm256_add_ps(
+            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+        for (i = 0; i < 2; i++)
+            x = _mm256_add_ps(x,
+                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
+        x = _mm256_div_ps(fones, x);
+        y = fzeroes;
+        for (j = ACOS_TERMS - 1; j >= 0; j--)
+            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
+                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+
+        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+        y = _mm256_add_ps(
+            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
+        arccosine = y;
+        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+        arccosine = _mm256_sub_ps(
+            arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
+        condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
+        arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
+
+        _mm256_store_ps(bPtr, arccosine);
+        aPtr += 8;
+        bPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *bPtr++ = acos(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX2 for aligned */
@@ -210,59 +224,63 @@ volk_32f_acos_32f_a_avx(float* bVector, const float* aVector, unsigned int num_p
  static inline void
  volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int quarterPoints = num_points / 4;
-  int i, j;
-
-  __m128 aVal, d, pi, pio2, x, y, z, arccosine;
-  __m128 fzeroes, fones, ftwos, ffours, condition;
-
-  pi = _mm_set1_ps(3.14159265358979323846);
-  pio2 = _mm_set1_ps(3.14159265358979323846/2);
-  fzeroes = _mm_setzero_ps();
-  fones = _mm_set1_ps(1.0);
-  ftwos = _mm_set1_ps(2.0);
-  ffours = _mm_set1_ps(4.0);
-
-  for(;number < quarterPoints; number++){
-    aVal = _mm_load_ps(aPtr);
-    d = aVal;
-    aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), aVal);
-    z = aVal;
-    condition = _mm_cmplt_ps(z, fzeroes);
-    z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
-    condition = _mm_cmplt_ps(z, fones);
-    x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
-
-    for(i = 0; i < 2; i++)
-      x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
-    x = _mm_div_ps(fones, x);
-    y = fzeroes;
-    for(j = ACOS_TERMS - 1; j >=0 ; j--)
-      y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
-
-    y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
-    condition = _mm_cmpgt_ps(z, fones);
-
-    y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
-    arccosine = y;
-    condition = _mm_cmplt_ps(aVal, fzeroes);
-    arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
-    condition = _mm_cmplt_ps(d, fzeroes);
-    arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
-
-    _mm_store_ps(bPtr, arccosine);
-    aPtr += 4;
-    bPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    *bPtr++ = acosf(*aPtr++);
-  }
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int quarterPoints = num_points / 4;
+    int i, j;
+
+    __m128 aVal, d, pi, pio2, x, y, z, arccosine;
+    __m128 fzeroes, fones, ftwos, ffours, condition;
+
+    pi = _mm_set1_ps(3.14159265358979323846);
+    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
+    fzeroes = _mm_setzero_ps();
+    fones = _mm_set1_ps(1.0);
+    ftwos = _mm_set1_ps(2.0);
+    ffours = _mm_set1_ps(4.0);
+
+    for (; number < quarterPoints; number++) {
+        aVal = _mm_load_ps(aPtr);
+        d = aVal;
+        aVal = _mm_div_ps(
+            _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
+            aVal);
+        z = aVal;
+        condition = _mm_cmplt_ps(z, fzeroes);
+        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+        condition = _mm_cmplt_ps(z, fones);
+        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
+
+        for (i = 0; i < 2; i++)
+            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+        x = _mm_div_ps(fones, x);
+        y = fzeroes;
+        for (j = ACOS_TERMS - 1; j >= 0; j--)
+            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
+                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
+
+        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+        condition = _mm_cmpgt_ps(z, fones);
+
+        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
+        arccosine = y;
+        condition = _mm_cmplt_ps(aVal, fzeroes);
+        arccosine =
+            _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
+        condition = _mm_cmplt_ps(d, fzeroes);
+        arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
+
+        _mm_store_ps(bPtr, arccosine);
+        aPtr += 4;
+        bPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *bPtr++ = acosf(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_SSE4_1 for aligned */
@@ -276,62 +294,68 @@ volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int nu
  #if LV_HAVE_AVX2 && LV_HAVE_FMA
  #include <immintrin.h>
  
-static inline void
-volk_32f_acos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_acos_32f_u_avx2_fma(float* bVector,
+                                                const float* aVector,
+                                                unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int eighthPoints = num_points / 8;
-  int i, j;
-
-  __m256 aVal, d, pi, pio2, x, y, z, arccosine;
-  __m256 fzeroes, fones, ftwos, ffours, condition;
-
-  pi = _mm256_set1_ps(3.14159265358979323846);
-  pio2 = _mm256_set1_ps(3.14159265358979323846/2);
-  fzeroes = _mm256_setzero_ps();
-  fones = _mm256_set1_ps(1.0);
-  ftwos = _mm256_set1_ps(2.0);
-  ffours = _mm256_set1_ps(4.0);
-
-  for(;number < eighthPoints; number++){
-    aVal = _mm256_loadu_ps(aPtr);
-    d = aVal;
-    aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal);
-    z = aVal;
-    condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
-    z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
-    condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
-    x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
-    for(i = 0; i < 2; i++)
-      x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x,fones)));
-    x = _mm256_div_ps(fones, x);
-    y = fzeroes;
-    for(j = ACOS_TERMS - 1; j >=0 ; j--)
-      y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
-
-    y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
-    condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
-
-    y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
-    arccosine = y;
-    condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
-    arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
-    condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
-    arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
-
-    _mm256_storeu_ps(bPtr, arccosine);
-    aPtr += 8;
-    bPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *bPtr++ = acos(*aPtr++);
-  }
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int eighthPoints = num_points / 8;
+    int i, j;
+
+    __m256 aVal, d, pi, pio2, x, y, z, arccosine;
+    __m256 fzeroes, fones, ftwos, ffours, condition;
+
+    pi = _mm256_set1_ps(3.14159265358979323846);
+    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+    fzeroes = _mm256_setzero_ps();
+    fones = _mm256_set1_ps(1.0);
+    ftwos = _mm256_set1_ps(2.0);
+    ffours = _mm256_set1_ps(4.0);
+
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_loadu_ps(aPtr);
+        d = aVal;
+        aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
+                                                          _mm256_sub_ps(fones, aVal))),
+                             aVal);
+        z = aVal;
+        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+        x = _mm256_add_ps(
+            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+        for (i = 0; i < 2; i++)
+            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
+        x = _mm256_div_ps(fones, x);
+        y = fzeroes;
+        for (j = ACOS_TERMS - 1; j >= 0; j--)
+            y = _mm256_fmadd_ps(
+                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+
+        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
+        arccosine = y;
+        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+        arccosine = _mm256_sub_ps(
+            arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
+        condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
+        arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
+
+        _mm256_storeu_ps(bPtr, arccosine);
+        aPtr += 8;
+        bPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *bPtr++ = acos(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
@@ -343,59 +367,66 @@ volk_32f_acos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int
  static inline void
  volk_32f_acos_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int eighthPoints = num_points / 8;
-  int i, j;
-
-  __m256 aVal, d, pi, pio2, x, y, z, arccosine;
-  __m256 fzeroes, fones, ftwos, ffours, condition;
-
-  pi = _mm256_set1_ps(3.14159265358979323846);
-  pio2 = _mm256_set1_ps(3.14159265358979323846/2);
-  fzeroes = _mm256_setzero_ps();
-  fones = _mm256_set1_ps(1.0);
-  ftwos = _mm256_set1_ps(2.0);
-  ffours = _mm256_set1_ps(4.0);
-
-  for(;number < eighthPoints; number++){
-    aVal = _mm256_loadu_ps(aPtr);
-    d = aVal;
-    aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal);
-    z = aVal;
-    condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
-    z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
-    condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
-    x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
-    for(i = 0; i < 2; i++)
-      x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
-    x = _mm256_div_ps(fones, x);
-    y = fzeroes;
-    for(j = ACOS_TERMS - 1; j >=0 ; j--)
-      y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
-
-    y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
-    condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
-
-    y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
-    arccosine = y;
-    condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
-    arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
-    condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
-    arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
-
-    _mm256_storeu_ps(bPtr, arccosine);
-    aPtr += 8;
-    bPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *bPtr++ = acos(*aPtr++);
-  }
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int eighthPoints = num_points / 8;
+    int i, j;
+
+    __m256 aVal, d, pi, pio2, x, y, z, arccosine;
+    __m256 fzeroes, fones, ftwos, ffours, condition;
+
+    pi = _mm256_set1_ps(3.14159265358979323846);
+    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+    fzeroes = _mm256_setzero_ps();
+    fones = _mm256_set1_ps(1.0);
+    ftwos = _mm256_set1_ps(2.0);
+    ffours = _mm256_set1_ps(4.0);
+
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_loadu_ps(aPtr);
+        d = aVal;
+        aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
+                                                          _mm256_sub_ps(fones, aVal))),
+                             aVal);
+        z = aVal;
+        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+        x = _mm256_add_ps(
+            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+        for (i = 0; i < 2; i++)
+            x = _mm256_add_ps(x,
+                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
+        x = _mm256_div_ps(fones, x);
+        y = fzeroes;
+        for (j = ACOS_TERMS - 1; j >= 0; j--)
+            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
+                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+
+        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+        y = _mm256_add_ps(
+            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
+        arccosine = y;
+        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+        arccosine = _mm256_sub_ps(
+            arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
+        condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
+        arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
+
+        _mm256_storeu_ps(bPtr, arccosine);
+        aPtr += 8;
+        bPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *bPtr++ = acos(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX2 for unaligned */
@@ -406,60 +437,64 @@ volk_32f_acos_32f_u_avx(float* bVector, const float* aVector, unsigned int num_p
  static inline void
  volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int quarterPoints = num_points / 4;
-  int i, j;
-
-  __m128 aVal, d, pi, pio2, x, y, z, arccosine;
-  __m128 fzeroes, fones, ftwos, ffours, condition;
-
-  pi = _mm_set1_ps(3.14159265358979323846);
-  pio2 = _mm_set1_ps(3.14159265358979323846/2);
-  fzeroes = _mm_setzero_ps();
-  fones = _mm_set1_ps(1.0);
-  ftwos = _mm_set1_ps(2.0);
-  ffours = _mm_set1_ps(4.0);
-
-  for(;number < quarterPoints; number++){
-    aVal = _mm_loadu_ps(aPtr);
-    d = aVal;
-    aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), aVal);
-    z = aVal;
-    condition = _mm_cmplt_ps(z, fzeroes);
-    z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
-    condition = _mm_cmplt_ps(z, fones);
-    x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
-
-    for(i = 0; i < 2; i++)
-      x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
-    x = _mm_div_ps(fones, x);
-    y = fzeroes;
-
-    for(j = ACOS_TERMS - 1; j >=0 ; j--)
-      y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
-
-    y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
-    condition = _mm_cmpgt_ps(z, fones);
-
-    y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
-    arccosine = y;
-    condition = _mm_cmplt_ps(aVal, fzeroes);
-    arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
-    condition = _mm_cmplt_ps(d, fzeroes);
-    arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
-
-    _mm_storeu_ps(bPtr, arccosine);
-    aPtr += 4;
-    bPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    *bPtr++ = acosf(*aPtr++);
-  }
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int quarterPoints = num_points / 4;
+    int i, j;
+
+    __m128 aVal, d, pi, pio2, x, y, z, arccosine;
+    __m128 fzeroes, fones, ftwos, ffours, condition;
+
+    pi = _mm_set1_ps(3.14159265358979323846);
+    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
+    fzeroes = _mm_setzero_ps();
+    fones = _mm_set1_ps(1.0);
+    ftwos = _mm_set1_ps(2.0);
+    ffours = _mm_set1_ps(4.0);
+
+    for (; number < quarterPoints; number++) {
+        aVal = _mm_loadu_ps(aPtr);
+        d = aVal;
+        aVal = _mm_div_ps(
+            _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
+            aVal);
+        z = aVal;
+        condition = _mm_cmplt_ps(z, fzeroes);
+        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+        condition = _mm_cmplt_ps(z, fones);
+        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
+
+        for (i = 0; i < 2; i++)
+            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+        x = _mm_div_ps(fones, x);
+        y = fzeroes;
+
+        for (j = ACOS_TERMS - 1; j >= 0; j--)
+            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
+                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
+
+        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+        condition = _mm_cmpgt_ps(z, fones);
+
+        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
+        arccosine = y;
+        condition = _mm_cmplt_ps(aVal, fzeroes);
+        arccosine =
+            _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
+        condition = _mm_cmplt_ps(d, fzeroes);
+        arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
+
+        _mm_storeu_ps(bPtr, arccosine);
+        aPtr += 4;
+        bPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *bPtr++ = acosf(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_SSE4_1 for aligned */
@@ -469,14 +504,13 @@ volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int nu
  static inline void
  volk_32f_acos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *bPtr++ = acosf(*aPtr++);
-  }
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
  
+    for (number = 0; number < num_points; number++) {
+        *bPtr++ = acosf(*aPtr++);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
diff --git a/kernels/volk/volk_32f_asin_32f.h b/kernels/volk/volk_32f_asin_32f.h

index 864cfcfa02aaeb4b12fc07f60bbd48a6585f73b7..946d38280efd7418ce879eb43888f18403bca4ad 100644 (file)
--- a/kernels/volk/volk_32f_asin_32f.h
+++ b/kernels/volk/volk_32f_asin_32f.h
@@ -67,11 +67,12 @@
   * \endcode
   */
  
-#include <stdio.h>
-#include <math.h>
  #include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
  
-/* This is the number of terms of Taylor series to evaluate, increase this for more accuracy*/
+/* This is the number of terms of Taylor series to evaluate, increase this for more
+ * accuracy*/
  #define ASIN_TERMS 2
  
  #ifndef INCLUDED_volk_32f_asin_32f_a_H
@@ -80,60 +81,66 @@
  #if LV_HAVE_AVX2 && LV_HAVE_FMA
  #include <immintrin.h>
  
-static inline void
-volk_32f_asin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_asin_32f_a_avx2_fma(float* bVector,
+                                                const float* aVector,
+                                                unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int eighthPoints = num_points / 8;
-  int i, j;
-
-  __m256 aVal, pio2, x, y, z, arcsine;
-  __m256 fzeroes, fones, ftwos, ffours, condition;
-
-  pio2 = _mm256_set1_ps(3.14159265358979323846/2);
-  fzeroes = _mm256_setzero_ps();
-  fones = _mm256_set1_ps(1.0);
-  ftwos = _mm256_set1_ps(2.0);
-  ffours = _mm256_set1_ps(4.0);
-
-  for(;number < eighthPoints; number++){
-    aVal = _mm256_load_ps(aPtr);
-    aVal = _mm256_div_ps(aVal, _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))));
-    z = aVal;
-    condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
-    z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
-    condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
-    x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
-    for(i = 0; i < 2; i++){
-      x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones)));
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int eighthPoints = num_points / 8;
+    int i, j;
+
+    __m256 aVal, pio2, x, y, z, arcsine;
+    __m256 fzeroes, fones, ftwos, ffours, condition;
+
+    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+    fzeroes = _mm256_setzero_ps();
+    fones = _mm256_set1_ps(1.0);
+    ftwos = _mm256_set1_ps(2.0);
+    ffours = _mm256_set1_ps(4.0);
+
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_load_ps(aPtr);
+        aVal = _mm256_div_ps(aVal,
+                             _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
+                                                          _mm256_sub_ps(fones, aVal))));
+        z = aVal;
+        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+        x = _mm256_add_ps(
+            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+        for (i = 0; i < 2; i++) {
+            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
+        }
+        x = _mm256_div_ps(fones, x);
+        y = fzeroes;
+        for (j = ASIN_TERMS - 1; j >= 0; j--) {
+            y = _mm256_fmadd_ps(
+                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+        }
+
+        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
+        arcsine = y;
+        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+        arcsine = _mm256_sub_ps(arcsine,
+                                _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
+
+        _mm256_store_ps(bPtr, arcsine);
+        aPtr += 8;
+        bPtr += 8;
      }
-    x = _mm256_div_ps(fones, x);
-    y = fzeroes;
-    for(j = ASIN_TERMS - 1; j >=0 ; j--){
-      y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
-    }
-
-    y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
-    condition = _mm256_cmp_ps(z, fones,_CMP_GT_OS);
-
-    y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition));
-    arcsine = y;
-    condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
-    arcsine = _mm256_sub_ps(arcsine, _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
  
-    _mm256_store_ps(bPtr, arcsine);
-    aPtr += 8;
-    bPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *bPtr++ = asin(*aPtr++);
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *bPtr++ = asin(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
@@ -145,57 +152,64 @@ volk_32f_asin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int
  static inline void
  volk_32f_asin_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int eighthPoints = num_points / 8;
-  int i, j;
-
-  __m256 aVal, pio2, x, y, z, arcsine;
-  __m256 fzeroes, fones, ftwos, ffours, condition;
-
-  pio2 = _mm256_set1_ps(3.14159265358979323846/2);
-  fzeroes = _mm256_setzero_ps();
-  fones = _mm256_set1_ps(1.0);
-  ftwos = _mm256_set1_ps(2.0);
-  ffours = _mm256_set1_ps(4.0);
-
-  for(;number < eighthPoints; number++){
-    aVal = _mm256_load_ps(aPtr);
-    aVal = _mm256_div_ps(aVal, _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))));
-    z = aVal;
-    condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
-    z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
-    condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
-    x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
-    for(i = 0; i < 2; i++){
-      x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int eighthPoints = num_points / 8;
+    int i, j;
+
+    __m256 aVal, pio2, x, y, z, arcsine;
+    __m256 fzeroes, fones, ftwos, ffours, condition;
+
+    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+    fzeroes = _mm256_setzero_ps();
+    fones = _mm256_set1_ps(1.0);
+    ftwos = _mm256_set1_ps(2.0);
+    ffours = _mm256_set1_ps(4.0);
+
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_load_ps(aPtr);
+        aVal = _mm256_div_ps(aVal,
+                             _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
+                                                          _mm256_sub_ps(fones, aVal))));
+        z = aVal;
+        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+        x = _mm256_add_ps(
+            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+        for (i = 0; i < 2; i++) {
+            x = _mm256_add_ps(x,
+                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
+        }
+        x = _mm256_div_ps(fones, x);
+        y = fzeroes;
+        for (j = ASIN_TERMS - 1; j >= 0; j--) {
+            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
+                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+        }
+
+        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+        y = _mm256_add_ps(
+            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
+        arcsine = y;
+        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+        arcsine = _mm256_sub_ps(arcsine,
+                                _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
+
+        _mm256_store_ps(bPtr, arcsine);
+        aPtr += 8;
+        bPtr += 8;
      }
-    x = _mm256_div_ps(fones, x);
-    y = fzeroes;
-    for(j = ASIN_TERMS - 1; j >=0 ; j--){
-      y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
-    }
-
-    y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
-    condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
-
-    y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
-    arcsine = y;
-    condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
-    arcsine = _mm256_sub_ps(arcsine, _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
  
-    _mm256_store_ps(bPtr, arcsine);
-    aPtr += 8;
-    bPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *bPtr++ = asin(*aPtr++);
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *bPtr++ = asin(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX for aligned */
@@ -206,57 +220,60 @@ volk_32f_asin_32f_a_avx(float* bVector, const float* aVector, unsigned int num_p
  static inline void
  volk_32f_asin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int quarterPoints = num_points / 4;
-  int i, j;
-
-  __m128 aVal, pio2, x, y, z, arcsine;
-  __m128 fzeroes, fones, ftwos, ffours, condition;
-
-  pio2 = _mm_set1_ps(3.14159265358979323846/2);
-  fzeroes = _mm_setzero_ps();
-  fones = _mm_set1_ps(1.0);
-  ftwos = _mm_set1_ps(2.0);
-  ffours = _mm_set1_ps(4.0);
-
-  for(;number < quarterPoints; number++){
-    aVal = _mm_load_ps(aPtr);
-    aVal = _mm_div_ps(aVal, _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
-    z = aVal;
-    condition = _mm_cmplt_ps(z, fzeroes);
-    z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
-    condition = _mm_cmplt_ps(z, fones);
-    x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
-
-    for(i = 0; i < 2; i++){
-      x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int quarterPoints = num_points / 4;
+    int i, j;
+
+    __m128 aVal, pio2, x, y, z, arcsine;
+    __m128 fzeroes, fones, ftwos, ffours, condition;
+
+    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
+    fzeroes = _mm_setzero_ps();
+    fones = _mm_set1_ps(1.0);
+    ftwos = _mm_set1_ps(2.0);
+    ffours = _mm_set1_ps(4.0);
+
+    for (; number < quarterPoints; number++) {
+        aVal = _mm_load_ps(aPtr);
+        aVal = _mm_div_ps(
+            aVal,
+            _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
+        z = aVal;
+        condition = _mm_cmplt_ps(z, fzeroes);
+        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+        condition = _mm_cmplt_ps(z, fones);
+        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
+
+        for (i = 0; i < 2; i++) {
+            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+        }
+        x = _mm_div_ps(fones, x);
+        y = fzeroes;
+        for (j = ASIN_TERMS - 1; j >= 0; j--) {
+            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
+                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
+        }
+
+        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+        condition = _mm_cmpgt_ps(z, fones);
+
+        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
+        arcsine = y;
+        condition = _mm_cmplt_ps(aVal, fzeroes);
+        arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
+
+        _mm_store_ps(bPtr, arcsine);
+        aPtr += 4;
+        bPtr += 4;
      }
-    x = _mm_div_ps(fones, x);
-    y = fzeroes;
-    for(j = ASIN_TERMS - 1; j >=0 ; j--){
-      y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
-    }
-
-    y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
-    condition = _mm_cmpgt_ps(z, fones);
-
-    y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
-    arcsine = y;
-    condition = _mm_cmplt_ps(aVal, fzeroes);
-    arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
-
-    _mm_store_ps(bPtr, arcsine);
-    aPtr += 4;
-    bPtr += 4;
-  }
  
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    *bPtr++ = asinf(*aPtr++);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *bPtr++ = asinf(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_SSE4_1 for aligned */
@@ -269,60 +286,66 @@ volk_32f_asin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int nu
  #if LV_HAVE_AVX2 && LV_HAVE_FMA
  #include <immintrin.h>
  
-static inline void
-volk_32f_asin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_asin_32f_u_avx2_fma(float* bVector,
+                                                const float* aVector,
+                                                unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int eighthPoints = num_points / 8;
-  int i, j;
-
-  __m256 aVal, pio2, x, y, z, arcsine;
-  __m256 fzeroes, fones, ftwos, ffours, condition;
-
-  pio2 = _mm256_set1_ps(3.14159265358979323846/2);
-  fzeroes = _mm256_setzero_ps();
-  fones = _mm256_set1_ps(1.0);
-  ftwos = _mm256_set1_ps(2.0);
-  ffours = _mm256_set1_ps(4.0);
-
-  for(;number < eighthPoints; number++){
-    aVal = _mm256_loadu_ps(aPtr);
-    aVal = _mm256_div_ps(aVal, _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))));
-    z = aVal;
-    condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
-    z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
-    condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
-    x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
-    for(i = 0; i < 2; i++){
-      x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones)));
-    }
-    x = _mm256_div_ps(fones, x);
-    y = fzeroes;
-    for(j = ASIN_TERMS - 1; j >=0 ; j--){
-      y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int eighthPoints = num_points / 8;
+    int i, j;
+
+    __m256 aVal, pio2, x, y, z, arcsine;
+    __m256 fzeroes, fones, ftwos, ffours, condition;
+
+    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+    fzeroes = _mm256_setzero_ps();
+    fones = _mm256_set1_ps(1.0);
+    ftwos = _mm256_set1_ps(2.0);
+    ffours = _mm256_set1_ps(4.0);
+
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_loadu_ps(aPtr);
+        aVal = _mm256_div_ps(aVal,
+                             _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
+                                                          _mm256_sub_ps(fones, aVal))));
+        z = aVal;
+        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+        x = _mm256_add_ps(
+            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+        for (i = 0; i < 2; i++) {
+            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
+        }
+        x = _mm256_div_ps(fones, x);
+        y = fzeroes;
+        for (j = ASIN_TERMS - 1; j >= 0; j--) {
+            y = _mm256_fmadd_ps(
+                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+        }
+
+        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
+        arcsine = y;
+        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+        arcsine = _mm256_sub_ps(arcsine,
+                                _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
+
+        _mm256_storeu_ps(bPtr, arcsine);
+        aPtr += 8;
+        bPtr += 8;
      }
  
-    y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
-    condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
-
-    y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition));
-    arcsine = y;
-    condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
-    arcsine = _mm256_sub_ps(arcsine, _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
-
-    _mm256_storeu_ps(bPtr, arcsine);
-    aPtr += 8;
-    bPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *bPtr++ = asin(*aPtr++);
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *bPtr++ = asin(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
@@ -334,57 +357,64 @@ volk_32f_asin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int
  static inline void
  volk_32f_asin_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int eighthPoints = num_points / 8;
-  int i, j;
-
-  __m256 aVal, pio2, x, y, z, arcsine;
-  __m256 fzeroes, fones, ftwos, ffours, condition;
-
-  pio2 = _mm256_set1_ps(3.14159265358979323846/2);
-  fzeroes = _mm256_setzero_ps();
-  fones = _mm256_set1_ps(1.0);
-  ftwos = _mm256_set1_ps(2.0);
-  ffours = _mm256_set1_ps(4.0);
-
-  for(;number < eighthPoints; number++){
-    aVal = _mm256_loadu_ps(aPtr);
-    aVal = _mm256_div_ps(aVal, _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))));
-    z = aVal;
-    condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
-    z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
-    condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
-    x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
-    for(i = 0; i < 2; i++){
-      x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int eighthPoints = num_points / 8;
+    int i, j;
+
+    __m256 aVal, pio2, x, y, z, arcsine;
+    __m256 fzeroes, fones, ftwos, ffours, condition;
+
+    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+    fzeroes = _mm256_setzero_ps();
+    fones = _mm256_set1_ps(1.0);
+    ftwos = _mm256_set1_ps(2.0);
+    ffours = _mm256_set1_ps(4.0);
+
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_loadu_ps(aPtr);
+        aVal = _mm256_div_ps(aVal,
+                             _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
+                                                          _mm256_sub_ps(fones, aVal))));
+        z = aVal;
+        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+        x = _mm256_add_ps(
+            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+        for (i = 0; i < 2; i++) {
+            x = _mm256_add_ps(x,
+                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
+        }
+        x = _mm256_div_ps(fones, x);
+        y = fzeroes;
+        for (j = ASIN_TERMS - 1; j >= 0; j--) {
+            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
+                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+        }
+
+        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+        y = _mm256_add_ps(
+            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
+        arcsine = y;
+        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+        arcsine = _mm256_sub_ps(arcsine,
+                                _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
+
+        _mm256_storeu_ps(bPtr, arcsine);
+        aPtr += 8;
+        bPtr += 8;
      }
-    x = _mm256_div_ps(fones, x);
-    y = fzeroes;
-    for(j = ASIN_TERMS - 1; j >=0 ; j--){
-      y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
-    }
-
-    y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
-    condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
  
-    y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
-    arcsine = y;
-    condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
-    arcsine = _mm256_sub_ps(arcsine, _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
-
-    _mm256_storeu_ps(bPtr, arcsine);
-    aPtr += 8;
-    bPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *bPtr++ = asin(*aPtr++);
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *bPtr++ = asin(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX for unaligned */
@@ -396,57 +426,60 @@ volk_32f_asin_32f_u_avx(float* bVector, const float* aVector, unsigned int num_p
  static inline void
  volk_32f_asin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int quarterPoints = num_points / 4;
-  int i, j;
-
-  __m128 aVal, pio2, x, y, z, arcsine;
-  __m128 fzeroes, fones, ftwos, ffours, condition;
-
-  pio2 = _mm_set1_ps(3.14159265358979323846/2);
-  fzeroes = _mm_setzero_ps();
-  fones = _mm_set1_ps(1.0);
-  ftwos = _mm_set1_ps(2.0);
-  ffours = _mm_set1_ps(4.0);
-
-  for(;number < quarterPoints; number++){
-    aVal = _mm_loadu_ps(aPtr);
-    aVal = _mm_div_ps(aVal, _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
-    z = aVal;
-    condition = _mm_cmplt_ps(z, fzeroes);
-    z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
-    condition = _mm_cmplt_ps(z, fones);
-    x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
-
-    for(i = 0; i < 2; i++){
-      x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int quarterPoints = num_points / 4;
+    int i, j;
+
+    __m128 aVal, pio2, x, y, z, arcsine;
+    __m128 fzeroes, fones, ftwos, ffours, condition;
+
+    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
+    fzeroes = _mm_setzero_ps();
+    fones = _mm_set1_ps(1.0);
+    ftwos = _mm_set1_ps(2.0);
+    ffours = _mm_set1_ps(4.0);
+
+    for (; number < quarterPoints; number++) {
+        aVal = _mm_loadu_ps(aPtr);
+        aVal = _mm_div_ps(
+            aVal,
+            _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
+        z = aVal;
+        condition = _mm_cmplt_ps(z, fzeroes);
+        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+        condition = _mm_cmplt_ps(z, fones);
+        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
+
+        for (i = 0; i < 2; i++) {
+            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+        }
+        x = _mm_div_ps(fones, x);
+        y = fzeroes;
+        for (j = ASIN_TERMS - 1; j >= 0; j--) {
+            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
+                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
+        }
+
+        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+        condition = _mm_cmpgt_ps(z, fones);
+
+        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
+        arcsine = y;
+        condition = _mm_cmplt_ps(aVal, fzeroes);
+        arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
+
+        _mm_storeu_ps(bPtr, arcsine);
+        aPtr += 4;
+        bPtr += 4;
      }
-    x = _mm_div_ps(fones, x);
-    y = fzeroes;
-    for(j = ASIN_TERMS - 1; j >=0 ; j--){
-      y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
-    }
-
-    y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
-    condition = _mm_cmpgt_ps(z, fones);
  
-    y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
-    arcsine = y;
-    condition = _mm_cmplt_ps(aVal, fzeroes);
-    arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
-
-    _mm_storeu_ps(bPtr, arcsine);
-    aPtr += 4;
-    bPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    *bPtr++ = asinf(*aPtr++);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *bPtr++ = asinf(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_SSE4_1 for unaligned */
@@ -456,13 +489,13 @@ volk_32f_asin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int nu
  static inline void
  volk_32f_asin_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-  unsigned int number = 0;
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
  
-  for(number = 0; number < num_points; number++){
-    *bPtr++ = asinf(*aPtr++);
-  }
+    for (number = 0; number < num_points; number++) {
+        *bPtr++ = asinf(*aPtr++);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
diff --git a/kernels/volk/volk_32f_atan_32f.h b/kernels/volk/volk_32f_atan_32f.h

index 3496f0e811e65b35255f4d1bd68e46d0184da31f..6652ee837c10151818556624bcdf59b4851816b8 100644 (file)
--- a/kernels/volk/volk_32f_atan_32f.h
+++ b/kernels/volk/volk_32f_atan_32f.h
@@ -67,11 +67,12 @@
   * \endcode
   */
  
-#include <stdio.h>
-#include <math.h>
  #include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
  
-/* This is the number of terms of Taylor series to evaluate, increase this for more accuracy*/
+/* This is the number of terms of Taylor series to evaluate, increase this for more
+ * accuracy*/
  #define TERMS 2
  
  #ifndef INCLUDED_volk_32f_atan_32f_a_H
@@ -80,59 +81,63 @@
  #if LV_HAVE_AVX2 && LV_HAVE_FMA
  #include <immintrin.h>
  
-static inline void
-volk_32f_atan_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_atan_32f_a_avx2_fma(float* bVector,
+                                                const float* aVector,
+                                                unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int eighthPoints = num_points / 8;
-  int i, j;
-
-  __m256 aVal, pio2, x, y, z, arctangent;
-  __m256 fzeroes, fones, ftwos, ffours, condition;
-
-  pio2 = _mm256_set1_ps(3.14159265358979323846/2);
-  fzeroes = _mm256_setzero_ps();
-  fones = _mm256_set1_ps(1.0);
-  ftwos = _mm256_set1_ps(2.0);
-  ffours = _mm256_set1_ps(4.0);
-
-  for(;number < eighthPoints; number++){
-    aVal = _mm256_load_ps(aPtr);
-    z = aVal;
-    condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
-    z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
-    condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
-    x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
-    for(i = 0; i < 2; i++){
-      x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones)));
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int eighthPoints = num_points / 8;
+    int i, j;
+
+    __m256 aVal, pio2, x, y, z, arctangent;
+    __m256 fzeroes, fones, ftwos, ffours, condition;
+
+    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+    fzeroes = _mm256_setzero_ps();
+    fones = _mm256_set1_ps(1.0);
+    ftwos = _mm256_set1_ps(2.0);
+    ffours = _mm256_set1_ps(4.0);
+
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_load_ps(aPtr);
+        z = aVal;
+        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+        x = _mm256_add_ps(
+            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+        for (i = 0; i < 2; i++) {
+            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
+        }
+        x = _mm256_div_ps(fones, x);
+        y = fzeroes;
+        for (j = TERMS - 1; j >= 0; j--) {
+            y = _mm256_fmadd_ps(
+                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+        }
+
+        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
+        arctangent = y;
+        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+        arctangent = _mm256_sub_ps(
+            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
+
+        _mm256_store_ps(bPtr, arctangent);
+        aPtr += 8;
+        bPtr += 8;
      }
-    x = _mm256_div_ps(fones, x);
-    y = fzeroes;
-    for(j = TERMS - 1; j >=0 ; j--){
-      y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
-    }
-
-    y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
-    condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
-
-    y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition));
-    arctangent = y;
-    condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
-    arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
-
-    _mm256_store_ps(bPtr, arctangent);
-    aPtr += 8;
-    bPtr += 8;
-  }
  
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *bPtr++ = atan(*aPtr++);
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *bPtr++ = atan(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
@@ -144,56 +149,61 @@ volk_32f_atan_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int
  static inline void
  volk_32f_atan_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int eighthPoints = num_points / 8;
-  int i, j;
-
-  __m256 aVal, pio2, x, y, z, arctangent;
-  __m256 fzeroes, fones, ftwos, ffours, condition;
-
-  pio2 = _mm256_set1_ps(3.14159265358979323846/2);
-  fzeroes = _mm256_setzero_ps();
-  fones = _mm256_set1_ps(1.0);
-  ftwos = _mm256_set1_ps(2.0);
-  ffours = _mm256_set1_ps(4.0);
-
-  for(;number < eighthPoints; number++){
-    aVal = _mm256_load_ps(aPtr);
-    z = aVal;
-    condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
-    z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
-    condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
-    x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
-    for(i = 0; i < 2; i++){
-      x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
-    }
-    x = _mm256_div_ps(fones, x);
-    y = fzeroes;
-    for(j = TERMS - 1; j >=0 ; j--){
-      y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int eighthPoints = num_points / 8;
+    int i, j;
+
+    __m256 aVal, pio2, x, y, z, arctangent;
+    __m256 fzeroes, fones, ftwos, ffours, condition;
+
+    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+    fzeroes = _mm256_setzero_ps();
+    fones = _mm256_set1_ps(1.0);
+    ftwos = _mm256_set1_ps(2.0);
+    ffours = _mm256_set1_ps(4.0);
+
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_load_ps(aPtr);
+        z = aVal;
+        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+        x = _mm256_add_ps(
+            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+        for (i = 0; i < 2; i++) {
+            x = _mm256_add_ps(x,
+                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
+        }
+        x = _mm256_div_ps(fones, x);
+        y = fzeroes;
+        for (j = TERMS - 1; j >= 0; j--) {
+            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
+                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+        }
+
+        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+        y = _mm256_add_ps(
+            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
+        arctangent = y;
+        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+        arctangent = _mm256_sub_ps(
+            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
+
+        _mm256_store_ps(bPtr, arctangent);
+        aPtr += 8;
+        bPtr += 8;
      }
  
-    y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
-    condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
-
-    y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
-    arctangent = y;
-    condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
-    arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
-
-    _mm256_store_ps(bPtr, arctangent);
-    aPtr += 8;
-    bPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *bPtr++ = atan(*aPtr++);
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *bPtr++ = atan(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX for aligned */
@@ -204,56 +214,58 @@ volk_32f_atan_32f_a_avx(float* bVector, const float* aVector, unsigned int num_p
  static inline void
  volk_32f_atan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int quarterPoints = num_points / 4;
-  int i, j;
-
-  __m128 aVal, pio2, x, y, z, arctangent;
-  __m128 fzeroes, fones, ftwos, ffours, condition;
-
-  pio2 = _mm_set1_ps(3.14159265358979323846/2);
-  fzeroes = _mm_setzero_ps();
-  fones = _mm_set1_ps(1.0);
-  ftwos = _mm_set1_ps(2.0);
-  ffours = _mm_set1_ps(4.0);
-
-  for(;number < quarterPoints; number++){
-    aVal = _mm_load_ps(aPtr);
-    z = aVal;
-    condition = _mm_cmplt_ps(z, fzeroes);
-    z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
-    condition = _mm_cmplt_ps(z, fones);
-    x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
-
-    for(i = 0; i < 2; i++){
-      x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
-    }
-    x = _mm_div_ps(fones, x);
-    y = fzeroes;
-    for(j = TERMS - 1; j >=0 ; j--){
-      y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int quarterPoints = num_points / 4;
+    int i, j;
+
+    __m128 aVal, pio2, x, y, z, arctangent;
+    __m128 fzeroes, fones, ftwos, ffours, condition;
+
+    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
+    fzeroes = _mm_setzero_ps();
+    fones = _mm_set1_ps(1.0);
+    ftwos = _mm_set1_ps(2.0);
+    ffours = _mm_set1_ps(4.0);
+
+    for (; number < quarterPoints; number++) {
+        aVal = _mm_load_ps(aPtr);
+        z = aVal;
+        condition = _mm_cmplt_ps(z, fzeroes);
+        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+        condition = _mm_cmplt_ps(z, fones);
+        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
+
+        for (i = 0; i < 2; i++) {
+            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+        }
+        x = _mm_div_ps(fones, x);
+        y = fzeroes;
+        for (j = TERMS - 1; j >= 0; j--) {
+            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
+                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
+        }
+
+        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+        condition = _mm_cmpgt_ps(z, fones);
+
+        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
+        arctangent = y;
+        condition = _mm_cmplt_ps(aVal, fzeroes);
+        arctangent =
+            _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
+
+        _mm_store_ps(bPtr, arctangent);
+        aPtr += 4;
+        bPtr += 4;
      }
  
-    y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
-    condition = _mm_cmpgt_ps(z, fones);
-
-    y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
-    arctangent = y;
-    condition = _mm_cmplt_ps(aVal, fzeroes);
-    arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
-
-    _mm_store_ps(bPtr, arctangent);
-    aPtr += 4;
-    bPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    *bPtr++ = atanf(*aPtr++);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *bPtr++ = atanf(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_SSE4_1 for aligned */
@@ -266,59 +278,63 @@ volk_32f_atan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int nu
  #if LV_HAVE_AVX2 && LV_HAVE_FMA
  #include <immintrin.h>
  
-static inline void
-volk_32f_atan_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_atan_32f_u_avx2_fma(float* bVector,
+                                                const float* aVector,
+                                                unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int eighthPoints = num_points / 8;
-  int i, j;
-
-  __m256 aVal, pio2, x, y, z, arctangent;
-  __m256 fzeroes, fones, ftwos, ffours, condition;
-
-  pio2 = _mm256_set1_ps(3.14159265358979323846/2);
-  fzeroes = _mm256_setzero_ps();
-  fones = _mm256_set1_ps(1.0);
-  ftwos = _mm256_set1_ps(2.0);
-  ffours = _mm256_set1_ps(4.0);
-
-  for(;number < eighthPoints; number++){
-    aVal = _mm256_loadu_ps(aPtr);
-    z = aVal;
-    condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
-    z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
-    condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
-    x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
-    for(i = 0; i < 2; i++){
-      x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones)));
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int eighthPoints = num_points / 8;
+    int i, j;
+
+    __m256 aVal, pio2, x, y, z, arctangent;
+    __m256 fzeroes, fones, ftwos, ffours, condition;
+
+    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+    fzeroes = _mm256_setzero_ps();
+    fones = _mm256_set1_ps(1.0);
+    ftwos = _mm256_set1_ps(2.0);
+    ffours = _mm256_set1_ps(4.0);
+
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_loadu_ps(aPtr);
+        z = aVal;
+        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+        x = _mm256_add_ps(
+            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+        for (i = 0; i < 2; i++) {
+            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
+        }
+        x = _mm256_div_ps(fones, x);
+        y = fzeroes;
+        for (j = TERMS - 1; j >= 0; j--) {
+            y = _mm256_fmadd_ps(
+                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+        }
+
+        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
+        arctangent = y;
+        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+        arctangent = _mm256_sub_ps(
+            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
+
+        _mm256_storeu_ps(bPtr, arctangent);
+        aPtr += 8;
+        bPtr += 8;
      }
-    x = _mm256_div_ps(fones, x);
-    y = fzeroes;
-    for(j = TERMS - 1; j >=0 ; j--){
-      y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
-    }
-
-    y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
-    condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
  
-    y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition));
-    arctangent = y;
-    condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
-    arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
-
-    _mm256_storeu_ps(bPtr, arctangent);
-    aPtr += 8;
-    bPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *bPtr++ = atan(*aPtr++);
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *bPtr++ = atan(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
@@ -330,56 +346,61 @@ volk_32f_atan_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int
  static inline void
  volk_32f_atan_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int eighthPoints = num_points / 8;
-  int i, j;
-
-  __m256 aVal, pio2, x, y, z, arctangent;
-  __m256 fzeroes, fones, ftwos, ffours, condition;
-
-  pio2 = _mm256_set1_ps(3.14159265358979323846/2);
-  fzeroes = _mm256_setzero_ps();
-  fones = _mm256_set1_ps(1.0);
-  ftwos = _mm256_set1_ps(2.0);
-  ffours = _mm256_set1_ps(4.0);
-
-  for(;number < eighthPoints; number++){
-    aVal = _mm256_loadu_ps(aPtr);
-    z = aVal;
-    condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
-    z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
-    condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
-    x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
-    for(i = 0; i < 2; i++){
-      x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
-    }
-    x = _mm256_div_ps(fones, x);
-    y = fzeroes;
-    for(j = TERMS - 1; j >=0 ; j--){
-      y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int eighthPoints = num_points / 8;
+    int i, j;
+
+    __m256 aVal, pio2, x, y, z, arctangent;
+    __m256 fzeroes, fones, ftwos, ffours, condition;
+
+    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+    fzeroes = _mm256_setzero_ps();
+    fones = _mm256_set1_ps(1.0);
+    ftwos = _mm256_set1_ps(2.0);
+    ffours = _mm256_set1_ps(4.0);
+
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_loadu_ps(aPtr);
+        z = aVal;
+        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+        x = _mm256_add_ps(
+            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+        for (i = 0; i < 2; i++) {
+            x = _mm256_add_ps(x,
+                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
+        }
+        x = _mm256_div_ps(fones, x);
+        y = fzeroes;
+        for (j = TERMS - 1; j >= 0; j--) {
+            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
+                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+        }
+
+        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+        y = _mm256_add_ps(
+            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
+        arctangent = y;
+        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+        arctangent = _mm256_sub_ps(
+            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
+
+        _mm256_storeu_ps(bPtr, arctangent);
+        aPtr += 8;
+        bPtr += 8;
      }
  
-    y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
-    condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
-
-    y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
-    arctangent = y;
-    condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
-    arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
-
-    _mm256_storeu_ps(bPtr, arctangent);
-    aPtr += 8;
-    bPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *bPtr++ = atan(*aPtr++);
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *bPtr++ = atan(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX for unaligned */
@@ -390,54 +411,56 @@ volk_32f_atan_32f_u_avx(float* bVector, const float* aVector, unsigned int num_p
  static inline void
  volk_32f_atan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int quarterPoints = num_points / 4;
-  int i, j;
-
-  __m128 aVal, pio2, x, y, z, arctangent;
-  __m128 fzeroes, fones, ftwos, ffours, condition;
-
-  pio2 = _mm_set1_ps(3.14159265358979323846/2);
-  fzeroes = _mm_setzero_ps();
-  fones = _mm_set1_ps(1.0);
-  ftwos = _mm_set1_ps(2.0);
-  ffours = _mm_set1_ps(4.0);
-
-  for(;number < quarterPoints; number++){
-    aVal = _mm_loadu_ps(aPtr);
-    z = aVal;
-    condition = _mm_cmplt_ps(z, fzeroes);
-    z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
-    condition = _mm_cmplt_ps(z, fones);
-    x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
-
-    for(i = 0; i < 2; i++)
-      x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
-    x = _mm_div_ps(fones, x);
-    y = fzeroes;
-    for(j = TERMS - 1; j >= 0; j--)
-      y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
-
-    y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
-    condition = _mm_cmpgt_ps(z, fones);
-
-    y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
-    arctangent = y;
-    condition = _mm_cmplt_ps(aVal, fzeroes);
-    arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
-
-    _mm_storeu_ps(bPtr, arctangent);
-    aPtr += 4;
-    bPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    *bPtr++ = atanf(*aPtr++);
-  }
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int quarterPoints = num_points / 4;
+    int i, j;
+
+    __m128 aVal, pio2, x, y, z, arctangent;
+    __m128 fzeroes, fones, ftwos, ffours, condition;
+
+    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
+    fzeroes = _mm_setzero_ps();
+    fones = _mm_set1_ps(1.0);
+    ftwos = _mm_set1_ps(2.0);
+    ffours = _mm_set1_ps(4.0);
+
+    for (; number < quarterPoints; number++) {
+        aVal = _mm_loadu_ps(aPtr);
+        z = aVal;
+        condition = _mm_cmplt_ps(z, fzeroes);
+        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+        condition = _mm_cmplt_ps(z, fones);
+        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
+
+        for (i = 0; i < 2; i++)
+            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+        x = _mm_div_ps(fones, x);
+        y = fzeroes;
+        for (j = TERMS - 1; j >= 0; j--)
+            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
+                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
+
+        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+        condition = _mm_cmpgt_ps(z, fones);
+
+        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
+        arctangent = y;
+        condition = _mm_cmplt_ps(aVal, fzeroes);
+        arctangent =
+            _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
+
+        _mm_storeu_ps(bPtr, arctangent);
+        aPtr += 4;
+        bPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *bPtr++ = atanf(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_SSE4_1 for unaligned */
@@ -447,13 +470,13 @@ volk_32f_atan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int nu
  static inline void
  volk_32f_atan_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-  unsigned int number = 0;
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
  
-  for(number = 0; number < num_points; number++){
-    *bPtr++ = atanf(*aPtr++);
-  }
+    for (number = 0; number < num_points; number++) {
+        *bPtr++ = atanf(*aPtr++);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
diff --git a/kernels/volk/volk_32f_binary_slicer_32i.h b/kernels/volk/volk_32f_binary_slicer_32i.h

index c56ff8f88508b0be4ebfa61492da3e79c3fd9a15..635d0c3551a8df4f43ab6db91e84115c41504df3 100644 (file)
--- a/kernels/volk/volk_32f_binary_slicer_32i.h
+++ b/kernels/volk/volk_32f_binary_slicer_32i.h
@@ -30,8 +30,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32f_binary_slicer_32i(int* cVector, const float* aVector, unsigned int num_points)
- * \endcode
+ * void volk_32f_binary_slicer_32i(int* cVector, const float* aVector, unsigned int
+ * num_points) \endcode
   *
   * \b Inputs
   * \li aVector: The input vector of floats.
@@ -73,37 +73,38 @@
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32f_binary_slicer_32i_generic(int* cVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_binary_slicer_32i_generic(int* cVector,
+                                                      const float* aVector,
+                                                      unsigned int num_points)
  {
-  int* cPtr = cVector;
-  const float* aPtr = aVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    if( *aPtr++ >= 0) {
-      *cPtr++ = 1;
-    }
-    else {
-      *cPtr++ = 0;
+    int* cPtr = cVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        if (*aPtr++ >= 0) {
+            *cPtr++ = 1;
+        } else {
+            *cPtr++ = 0;
+        }
      }
-  }
  }
  #endif /* LV_HAVE_GENERIC */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32f_binary_slicer_32i_generic_branchless(int* cVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_binary_slicer_32i_generic_branchless(int* cVector,
+                                                                 const float* aVector,
+                                                                 unsigned int num_points)
  {
-  int* cPtr = cVector;
-  const float* aPtr = aVector;
-  unsigned int number = 0;
+    int* cPtr = cVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
  
-  for(number = 0; number < num_points; number++){
-    *cPtr++ = (*aPtr++ >= 0);
-  }
+    for (number = 0; number < num_points; number++) {
+        *cPtr++ = (*aPtr++ >= 0);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -111,40 +112,40 @@ volk_32f_binary_slicer_32i_generic_branchless(int* cVector, const float* aVector
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void
-volk_32f_binary_slicer_32i_a_sse2(int* cVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_binary_slicer_32i_a_sse2(int* cVector,
+                                                     const float* aVector,
+                                                     unsigned int num_points)
  {
-  int* cPtr = cVector;
-  const float* aPtr = aVector;
-  unsigned int number = 0;
-
-  unsigned int quarter_points = num_points / 4;
-  __m128 a_val, res_f;
-  __m128i res_i, binary_i;
-  __m128 zero_val;
-  zero_val = _mm_set1_ps (0.0f);
+    int* cPtr = cVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
  
-  for(number = 0; number < quarter_points; number++){
-    a_val = _mm_load_ps(aPtr);
+    unsigned int quarter_points = num_points / 4;
+    __m128 a_val, res_f;
+    __m128i res_i, binary_i;
+    __m128 zero_val;
+    zero_val = _mm_set1_ps(0.0f);
  
-    res_f = _mm_cmpge_ps (a_val, zero_val);
-    res_i = _mm_cvtps_epi32 (res_f);
-    binary_i = _mm_srli_epi32 (res_i, 31);
+    for (number = 0; number < quarter_points; number++) {
+        a_val = _mm_load_ps(aPtr);
  
-    _mm_store_si128((__m128i*)cPtr, binary_i);
+        res_f = _mm_cmpge_ps(a_val, zero_val);
+        res_i = _mm_cvtps_epi32(res_f);
+        binary_i = _mm_srli_epi32(res_i, 31);
  
-    cPtr += 4;
-    aPtr += 4;
-  }
+        _mm_store_si128((__m128i*)cPtr, binary_i);
  
-  for(number = quarter_points * 4; number < num_points; number++){
-    if( *aPtr++ >= 0) {
-      *cPtr++ = 1;
+        cPtr += 4;
+        aPtr += 4;
      }
-    else {
-      *cPtr++ = 0;
+
+    for (number = quarter_points * 4; number < num_points; number++) {
+        if (*aPtr++ >= 0) {
+            *cPtr++ = 1;
+        } else {
+            *cPtr++ = 0;
+        }
      }
-  }
  }
  #endif /* LV_HAVE_SSE2 */
  
@@ -152,41 +153,41 @@ volk_32f_binary_slicer_32i_a_sse2(int* cVector, const float* aVector, unsigned i
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32f_binary_slicer_32i_a_avx(int* cVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_binary_slicer_32i_a_avx(int* cVector,
+                                                    const float* aVector,
+                                                    unsigned int num_points)
  {
-  int* cPtr = cVector;
-  const float* aPtr = aVector;
-  unsigned int number = 0;
+    int* cPtr = cVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
  
-  unsigned int quarter_points = num_points / 8;
-  __m256 a_val, res_f, binary_f;
-  __m256i binary_i;
-  __m256 zero_val, one_val;
-  zero_val = _mm256_set1_ps (0.0f);
-  one_val = _mm256_set1_ps (1.0f);
+    unsigned int quarter_points = num_points / 8;
+    __m256 a_val, res_f, binary_f;
+    __m256i binary_i;
+    __m256 zero_val, one_val;
+    zero_val = _mm256_set1_ps(0.0f);
+    one_val = _mm256_set1_ps(1.0f);
  
-  for(number = 0; number < quarter_points; number++){
-    a_val = _mm256_load_ps(aPtr);
+    for (number = 0; number < quarter_points; number++) {
+        a_val = _mm256_load_ps(aPtr);
  
-    res_f = _mm256_cmp_ps (a_val, zero_val, _CMP_GE_OS);
-    binary_f = _mm256_and_ps (res_f, one_val);
-    binary_i = _mm256_cvtps_epi32(binary_f);
+        res_f = _mm256_cmp_ps(a_val, zero_val, _CMP_GE_OS);
+        binary_f = _mm256_and_ps(res_f, one_val);
+        binary_i = _mm256_cvtps_epi32(binary_f);
  
-    _mm256_store_si256((__m256i *)cPtr, binary_i);
+        _mm256_store_si256((__m256i*)cPtr, binary_i);
  
-    cPtr += 8;
-    aPtr += 8;
-  }
-
-  for(number = quarter_points * 8; number < num_points; number++){
-    if( *aPtr++ >= 0) {
-      *cPtr++ = 1;
+        cPtr += 8;
+        aPtr += 8;
      }
-    else {
-      *cPtr++ = 0;
+
+    for (number = quarter_points * 8; number < num_points; number++) {
+        if (*aPtr++ >= 0) {
+            *cPtr++ = 1;
+        } else {
+            *cPtr++ = 0;
+        }
      }
-  }
  }
  #endif /* LV_HAVE_AVX */
  
@@ -194,40 +195,40 @@ volk_32f_binary_slicer_32i_a_avx(int* cVector, const float* aVector, unsigned in
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void
-volk_32f_binary_slicer_32i_u_sse2(int* cVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_binary_slicer_32i_u_sse2(int* cVector,
+                                                     const float* aVector,
+                                                     unsigned int num_points)
  {
-  int* cPtr = cVector;
-  const float* aPtr = aVector;
-  unsigned int number = 0;
-
-  unsigned int quarter_points = num_points / 4;
-  __m128 a_val, res_f;
-  __m128i res_i, binary_i;
-  __m128 zero_val;
-  zero_val = _mm_set1_ps (0.0f);
+    int* cPtr = cVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
  
-  for(number = 0; number < quarter_points; number++){
-    a_val = _mm_loadu_ps(aPtr);
+    unsigned int quarter_points = num_points / 4;
+    __m128 a_val, res_f;
+    __m128i res_i, binary_i;
+    __m128 zero_val;
+    zero_val = _mm_set1_ps(0.0f);
  
-    res_f = _mm_cmpge_ps (a_val, zero_val);
-    res_i = _mm_cvtps_epi32 (res_f);
-    binary_i = _mm_srli_epi32 (res_i, 31);
+    for (number = 0; number < quarter_points; number++) {
+        a_val = _mm_loadu_ps(aPtr);
  
-    _mm_storeu_si128((__m128i*)cPtr, binary_i);
+        res_f = _mm_cmpge_ps(a_val, zero_val);
+        res_i = _mm_cvtps_epi32(res_f);
+        binary_i = _mm_srli_epi32(res_i, 31);
  
-    cPtr += 4;
-    aPtr += 4;
-  }
+        _mm_storeu_si128((__m128i*)cPtr, binary_i);
  
-  for(number = quarter_points * 4; number < num_points; number++){
-    if( *aPtr++ >= 0) {
-      *cPtr++ = 1;
+        cPtr += 4;
+        aPtr += 4;
      }
-    else {
-      *cPtr++ = 0;
+
+    for (number = quarter_points * 4; number < num_points; number++) {
+        if (*aPtr++ >= 0) {
+            *cPtr++ = 1;
+        } else {
+            *cPtr++ = 0;
+        }
      }
-  }
  }
  #endif /* LV_HAVE_SSE2 */
  
@@ -235,41 +236,41 @@ volk_32f_binary_slicer_32i_u_sse2(int* cVector, const float* aVector, unsigned i
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32f_binary_slicer_32i_u_avx(int* cVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_binary_slicer_32i_u_avx(int* cVector,
+                                                    const float* aVector,
+                                                    unsigned int num_points)
  {
-  int* cPtr = cVector;
-  const float* aPtr = aVector;
-  unsigned int number = 0;
-
-  unsigned int quarter_points = num_points / 8;
-  __m256 a_val, res_f, binary_f;
-  __m256i binary_i;
-  __m256 zero_val, one_val;
-  zero_val = _mm256_set1_ps (0.0f);
-  one_val = _mm256_set1_ps (1.0f);
+    int* cPtr = cVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
  
-  for(number = 0; number < quarter_points; number++){
-    a_val = _mm256_loadu_ps(aPtr);
+    unsigned int quarter_points = num_points / 8;
+    __m256 a_val, res_f, binary_f;
+    __m256i binary_i;
+    __m256 zero_val, one_val;
+    zero_val = _mm256_set1_ps(0.0f);
+    one_val = _mm256_set1_ps(1.0f);
  
-    res_f = _mm256_cmp_ps (a_val, zero_val, _CMP_GE_OS);
-    binary_f = _mm256_and_ps (res_f, one_val);
-    binary_i = _mm256_cvtps_epi32(binary_f);
+    for (number = 0; number < quarter_points; number++) {
+        a_val = _mm256_loadu_ps(aPtr);
  
-    _mm256_storeu_si256((__m256i*)cPtr, binary_i);
+        res_f = _mm256_cmp_ps(a_val, zero_val, _CMP_GE_OS);
+        binary_f = _mm256_and_ps(res_f, one_val);
+        binary_i = _mm256_cvtps_epi32(binary_f);
  
-    cPtr += 8;
-    aPtr += 8;
-  }
+        _mm256_storeu_si256((__m256i*)cPtr, binary_i);
  
-  for(number = quarter_points * 8; number < num_points; number++){
-    if( *aPtr++ >= 0) {
-      *cPtr++ = 1;
+        cPtr += 8;
+        aPtr += 8;
      }
-    else {
-      *cPtr++ = 0;
+
+    for (number = quarter_points * 8; number < num_points; number++) {
+        if (*aPtr++ >= 0) {
+            *cPtr++ = 1;
+        } else {
+            *cPtr++ = 0;
+        }
      }
-  }
  }
  #endif /* LV_HAVE_AVX */
  
diff --git a/kernels/volk/volk_32f_binary_slicer_8i.h b/kernels/volk/volk_32f_binary_slicer_8i.h

index 5920621fcf18ccebc34ad0fb103046bba4746984..3eddb5c6b086d1cc008e832d623106cb2afba574 100644 (file)
--- a/kernels/volk/volk_32f_binary_slicer_8i.h
+++ b/kernels/volk/volk_32f_binary_slicer_8i.h
@@ -30,7 +30,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32f_binary_slicer_8i(int8_t* cVector, const float* aVector, unsigned int num_points)
+ * void volk_32f_binary_slicer_8i(int8_t* cVector, const float* aVector, unsigned int
+ num_points)
   * \endcode
   *
   * \b Inputs
@@ -74,39 +75,38 @@
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32f_binary_slicer_8i_generic(int8_t* cVector, const float* aVector,
-                                  unsigned int num_points)
+static inline void volk_32f_binary_slicer_8i_generic(int8_t* cVector,
+                                                     const float* aVector,
+                                                     unsigned int num_points)
  {
-  int8_t* cPtr = cVector;
-  const float* aPtr = aVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++) {
-    if(*aPtr++ >= 0) {
-      *cPtr++ = 1;
-    }
-    else {
-      *cPtr++ = 0;
+    int8_t* cPtr = cVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        if (*aPtr++ >= 0) {
+            *cPtr++ = 1;
+        } else {
+            *cPtr++ = 0;
+        }
      }
-  }
  }
  #endif /* LV_HAVE_GENERIC */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector, const float* aVector,
-                                             unsigned int num_points)
+static inline void volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector,
+                                                                const float* aVector,
+                                                                unsigned int num_points)
  {
-  int8_t* cPtr = cVector;
-  const float* aPtr = aVector;
-  unsigned int number = 0;
+    int8_t* cPtr = cVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
  
-  for(number = 0; number < num_points; number++){
-    *cPtr++ = (*aPtr++ >= 0);
-  }
+    for (number = 0; number < num_points; number++) {
+        *cPtr++ = (*aPtr++ >= 0);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -114,279 +114,329 @@ volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector, const float* aVect
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_32f_binary_slicer_8i_a_avx2(int8_t* cVector, const float* aVector,
-                                 unsigned int num_points)
+static inline void volk_32f_binary_slicer_8i_a_avx2(int8_t* cVector,
+                                                    const float* aVector,
+                                                    unsigned int num_points)
  {
-  int8_t* cPtr = cVector;
-  const float* aPtr = aVector;
-  unsigned int number = 0;
-  unsigned int n32points = num_points / 32;
-
-  const __m256 zero_val = _mm256_set1_ps(0.0f);
-  __m256 a0_val, a1_val, a2_val, a3_val;
-  __m256 res0_f, res1_f, res2_f, res3_f;
-  __m256i res0_i, res1_i, res2_i, res3_i;
-  __m256i byte_shuffle = _mm256_set_epi8( 15, 14, 13, 12, 7, 6, 5, 4,
-                                        11, 10, 9, 8, 3, 2, 1, 0,
-                                        15, 14, 13, 12, 7, 6, 5, 4,
-                                        11, 10, 9, 8, 3, 2, 1, 0);
-
-  for(number = 0; number < n32points; number++) {
-    a0_val = _mm256_load_ps(aPtr);
-    a1_val = _mm256_load_ps(aPtr+8);
-    a2_val = _mm256_load_ps(aPtr+16);
-    a3_val = _mm256_load_ps(aPtr+24);
-
-    // compare >= 0; return float
-    res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
-    res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
-    res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
-    res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);
-
-    // convert to 32i and >> 31
-    res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
-    res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
-    res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
-    res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);
-
-    // pack in to 16-bit results
-    res0_i = _mm256_packs_epi32(res0_i, res1_i);
-    res2_i = _mm256_packs_epi32(res2_i, res3_i);
-    // pack in to 8-bit results
-    // res0: (after packs_epi32)
-    //  a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
-    // res2:
-    //  c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
-    res0_i = _mm256_packs_epi16(res0_i, res2_i);
-    // shuffle the lanes
-    // res0: (after packs_epi16)
-    //  a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3
-    //  a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7
-    //   0, 2, 1, 3 -> 11 01 10 00 (0xd8)
-    res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);
-
-    // shuffle bytes within lanes
-    // res0: (after shuffle_epi8)
-    //  a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
-    //  c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
-    res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);
-
-    _mm256_store_si256((__m256i*)cPtr, res0_i);
-    aPtr += 32;
-    cPtr += 32;
-  }
-
-  for(number = n32points * 32; number < num_points; number++) {
-    if( *aPtr++ >= 0) {
-      *cPtr++ = 1;
+    int8_t* cPtr = cVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
+    unsigned int n32points = num_points / 32;
+
+    const __m256 zero_val = _mm256_set1_ps(0.0f);
+    __m256 a0_val, a1_val, a2_val, a3_val;
+    __m256 res0_f, res1_f, res2_f, res3_f;
+    __m256i res0_i, res1_i, res2_i, res3_i;
+    __m256i byte_shuffle = _mm256_set_epi8(15,
+                                           14,
+                                           13,
+                                           12,
+                                           7,
+                                           6,
+                                           5,
+                                           4,
+                                           11,
+                                           10,
+                                           9,
+                                           8,
+                                           3,
+                                           2,
+                                           1,
+                                           0,
+                                           15,
+                                           14,
+                                           13,
+                                           12,
+                                           7,
+                                           6,
+                                           5,
+                                           4,
+                                           11,
+                                           10,
+                                           9,
+                                           8,
+                                           3,
+                                           2,
+                                           1,
+                                           0);
+
+    for (number = 0; number < n32points; number++) {
+        a0_val = _mm256_load_ps(aPtr);
+        a1_val = _mm256_load_ps(aPtr + 8);
+        a2_val = _mm256_load_ps(aPtr + 16);
+        a3_val = _mm256_load_ps(aPtr + 24);
+
+        // compare >= 0; return float
+        res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
+        res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
+        res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
+        res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);
+
+        // convert to 32i and >> 31
+        res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
+        res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
+        res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
+        res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);
+
+        // pack in to 16-bit results
+        res0_i = _mm256_packs_epi32(res0_i, res1_i);
+        res2_i = _mm256_packs_epi32(res2_i, res3_i);
+        // pack in to 8-bit results
+        // res0: (after packs_epi32)
+        //  a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
+        // res2:
+        //  c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
+        res0_i = _mm256_packs_epi16(res0_i, res2_i);
+        // shuffle the lanes
+        // res0: (after packs_epi16)
+        //  a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3
+        //  a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7
+        //   0, 2, 1, 3 -> 11 01 10 00 (0xd8)
+        res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);
+
+        // shuffle bytes within lanes
+        // res0: (after shuffle_epi8)
+        //  a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
+        //  c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
+        res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);
+
+        _mm256_store_si256((__m256i*)cPtr, res0_i);
+        aPtr += 32;
+        cPtr += 32;
      }
-    else {
-      *cPtr++ = 0;
+
+    for (number = n32points * 32; number < num_points; number++) {
+        if (*aPtr++ >= 0) {
+            *cPtr++ = 1;
+        } else {
+            *cPtr++ = 0;
+        }
      }
-  }
  }
  #endif
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_32f_binary_slicer_8i_u_avx2(int8_t* cVector, const float* aVector,
-                                 unsigned int num_points)
+static inline void volk_32f_binary_slicer_8i_u_avx2(int8_t* cVector,
+                                                    const float* aVector,
+                                                    unsigned int num_points)
  {
-  int8_t* cPtr = cVector;
-  const float* aPtr = aVector;
-  unsigned int number = 0;
-  unsigned int n32points = num_points / 32;
-
-  const __m256 zero_val = _mm256_set1_ps(0.0f);
-  __m256 a0_val, a1_val, a2_val, a3_val;
-  __m256 res0_f, res1_f, res2_f, res3_f;
-  __m256i res0_i, res1_i, res2_i, res3_i;
-  __m256i byte_shuffle = _mm256_set_epi8( 15, 14, 13, 12, 7, 6, 5, 4,
-                                        11, 10, 9, 8, 3, 2, 1, 0,
-                                        15, 14, 13, 12, 7, 6, 5, 4,
-                                        11, 10, 9, 8, 3, 2, 1, 0);
-
-  for(number = 0; number < n32points; number++) {
-    a0_val = _mm256_loadu_ps(aPtr);
-    a1_val = _mm256_loadu_ps(aPtr+8);
-    a2_val = _mm256_loadu_ps(aPtr+16);
-    a3_val = _mm256_loadu_ps(aPtr+24);
-
-    // compare >= 0; return float
-    res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
-    res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
-    res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
-    res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);
-
-    // convert to 32i and >> 31
-    res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
-    res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
-    res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
-    res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);
-
-    // pack in to 16-bit results
-    res0_i = _mm256_packs_epi32(res0_i, res1_i);
-    res2_i = _mm256_packs_epi32(res2_i, res3_i);
-    // pack in to 8-bit results
-    // res0: (after packs_epi32)
-    //  a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
-    // res2:
-    //  c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
-    res0_i = _mm256_packs_epi16(res0_i, res2_i);
-    // shuffle the lanes
-    // res0: (after packs_epi16)
-    //  a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3
-    //  a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7
-    //   0, 2, 1, 3 -> 11 01 10 00 (0xd8)
-    res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);
-
-    // shuffle bytes within lanes
-    // res0: (after shuffle_epi8)
-    //  a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
-    //  c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
-    res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);
-
-    _mm256_storeu_si256((__m256i*)cPtr, res0_i);
-    aPtr += 32;
-    cPtr += 32;
-  }
-
-  for(number = n32points * 32; number < num_points; number++) {
-    if( *aPtr++ >= 0) {
-      *cPtr++ = 1;
+    int8_t* cPtr = cVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
+    unsigned int n32points = num_points / 32;
+
+    const __m256 zero_val = _mm256_set1_ps(0.0f);
+    __m256 a0_val, a1_val, a2_val, a3_val;
+    __m256 res0_f, res1_f, res2_f, res3_f;
+    __m256i res0_i, res1_i, res2_i, res3_i;
+    __m256i byte_shuffle = _mm256_set_epi8(15,
+                                           14,
+                                           13,
+                                           12,
+                                           7,
+                                           6,
+                                           5,
+                                           4,
+                                           11,
+                                           10,
+                                           9,
+                                           8,
+                                           3,
+                                           2,
+                                           1,
+                                           0,
+                                           15,
+                                           14,
+                                           13,
+                                           12,
+                                           7,
+                                           6,
+                                           5,
+                                           4,
+                                           11,
+                                           10,
+                                           9,
+                                           8,
+                                           3,
+                                           2,
+                                           1,
+                                           0);
+
+    for (number = 0; number < n32points; number++) {
+        a0_val = _mm256_loadu_ps(aPtr);
+        a1_val = _mm256_loadu_ps(aPtr + 8);
+        a2_val = _mm256_loadu_ps(aPtr + 16);
+        a3_val = _mm256_loadu_ps(aPtr + 24);
+
+        // compare >= 0; return float
+        res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
+        res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
+        res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
+        res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);
+
+        // convert to 32i and >> 31
+        res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
+        res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
+        res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
+        res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);
+
+        // pack in to 16-bit results
+        res0_i = _mm256_packs_epi32(res0_i, res1_i);
+        res2_i = _mm256_packs_epi32(res2_i, res3_i);
+        // pack in to 8-bit results
+        // res0: (after packs_epi32)
+        //  a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
+        // res2:
+        //  c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
+        res0_i = _mm256_packs_epi16(res0_i, res2_i);
+        // shuffle the lanes
+        // res0: (after packs_epi16)
+        //  a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3
+        //  a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7
+        //   0, 2, 1, 3 -> 11 01 10 00 (0xd8)
+        res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);
+
+        // shuffle bytes within lanes
+        // res0: (after shuffle_epi8)
+        //  a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
+        //  c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
+        res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);
+
+        _mm256_storeu_si256((__m256i*)cPtr, res0_i);
+        aPtr += 32;
+        cPtr += 32;
      }
-    else {
-      *cPtr++ = 0;
+
+    for (number = n32points * 32; number < num_points; number++) {
+        if (*aPtr++ >= 0) {
+            *cPtr++ = 1;
+        } else {
+            *cPtr++ = 0;
+        }
      }
-  }
  }
  #endif
  
  
-
  #ifdef LV_HAVE_SSE2
  
  #include <emmintrin.h>
  
-static inline void
-volk_32f_binary_slicer_8i_a_sse2(int8_t* cVector, const float* aVector,
-                                 unsigned int num_points)
+static inline void volk_32f_binary_slicer_8i_a_sse2(int8_t* cVector,
+                                                    const float* aVector,
+                                                    unsigned int num_points)
  {
-  int8_t* cPtr = cVector;
-  const float* aPtr = aVector;
-  unsigned int number = 0;
-
-  unsigned int n16points = num_points / 16;
-  __m128 a0_val, a1_val, a2_val, a3_val;
-  __m128 res0_f, res1_f, res2_f, res3_f;
-  __m128i res0_i, res1_i, res2_i, res3_i;
-  __m128 zero_val;
-  zero_val = _mm_set1_ps(0.0f);
-
-  for(number = 0; number < n16points; number++) {
-    a0_val = _mm_load_ps(aPtr);
-    a1_val = _mm_load_ps(aPtr+4);
-    a2_val = _mm_load_ps(aPtr+8);
-    a3_val = _mm_load_ps(aPtr+12);
-
-    // compare >= 0; return float
-    res0_f = _mm_cmpge_ps(a0_val, zero_val);
-    res1_f = _mm_cmpge_ps(a1_val, zero_val);
-    res2_f = _mm_cmpge_ps(a2_val, zero_val);
-    res3_f = _mm_cmpge_ps(a3_val, zero_val);
-
-    // convert to 32i and >> 31
-    res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
-    res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
-    res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
-    res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);
-
-    // pack into 16-bit results
-    res0_i = _mm_packs_epi32(res0_i, res1_i);
-    res2_i = _mm_packs_epi32(res2_i, res3_i);
-
-    // pack into 8-bit results
-    res0_i = _mm_packs_epi16(res0_i, res2_i);
-
-    _mm_store_si128((__m128i*)cPtr, res0_i);
-
-    cPtr += 16;
-    aPtr += 16;
-  }
-
-  for(number = n16points * 16; number < num_points; number++) {
-    if( *aPtr++ >= 0) {
-      *cPtr++ = 1;
+    int8_t* cPtr = cVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
+
+    unsigned int n16points = num_points / 16;
+    __m128 a0_val, a1_val, a2_val, a3_val;
+    __m128 res0_f, res1_f, res2_f, res3_f;
+    __m128i res0_i, res1_i, res2_i, res3_i;
+    __m128 zero_val;
+    zero_val = _mm_set1_ps(0.0f);
+
+    for (number = 0; number < n16points; number++) {
+        a0_val = _mm_load_ps(aPtr);
+        a1_val = _mm_load_ps(aPtr + 4);
+        a2_val = _mm_load_ps(aPtr + 8);
+        a3_val = _mm_load_ps(aPtr + 12);
+
+        // compare >= 0; return float
+        res0_f = _mm_cmpge_ps(a0_val, zero_val);
+        res1_f = _mm_cmpge_ps(a1_val, zero_val);
+        res2_f = _mm_cmpge_ps(a2_val, zero_val);
+        res3_f = _mm_cmpge_ps(a3_val, zero_val);
+
+        // convert to 32i and >> 31
+        res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
+        res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
+        res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
+        res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);
+
+        // pack into 16-bit results
+        res0_i = _mm_packs_epi32(res0_i, res1_i);
+        res2_i = _mm_packs_epi32(res2_i, res3_i);
+
+        // pack into 8-bit results
+        res0_i = _mm_packs_epi16(res0_i, res2_i);
+
+        _mm_store_si128((__m128i*)cPtr, res0_i);
+
+        cPtr += 16;
+        aPtr += 16;
      }
-    else {
-      *cPtr++ = 0;
+
+    for (number = n16points * 16; number < num_points; number++) {
+        if (*aPtr++ >= 0) {
+            *cPtr++ = 1;
+        } else {
+            *cPtr++ = 0;
+        }
      }
-  }
  }
  #endif /* LV_HAVE_SSE2 */
  
  
-
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void
-volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector, const float* aVector,
-                                  unsigned int num_points)
+static inline void volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector,
+                                                    const float* aVector,
+                                                    unsigned int num_points)
  {
-  int8_t* cPtr = cVector;
-  const float* aPtr = aVector;
-  unsigned int number = 0;
-
-  unsigned int n16points = num_points / 16;
-  __m128 a0_val, a1_val, a2_val, a3_val;
-  __m128 res0_f, res1_f, res2_f, res3_f;
-  __m128i res0_i, res1_i, res2_i, res3_i;
-  __m128 zero_val;
-  zero_val = _mm_set1_ps (0.0f);
-
-  for(number = 0; number < n16points; number++) {
-    a0_val = _mm_loadu_ps(aPtr);
-    a1_val = _mm_loadu_ps(aPtr+4);
-    a2_val = _mm_loadu_ps(aPtr+8);
-    a3_val = _mm_loadu_ps(aPtr+12);
-
-    // compare >= 0; return float
-    res0_f = _mm_cmpge_ps(a0_val, zero_val);
-    res1_f = _mm_cmpge_ps(a1_val, zero_val);
-    res2_f = _mm_cmpge_ps(a2_val, zero_val);
-    res3_f = _mm_cmpge_ps(a3_val, zero_val);
-
-    // convert to 32i and >> 31
-    res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
-    res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
-    res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
-    res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);
-
-    // pack into 16-bit results
-    res0_i = _mm_packs_epi32(res0_i, res1_i);
-    res2_i = _mm_packs_epi32(res2_i, res3_i);
-
-    // pack into 8-bit results
-    res0_i = _mm_packs_epi16(res0_i, res2_i);
-
-    _mm_storeu_si128((__m128i*)cPtr, res0_i);
-
-    cPtr += 16;
-    aPtr += 16;
-  }
-
-  for(number = n16points * 16; number < num_points; number++) {
-    if( *aPtr++ >= 0) {
-      *cPtr++ = 1;
+    int8_t* cPtr = cVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
+
+    unsigned int n16points = num_points / 16;
+    __m128 a0_val, a1_val, a2_val, a3_val;
+    __m128 res0_f, res1_f, res2_f, res3_f;
+    __m128i res0_i, res1_i, res2_i, res3_i;
+    __m128 zero_val;
+    zero_val = _mm_set1_ps(0.0f);
+
+    for (number = 0; number < n16points; number++) {
+        a0_val = _mm_loadu_ps(aPtr);
+        a1_val = _mm_loadu_ps(aPtr + 4);
+        a2_val = _mm_loadu_ps(aPtr + 8);
+        a3_val = _mm_loadu_ps(aPtr + 12);
+
+        // compare >= 0; return float
+        res0_f = _mm_cmpge_ps(a0_val, zero_val);
+        res1_f = _mm_cmpge_ps(a1_val, zero_val);
+        res2_f = _mm_cmpge_ps(a2_val, zero_val);
+        res3_f = _mm_cmpge_ps(a3_val, zero_val);
+
+        // convert to 32i and >> 31
+        res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
+        res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
+        res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
+        res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);
+
+        // pack into 16-bit results
+        res0_i = _mm_packs_epi32(res0_i, res1_i);
+        res2_i = _mm_packs_epi32(res2_i, res3_i);
+
+        // pack into 8-bit results
+        res0_i = _mm_packs_epi16(res0_i, res2_i);
+
+        _mm_storeu_si128((__m128i*)cPtr, res0_i);
+
+        cPtr += 16;
+        aPtr += 16;
      }
-    else {
-      *cPtr++ = 0;
+
+    for (number = n16points * 16; number < num_points; number++) {
+        if (*aPtr++ >= 0) {
+            *cPtr++ = 1;
+        } else {
+            *cPtr++ = 0;
+        }
      }
-  }
  }
  #endif /* LV_HAVE_SSE2 */
  
@@ -394,74 +444,72 @@ volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector, const float* aVector,
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void
-volk_32f_binary_slicer_8i_neon(int8_t* cVector, const float* aVector,
-                                  unsigned int num_points)
+static inline void volk_32f_binary_slicer_8i_neon(int8_t* cVector,
+                                                  const float* aVector,
+                                                  unsigned int num_points)
  {
-  int8_t* cPtr = cVector;
-  const float* aPtr = aVector;
-  unsigned int number = 0;
-  unsigned int n16points = num_points / 16;
-
-  float32x4x2_t input_val0, input_val1;
-  float32x4_t zero_val;
-  uint32x4x2_t res0_u32, res1_u32;
-  uint16x4x2_t res0_u16x4, res1_u16x4;
-  uint16x8x2_t res_u16x8;
-  uint8x8x2_t res_u8;
-  uint8x8_t one;
-
-  zero_val = vdupq_n_f32(0.0);
-  one = vdup_n_u8(0x01);
-
-  // TODO: this is a good candidate for asm because the vcombines
-  // can be eliminated simply by picking dst registers that are
-  // adjacent.
-  for(number = 0; number < n16points; number++) {
-    input_val0 = vld2q_f32(aPtr);
-    input_val1 = vld2q_f32(aPtr+8);
-
-    // test against 0; return uint32
-    res0_u32.val[0] = vcgeq_f32(input_val0.val[0], zero_val);
-    res0_u32.val[1] = vcgeq_f32(input_val0.val[1], zero_val);
-    res1_u32.val[0] = vcgeq_f32(input_val1.val[0], zero_val);
-    res1_u32.val[1] = vcgeq_f32(input_val1.val[1], zero_val);
-
-    // narrow uint32 -> uint16 followed by combine to 8-element vectors
-    res0_u16x4.val[0] = vmovn_u32(res0_u32.val[0]);
-    res0_u16x4.val[1] = vmovn_u32(res0_u32.val[1]);
-    res1_u16x4.val[0] = vmovn_u32(res1_u32.val[0]);
-    res1_u16x4.val[1] = vmovn_u32(res1_u32.val[1]);
-
-    res_u16x8.val[0] = vcombine_u16(res0_u16x4.val[0], res1_u16x4.val[0]);
-    res_u16x8.val[1] = vcombine_u16(res0_u16x4.val[1], res1_u16x4.val[1]);
-
-    // narrow uint16x8 -> uint8x8
-    res_u8.val[0] = vmovn_u16(res_u16x8.val[0]);
-    res_u8.val[1] = vmovn_u16(res_u16x8.val[1]);
-    // we *could* load twice as much data and do another vcombine here
-    // to get a uint8x16x2 vector, still only do 2 vandqs and a single store
-    // but that turns out to be ~16% slower than this version on zc702
-    // it's possible register contention in GCC scheduler slows it down
-    // and a hand-written asm with quad-word u8 registers is much faster.
-
-    res_u8.val[0] = vand_u8(one, res_u8.val[0]);
-    res_u8.val[1] = vand_u8(one, res_u8.val[1]);
-
-    vst2_u8((unsigned char*)cPtr, res_u8);
-    cPtr += 16;
-    aPtr += 16;
-
-  }
-
-  for(number = n16points * 16; number < num_points; number++) {
-    if(*aPtr++ >= 0) {
-      *cPtr++ = 1;
+    int8_t* cPtr = cVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
+    unsigned int n16points = num_points / 16;
+
+    float32x4x2_t input_val0, input_val1;
+    float32x4_t zero_val;
+    uint32x4x2_t res0_u32, res1_u32;
+    uint16x4x2_t res0_u16x4, res1_u16x4;
+    uint16x8x2_t res_u16x8;
+    uint8x8x2_t res_u8;
+    uint8x8_t one;
+
+    zero_val = vdupq_n_f32(0.0);
+    one = vdup_n_u8(0x01);
+
+    // TODO: this is a good candidate for asm because the vcombines
+    // can be eliminated simply by picking dst registers that are
+    // adjacent.
+    for (number = 0; number < n16points; number++) {
+        input_val0 = vld2q_f32(aPtr);
+        input_val1 = vld2q_f32(aPtr + 8);
+
+        // test against 0; return uint32
+        res0_u32.val[0] = vcgeq_f32(input_val0.val[0], zero_val);
+        res0_u32.val[1] = vcgeq_f32(input_val0.val[1], zero_val);
+        res1_u32.val[0] = vcgeq_f32(input_val1.val[0], zero_val);
+        res1_u32.val[1] = vcgeq_f32(input_val1.val[1], zero_val);
+
+        // narrow uint32 -> uint16 followed by combine to 8-element vectors
+        res0_u16x4.val[0] = vmovn_u32(res0_u32.val[0]);
+        res0_u16x4.val[1] = vmovn_u32(res0_u32.val[1]);
+        res1_u16x4.val[0] = vmovn_u32(res1_u32.val[0]);
+        res1_u16x4.val[1] = vmovn_u32(res1_u32.val[1]);
+
+        res_u16x8.val[0] = vcombine_u16(res0_u16x4.val[0], res1_u16x4.val[0]);
+        res_u16x8.val[1] = vcombine_u16(res0_u16x4.val[1], res1_u16x4.val[1]);
+
+        // narrow uint16x8 -> uint8x8
+        res_u8.val[0] = vmovn_u16(res_u16x8.val[0]);
+        res_u8.val[1] = vmovn_u16(res_u16x8.val[1]);
+        // we *could* load twice as much data and do another vcombine here
+        // to get a uint8x16x2 vector, still only do 2 vandqs and a single store
+        // but that turns out to be ~16% slower than this version on zc702
+        // it's possible register contention in GCC scheduler slows it down
+        // and a hand-written asm with quad-word u8 registers is much faster.
+
+        res_u8.val[0] = vand_u8(one, res_u8.val[0]);
+        res_u8.val[1] = vand_u8(one, res_u8.val[1]);
+
+        vst2_u8((unsigned char*)cPtr, res_u8);
+        cPtr += 16;
+        aPtr += 16;
      }
-    else {
-      *cPtr++ = 0;
+
+    for (number = n16points * 16; number < num_points; number++) {
+        if (*aPtr++ >= 0) {
+            *cPtr++ = 1;
+        } else {
+            *cPtr++ = 0;
+        }
      }
-  }
  }
  #endif /* LV_HAVE_NEON */
  
diff --git a/kernels/volk/volk_32f_convert_64f.h b/kernels/volk/volk_32f_convert_64f.h

index bf57e3a2d18cc8f4a513e0a6e00c3a2b410dbc6b..d2e3f8a4bffc6f255e44223ced32a617ce69d68e 100644 (file)
--- a/kernels/volk/volk_32f_convert_64f.h
+++ b/kernels/volk/volk_32f_convert_64f.h
@@ -29,8 +29,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32f_convert_64f(double* outputVector, const float* inputVector, unsigned int num_points)
- * \endcode
+ * void volk_32f_convert_64f(double* outputVector, const float* inputVector, unsigned int
+ * num_points) \endcode
   *
   * \b Inputs
   * \li inputVector: The vector of floats to convert to doubles.
@@ -72,29 +72,33 @@
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void volk_32f_convert_64f_u_avx(double* outputVector, const float* inputVector, unsigned int num_points){
-  unsigned int number = 0;
+static inline void volk_32f_convert_64f_u_avx(double* outputVector,
+                                              const float* inputVector,
+                                              unsigned int num_points)
+{
+    unsigned int number = 0;
  
-  const unsigned int quarterPoints = num_points / 4;
+    const unsigned int quarterPoints = num_points / 4;
  
-  const float* inputVectorPtr = (const float*)inputVector;
-  double* outputVectorPtr = outputVector;
-  __m256d ret;
-  __m128 inputVal;
+    const float* inputVectorPtr = (const float*)inputVector;
+    double* outputVectorPtr = outputVector;
+    __m256d ret;
+    __m128 inputVal;
  
-  for(;number < quarterPoints; number++){
-    inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+    for (; number < quarterPoints; number++) {
+        inputVal = _mm_loadu_ps(inputVectorPtr);
+        inputVectorPtr += 4;
  
-    ret = _mm256_cvtps_pd(inputVal);
-    _mm256_storeu_pd(outputVectorPtr, ret);
+        ret = _mm256_cvtps_pd(inputVal);
+        _mm256_storeu_pd(outputVectorPtr, ret);
  
-    outputVectorPtr += 4;
-  }
+        outputVectorPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    outputVector[number] = (double)(inputVector[number]);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        outputVector[number] = (double)(inputVector[number]);
+    }
  }
  
  #endif /* LV_HAVE_AVX */
@@ -102,56 +106,61 @@ static inline void volk_32f_convert_64f_u_avx(double* outputVector, const float*
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void volk_32f_convert_64f_u_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
-  unsigned int number = 0;
+static inline void volk_32f_convert_64f_u_sse2(double* outputVector,
+                                               const float* inputVector,
+                                               unsigned int num_points)
+{
+    unsigned int number = 0;
  
-  const unsigned int quarterPoints = num_points / 4;
+    const unsigned int quarterPoints = num_points / 4;
  
-  const float* inputVectorPtr = (const float*)inputVector;
-  double* outputVectorPtr = outputVector;
-  __m128d ret;
-  __m128 inputVal;
+    const float* inputVectorPtr = (const float*)inputVector;
+    double* outputVectorPtr = outputVector;
+    __m128d ret;
+    __m128 inputVal;
  
-  for(;number < quarterPoints; number++){
-    inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+    for (; number < quarterPoints; number++) {
+        inputVal = _mm_loadu_ps(inputVectorPtr);
+        inputVectorPtr += 4;
  
-    ret = _mm_cvtps_pd(inputVal);
+        ret = _mm_cvtps_pd(inputVal);
  
-    _mm_storeu_pd(outputVectorPtr, ret);
-    outputVectorPtr += 2;
+        _mm_storeu_pd(outputVectorPtr, ret);
+        outputVectorPtr += 2;
  
-    inputVal = _mm_movehl_ps(inputVal, inputVal);
+        inputVal = _mm_movehl_ps(inputVal, inputVal);
  
-    ret = _mm_cvtps_pd(inputVal);
+        ret = _mm_cvtps_pd(inputVal);
  
-    _mm_storeu_pd(outputVectorPtr, ret);
-    outputVectorPtr += 2;
-  }
+        _mm_storeu_pd(outputVectorPtr, ret);
+        outputVectorPtr += 2;
+    }
  
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    outputVector[number] = (double)(inputVector[number]);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        outputVector[number] = (double)(inputVector[number]);
+    }
  }
  #endif /* LV_HAVE_SSE2 */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void volk_32f_convert_64f_generic(double* outputVector, const float* inputVector, unsigned int num_points){
-  double* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((double)(*inputVectorPtr++));
-  }
+static inline void volk_32f_convert_64f_generic(double* outputVector,
+                                                const float* inputVector,
+                                                unsigned int num_points)
+{
+    double* outputVectorPtr = outputVector;
+    const float* inputVectorPtr = inputVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        *outputVectorPtr++ = ((double)(*inputVectorPtr++));
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  
-
-
  #endif /* INCLUDED_volk_32f_convert_64f_u_H */
  
  
@@ -164,83 +173,92 @@ static inline void volk_32f_convert_64f_generic(double* outputVector, const floa
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void volk_32f_convert_64f_a_avx(double* outputVector, const float* inputVector, unsigned int num_points){
-  unsigned int number = 0;
+static inline void volk_32f_convert_64f_a_avx(double* outputVector,
+                                              const float* inputVector,
+                                              unsigned int num_points)
+{
+    unsigned int number = 0;
  
-  const unsigned int quarterPoints = num_points / 4;
+    const unsigned int quarterPoints = num_points / 4;
  
-  const float* inputVectorPtr = (const float*)inputVector;
-  double* outputVectorPtr = outputVector;
-  __m256d ret;
-  __m128 inputVal;
+    const float* inputVectorPtr = (const float*)inputVector;
+    double* outputVectorPtr = outputVector;
+    __m256d ret;
+    __m128 inputVal;
  
-  for(;number < quarterPoints; number++){
-    inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+    for (; number < quarterPoints; number++) {
+        inputVal = _mm_load_ps(inputVectorPtr);
+        inputVectorPtr += 4;
  
-    ret = _mm256_cvtps_pd(inputVal);
-    _mm256_store_pd(outputVectorPtr, ret);
+        ret = _mm256_cvtps_pd(inputVal);
+        _mm256_store_pd(outputVectorPtr, ret);
  
-    outputVectorPtr += 4;
-  }
+        outputVectorPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    outputVector[number] = (double)(inputVector[number]);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        outputVector[number] = (double)(inputVector[number]);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void volk_32f_convert_64f_a_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
-  unsigned int number = 0;
+static inline void volk_32f_convert_64f_a_sse2(double* outputVector,
+                                               const float* inputVector,
+                                               unsigned int num_points)
+{
+    unsigned int number = 0;
  
-  const unsigned int quarterPoints = num_points / 4;
+    const unsigned int quarterPoints = num_points / 4;
  
-  const float* inputVectorPtr = (const float*)inputVector;
-  double* outputVectorPtr = outputVector;
-  __m128d ret;
-  __m128 inputVal;
+    const float* inputVectorPtr = (const float*)inputVector;
+    double* outputVectorPtr = outputVector;
+    __m128d ret;
+    __m128 inputVal;
  
-  for(;number < quarterPoints; number++){
-    inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+    for (; number < quarterPoints; number++) {
+        inputVal = _mm_load_ps(inputVectorPtr);
+        inputVectorPtr += 4;
  
-    ret = _mm_cvtps_pd(inputVal);
+        ret = _mm_cvtps_pd(inputVal);
  
-    _mm_store_pd(outputVectorPtr, ret);
-    outputVectorPtr += 2;
+        _mm_store_pd(outputVectorPtr, ret);
+        outputVectorPtr += 2;
  
-    inputVal = _mm_movehl_ps(inputVal, inputVal);
+        inputVal = _mm_movehl_ps(inputVal, inputVal);
  
-    ret = _mm_cvtps_pd(inputVal);
+        ret = _mm_cvtps_pd(inputVal);
  
-    _mm_store_pd(outputVectorPtr, ret);
-    outputVectorPtr += 2;
-  }
+        _mm_store_pd(outputVectorPtr, ret);
+        outputVectorPtr += 2;
+    }
  
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    outputVector[number] = (double)(inputVector[number]);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        outputVector[number] = (double)(inputVector[number]);
+    }
  }
  #endif /* LV_HAVE_SSE2 */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void volk_32f_convert_64f_a_generic(double* outputVector, const float* inputVector, unsigned int num_points){
-  double* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((double)(*inputVectorPtr++));
-  }
+static inline void volk_32f_convert_64f_a_generic(double* outputVector,
+                                                  const float* inputVector,
+                                                  unsigned int num_points)
+{
+    double* outputVectorPtr = outputVector;
+    const float* inputVectorPtr = inputVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        *outputVectorPtr++ = ((double)(*inputVectorPtr++));
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  
-
-
  #endif /* INCLUDED_volk_32f_convert_64f_a_H */
diff --git a/kernels/volk/volk_32f_cos_32f.h b/kernels/volk/volk_32f_cos_32f.h

index 39c2008e343c7807e6a592e71e9778d29f82e76a..b49376467cc045523451f7214a098f231e1a52c9 100644 (file)
--- a/kernels/volk/volk_32f_cos_32f.h
+++ b/kernels/volk/volk_32f_cos_32f.h
@@ -69,9 +69,9 @@
   * \endcode
   */
  
-#include <stdio.h>
-#include <math.h>
  #include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
  
  #ifndef INCLUDED_volk_32f_cos_32f_a_H
  #define INCLUDED_volk_32f_cos_32f_a_H
@@ -80,86 +80,102 @@
  #include <immintrin.h>
  
  static inline void
- volk_32f_cos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+volk_32f_cos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int eighthPoints = num_points / 8;
-  unsigned int i = 0;
-
-  __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
-  __m256 sine, cosine;
-  __m256i q, ones, twos, fours;
-
-  m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
-  pio4A = _mm256_set1_ps(0.7853981554508209228515625);
-  pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
-  pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
-  ffours = _mm256_set1_ps(4.0);
-  ftwos = _mm256_set1_ps(2.0);
-  fones = _mm256_set1_ps(1.0);
-  fzeroes = _mm256_setzero_ps();
-  __m256i zeroes = _mm256_set1_epi32(0);
-  ones = _mm256_set1_epi32(1);
-  __m256i allones = _mm256_set1_epi32(0xffffffff);
-  twos = _mm256_set1_epi32(2);
-  fours = _mm256_set1_epi32(4);
-
-  cp1 = _mm256_set1_ps(1.0);
-  cp2 = _mm256_set1_ps(0.08333333333333333);
-  cp3 = _mm256_set1_ps(0.002777777777777778);
-  cp4 = _mm256_set1_ps(4.96031746031746e-05);
-  cp5 = _mm256_set1_ps(5.511463844797178e-07);
-  union bit256 condition1;
-  union bit256 condition3;
-
-  for(;number < eighthPoints; number++){
-
-    aVal = _mm256_load_ps(aPtr);
-    // s = fabs(aVal)
-    s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
-    // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
-    q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
-    // r = q + q&1, q indicates quadrant, r gives
-    r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
-
-    s = _mm256_fnmadd_ps(r,pio4A,s);
-    s = _mm256_fnmadd_ps(r,pio4B,s);
-    s = _mm256_fnmadd_ps(r,pio4C,s);
-
-    s = _mm256_div_ps(s, _mm256_set1_ps(8.0));    // The constant is 2^N, for 3 times argument reduction
-    s = _mm256_mul_ps(s, s);
-    // Evaluate Taylor series
-    s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s);
-
-    for(i = 0; i < 3; i++)
-      s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
-    s = _mm256_div_ps(s, ftwos);
-
-    sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
-    cosine = _mm256_sub_ps(fones, s);
-
-    // if(((q+1)&2) != 0) { cosine=sine;}
-    condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
-    condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
-
-    // if(((q+2)&4) != 0) { cosine = -cosine;}
-    condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
-    condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
-
-    cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
-    cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec));
-    _mm256_store_ps(bPtr, cosine);
-    aPtr += 8;
-    bPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *bPtr++ = cos(*aPtr++);
-  }
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int eighthPoints = num_points / 8;
+    unsigned int i = 0;
+
+    __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
+        fones, fzeroes;
+    __m256 sine, cosine;
+    __m256i q, ones, twos, fours;
+
+    m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
+    pio4A = _mm256_set1_ps(0.7853981554508209228515625);
+    pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
+    pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
+    ffours = _mm256_set1_ps(4.0);
+    ftwos = _mm256_set1_ps(2.0);
+    fones = _mm256_set1_ps(1.0);
+    fzeroes = _mm256_setzero_ps();
+    __m256i zeroes = _mm256_set1_epi32(0);
+    ones = _mm256_set1_epi32(1);
+    __m256i allones = _mm256_set1_epi32(0xffffffff);
+    twos = _mm256_set1_epi32(2);
+    fours = _mm256_set1_epi32(4);
+
+    cp1 = _mm256_set1_ps(1.0);
+    cp2 = _mm256_set1_ps(0.08333333333333333);
+    cp3 = _mm256_set1_ps(0.002777777777777778);
+    cp4 = _mm256_set1_ps(4.96031746031746e-05);
+    cp5 = _mm256_set1_ps(5.511463844797178e-07);
+    union bit256 condition1;
+    union bit256 condition3;
+
+    for (; number < eighthPoints; number++) {
+
+        aVal = _mm256_load_ps(aPtr);
+        // s = fabs(aVal)
+        s = _mm256_sub_ps(aVal,
+                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+        // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
+        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+        // r = q + q&1, q indicates quadrant, r gives
+        r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
+
+        s = _mm256_fnmadd_ps(r, pio4A, s);
+        s = _mm256_fnmadd_ps(r, pio4B, s);
+        s = _mm256_fnmadd_ps(r, pio4C, s);
+
+        s = _mm256_div_ps(
+            s,
+            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+        s = _mm256_mul_ps(s, s);
+        // Evaluate Taylor series
+        s = _mm256_mul_ps(
+            _mm256_fmadd_ps(
+                _mm256_fmsub_ps(
+                    _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
+                s,
+                cp1),
+            s);
+
+        for (i = 0; i < 3; i++)
+            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+        s = _mm256_div_ps(s, ftwos);
+
+        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+        cosine = _mm256_sub_ps(fones, s);
+
+        // if(((q+1)&2) != 0) { cosine=sine;}
+        condition1.int_vec =
+            _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
+        condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
+
+        // if(((q+2)&4) != 0) { cosine = -cosine;}
+        condition3.int_vec = _mm256_cmpeq_epi32(
+            _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
+        condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
+
+        cosine = _mm256_add_ps(
+            cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
+        cosine = _mm256_sub_ps(cosine,
+                               _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
+                                             condition3.float_vec));
+        _mm256_store_ps(bPtr, cosine);
+        aPtr += 8;
+        bPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *bPtr++ = cos(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
@@ -168,86 +184,109 @@ static inline void
  #include <immintrin.h>
  
  static inline void
- volk_32f_cos_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
+volk_32f_cos_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int eighthPoints = num_points / 8;
-  unsigned int i = 0;
-
-  __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
-  __m256 sine, cosine;
-  __m256i q, ones, twos, fours;
-
-  m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
-  pio4A = _mm256_set1_ps(0.7853981554508209228515625);
-  pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
-  pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
-  ffours = _mm256_set1_ps(4.0);
-  ftwos = _mm256_set1_ps(2.0);
-  fones = _mm256_set1_ps(1.0);
-  fzeroes = _mm256_setzero_ps();
-  __m256i zeroes = _mm256_set1_epi32(0);
-  ones = _mm256_set1_epi32(1);
-  __m256i allones = _mm256_set1_epi32(0xffffffff);
-  twos = _mm256_set1_epi32(2);
-  fours = _mm256_set1_epi32(4);
-
-  cp1 = _mm256_set1_ps(1.0);
-  cp2 = _mm256_set1_ps(0.08333333333333333);
-  cp3 = _mm256_set1_ps(0.002777777777777778);
-  cp4 = _mm256_set1_ps(4.96031746031746e-05);
-  cp5 = _mm256_set1_ps(5.511463844797178e-07);
-  union bit256 condition1;
-  union bit256 condition3;
-
-  for(;number < eighthPoints; number++){
-
-    aVal = _mm256_load_ps(aPtr);
-    // s = fabs(aVal)
-    s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
-    // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
-    q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
-    // r = q + q&1, q indicates quadrant, r gives
-    r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
-
-    s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4A));
-    s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4B));
-    s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4C));
-
-    s = _mm256_div_ps(s, _mm256_set1_ps(8.0));    // The constant is 2^N, for 3 times argument reduction
-    s = _mm256_mul_ps(s, s);
-    // Evaluate Taylor series
-    s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-
-    for(i = 0; i < 3; i++)
-      s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
-    s = _mm256_div_ps(s, ftwos);
-
-    sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
-    cosine = _mm256_sub_ps(fones, s);
-
-    // if(((q+1)&2) != 0) { cosine=sine;}
-    condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
-    condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
-
-    // if(((q+2)&4) != 0) { cosine = -cosine;}
-    condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
-    condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
-
-    cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
-    cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec));
-    _mm256_store_ps(bPtr, cosine);
-    aPtr += 8;
-    bPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *bPtr++ = cos(*aPtr++);
-  }
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int eighthPoints = num_points / 8;
+    unsigned int i = 0;
+
+    __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
+        fones, fzeroes;
+    __m256 sine, cosine;
+    __m256i q, ones, twos, fours;
+
+    m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
+    pio4A = _mm256_set1_ps(0.7853981554508209228515625);
+    pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
+    pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
+    ffours = _mm256_set1_ps(4.0);
+    ftwos = _mm256_set1_ps(2.0);
+    fones = _mm256_set1_ps(1.0);
+    fzeroes = _mm256_setzero_ps();
+    __m256i zeroes = _mm256_set1_epi32(0);
+    ones = _mm256_set1_epi32(1);
+    __m256i allones = _mm256_set1_epi32(0xffffffff);
+    twos = _mm256_set1_epi32(2);
+    fours = _mm256_set1_epi32(4);
+
+    cp1 = _mm256_set1_ps(1.0);
+    cp2 = _mm256_set1_ps(0.08333333333333333);
+    cp3 = _mm256_set1_ps(0.002777777777777778);
+    cp4 = _mm256_set1_ps(4.96031746031746e-05);
+    cp5 = _mm256_set1_ps(5.511463844797178e-07);
+    union bit256 condition1;
+    union bit256 condition3;
+
+    for (; number < eighthPoints; number++) {
+
+        aVal = _mm256_load_ps(aPtr);
+        // s = fabs(aVal)
+        s = _mm256_sub_ps(aVal,
+                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+        // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
+        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+        // r = q + q&1, q indicates quadrant, r gives
+        r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
+
+        s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
+        s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
+        s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
+
+        s = _mm256_div_ps(
+            s,
+            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+        s = _mm256_mul_ps(s, s);
+        // Evaluate Taylor series
+        s = _mm256_mul_ps(
+            _mm256_add_ps(
+                _mm256_mul_ps(
+                    _mm256_sub_ps(
+                        _mm256_mul_ps(
+                            _mm256_add_ps(
+                                _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
+                                              s),
+                                cp3),
+                            s),
+                        cp2),
+                    s),
+                cp1),
+            s);
+
+        for (i = 0; i < 3; i++)
+            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+        s = _mm256_div_ps(s, ftwos);
+
+        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+        cosine = _mm256_sub_ps(fones, s);
+
+        // if(((q+1)&2) != 0) { cosine=sine;}
+        condition1.int_vec =
+            _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
+        condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
+
+        // if(((q+2)&4) != 0) { cosine = -cosine;}
+        condition3.int_vec = _mm256_cmpeq_epi32(
+            _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
+        condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
+
+        cosine = _mm256_add_ps(
+            cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
+        cosine = _mm256_sub_ps(cosine,
+                               _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
+                                             condition3.float_vec));
+        _mm256_store_ps(bPtr, cosine);
+        aPtr += 8;
+        bPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *bPtr++ = cos(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX2 for aligned */
@@ -256,86 +295,105 @@ static inline void
  #include <smmintrin.h>
  
  static inline void
- volk_32f_cos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+volk_32f_cos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int quarterPoints = num_points / 4;
-  unsigned int i = 0;
-
-  __m128 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
-  __m128 sine, cosine;
-  __m128i q, ones, twos, fours;
-
-  m4pi = _mm_set1_ps(1.273239544735162542821171882678754627704620361328125);
-  pio4A = _mm_set1_ps(0.7853981554508209228515625);
-  pio4B = _mm_set1_ps(0.794662735614792836713604629039764404296875e-8);
-  pio4C = _mm_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
-  ffours = _mm_set1_ps(4.0);
-  ftwos = _mm_set1_ps(2.0);
-  fones = _mm_set1_ps(1.0);
-  fzeroes = _mm_setzero_ps();
-  __m128i zeroes = _mm_set1_epi32(0);
-  ones = _mm_set1_epi32(1);
-  __m128i allones = _mm_set1_epi32(0xffffffff);
-  twos = _mm_set1_epi32(2);
-  fours = _mm_set1_epi32(4);
-
-  cp1 = _mm_set1_ps(1.0);
-  cp2 = _mm_set1_ps(0.08333333333333333);
-  cp3 = _mm_set1_ps(0.002777777777777778);
-  cp4 = _mm_set1_ps(4.96031746031746e-05);
-  cp5 = _mm_set1_ps(5.511463844797178e-07);
-  union bit128 condition1;
-  union bit128 condition3;
-
-  for(;number < quarterPoints; number++){
-
-    aVal = _mm_load_ps(aPtr);
-    // s = fabs(aVal)
-    s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
-    // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
-    q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
-    // r = q + q&1, q indicates quadrant, r gives
-    r = _mm_cvtepi32_ps(_mm_add_epi32(q, _mm_and_si128(q, ones)));
-
-    s = _mm_sub_ps(s, _mm_mul_ps(r, pio4A));
-    s = _mm_sub_ps(s, _mm_mul_ps(r, pio4B));
-    s = _mm_sub_ps(s, _mm_mul_ps(r, pio4C));
-
-    s = _mm_div_ps(s, _mm_set1_ps(8.0));    // The constant is 2^N, for 3 times argument reduction
-    s = _mm_mul_ps(s, s);
-    // Evaluate Taylor series
-    s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-
-    for(i = 0; i < 3; i++)
-      s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
-    s = _mm_div_ps(s, ftwos);
-
-    sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
-    cosine = _mm_sub_ps(fones, s);
-
-    // if(((q+1)&2) != 0) { cosine=sine;}
-    condition1.int_vec = _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, ones), twos), zeroes);
-    condition1.int_vec = _mm_xor_si128(allones, condition1.int_vec);
-
-    // if(((q+2)&4) != 0) { cosine = -cosine;}
-    condition3.int_vec = _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, twos), fours), zeroes);
-    condition3.int_vec = _mm_xor_si128(allones, condition3.int_vec);
-
-    cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1.float_vec));
-    cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3.float_vec));
-    _mm_store_ps(bPtr, cosine);
-    aPtr += 4;
-    bPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    *bPtr++ = cosf(*aPtr++);
-  }
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int quarterPoints = num_points / 4;
+    unsigned int i = 0;
+
+    __m128 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
+        fones, fzeroes;
+    __m128 sine, cosine;
+    __m128i q, ones, twos, fours;
+
+    m4pi = _mm_set1_ps(1.273239544735162542821171882678754627704620361328125);
+    pio4A = _mm_set1_ps(0.7853981554508209228515625);
+    pio4B = _mm_set1_ps(0.794662735614792836713604629039764404296875e-8);
+    pio4C = _mm_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
+    ffours = _mm_set1_ps(4.0);
+    ftwos = _mm_set1_ps(2.0);
+    fones = _mm_set1_ps(1.0);
+    fzeroes = _mm_setzero_ps();
+    __m128i zeroes = _mm_set1_epi32(0);
+    ones = _mm_set1_epi32(1);
+    __m128i allones = _mm_set1_epi32(0xffffffff);
+    twos = _mm_set1_epi32(2);
+    fours = _mm_set1_epi32(4);
+
+    cp1 = _mm_set1_ps(1.0);
+    cp2 = _mm_set1_ps(0.08333333333333333);
+    cp3 = _mm_set1_ps(0.002777777777777778);
+    cp4 = _mm_set1_ps(4.96031746031746e-05);
+    cp5 = _mm_set1_ps(5.511463844797178e-07);
+    union bit128 condition1;
+    union bit128 condition3;
+
+    for (; number < quarterPoints; number++) {
+
+        aVal = _mm_load_ps(aPtr);
+        // s = fabs(aVal)
+        s = _mm_sub_ps(aVal,
+                       _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
+        // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
+        q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
+        // r = q + q&1, q indicates quadrant, r gives
+        r = _mm_cvtepi32_ps(_mm_add_epi32(q, _mm_and_si128(q, ones)));
+
+        s = _mm_sub_ps(s, _mm_mul_ps(r, pio4A));
+        s = _mm_sub_ps(s, _mm_mul_ps(r, pio4B));
+        s = _mm_sub_ps(s, _mm_mul_ps(r, pio4C));
+
+        s = _mm_div_ps(
+            s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+        s = _mm_mul_ps(s, s);
+        // Evaluate Taylor series
+        s = _mm_mul_ps(
+            _mm_add_ps(
+                _mm_mul_ps(
+                    _mm_sub_ps(
+                        _mm_mul_ps(
+                            _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
+                                       cp3),
+                            s),
+                        cp2),
+                    s),
+                cp1),
+            s);
+
+        for (i = 0; i < 3; i++)
+            s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+        s = _mm_div_ps(s, ftwos);
+
+        sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+        cosine = _mm_sub_ps(fones, s);
+
+        // if(((q+1)&2) != 0) { cosine=sine;}
+        condition1.int_vec =
+            _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, ones), twos), zeroes);
+        condition1.int_vec = _mm_xor_si128(allones, condition1.int_vec);
+
+        // if(((q+2)&4) != 0) { cosine = -cosine;}
+        condition3.int_vec =
+            _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, twos), fours), zeroes);
+        condition3.int_vec = _mm_xor_si128(allones, condition3.int_vec);
+
+        cosine = _mm_add_ps(cosine,
+                            _mm_and_ps(_mm_sub_ps(sine, cosine), condition1.float_vec));
+        cosine = _mm_sub_ps(
+            cosine,
+            _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3.float_vec));
+        _mm_store_ps(bPtr, cosine);
+        aPtr += 4;
+        bPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *bPtr++ = cosf(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_SSE4_1 for aligned */
@@ -343,7 +401,6 @@ static inline void
  #endif /* INCLUDED_volk_32f_cos_32f_a_H */
  
  
-
  #ifndef INCLUDED_volk_32f_cos_32f_u_H
  #define INCLUDED_volk_32f_cos_32f_u_H
  
@@ -351,86 +408,102 @@ static inline void
  #include <immintrin.h>
  
  static inline void
- volk_32f_cos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+volk_32f_cos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int eighthPoints = num_points / 8;
-  unsigned int i = 0;
-
-  __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
-  __m256 sine, cosine;
-  __m256i q, ones, twos, fours;
-
-  m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
-  pio4A = _mm256_set1_ps(0.7853981554508209228515625);
-  pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
-  pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
-  ffours = _mm256_set1_ps(4.0);
-  ftwos = _mm256_set1_ps(2.0);
-  fones = _mm256_set1_ps(1.0);
-  fzeroes = _mm256_setzero_ps();
-  __m256i zeroes = _mm256_set1_epi32(0);
-  ones = _mm256_set1_epi32(1);
-  __m256i allones = _mm256_set1_epi32(0xffffffff);
-  twos = _mm256_set1_epi32(2);
-  fours = _mm256_set1_epi32(4);
-
-  cp1 = _mm256_set1_ps(1.0);
-  cp2 = _mm256_set1_ps(0.08333333333333333);
-  cp3 = _mm256_set1_ps(0.002777777777777778);
-  cp4 = _mm256_set1_ps(4.96031746031746e-05);
-  cp5 = _mm256_set1_ps(5.511463844797178e-07);
-  union bit256 condition1;
-  union bit256 condition3;
-
-  for(;number < eighthPoints; number++){
-
-    aVal = _mm256_loadu_ps(aPtr);
-    // s = fabs(aVal)
-    s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
-    // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
-    q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
-    // r = q + q&1, q indicates quadrant, r gives
-    r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
-
-    s = _mm256_fnmadd_ps(r,pio4A,s);
-    s = _mm256_fnmadd_ps(r,pio4B,s);
-    s = _mm256_fnmadd_ps(r,pio4C,s);
-
-    s = _mm256_div_ps(s, _mm256_set1_ps(8.0));    // The constant is 2^N, for 3 times argument reduction
-    s = _mm256_mul_ps(s, s);
-    // Evaluate Taylor series
-    s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s);
-
-    for(i = 0; i < 3; i++)
-      s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
-    s = _mm256_div_ps(s, ftwos);
-
-    sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
-    cosine = _mm256_sub_ps(fones, s);
-
-    // if(((q+1)&2) != 0) { cosine=sine;}
-    condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
-    condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
-
-    // if(((q+2)&4) != 0) { cosine = -cosine;}
-    condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
-    condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
-
-    cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
-    cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec));
-    _mm256_storeu_ps(bPtr, cosine);
-    aPtr += 8;
-    bPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *bPtr++ = cos(*aPtr++);
-  }
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int eighthPoints = num_points / 8;
+    unsigned int i = 0;
+
+    __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
+        fones, fzeroes;
+    __m256 sine, cosine;
+    __m256i q, ones, twos, fours;
+
+    m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
+    pio4A = _mm256_set1_ps(0.7853981554508209228515625);
+    pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
+    pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
+    ffours = _mm256_set1_ps(4.0);
+    ftwos = _mm256_set1_ps(2.0);
+    fones = _mm256_set1_ps(1.0);
+    fzeroes = _mm256_setzero_ps();
+    __m256i zeroes = _mm256_set1_epi32(0);
+    ones = _mm256_set1_epi32(1);
+    __m256i allones = _mm256_set1_epi32(0xffffffff);
+    twos = _mm256_set1_epi32(2);
+    fours = _mm256_set1_epi32(4);
+
+    cp1 = _mm256_set1_ps(1.0);
+    cp2 = _mm256_set1_ps(0.08333333333333333);
+    cp3 = _mm256_set1_ps(0.002777777777777778);
+    cp4 = _mm256_set1_ps(4.96031746031746e-05);
+    cp5 = _mm256_set1_ps(5.511463844797178e-07);
+    union bit256 condition1;
+    union bit256 condition3;
+
+    for (; number < eighthPoints; number++) {
+
+        aVal = _mm256_loadu_ps(aPtr);
+        // s = fabs(aVal)
+        s = _mm256_sub_ps(aVal,
+                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+        // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
+        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+        // r = q + q&1, q indicates quadrant, r gives
+        r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
+
+        s = _mm256_fnmadd_ps(r, pio4A, s);
+        s = _mm256_fnmadd_ps(r, pio4B, s);
+        s = _mm256_fnmadd_ps(r, pio4C, s);
+
+        s = _mm256_div_ps(
+            s,
+            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+        s = _mm256_mul_ps(s, s);
+        // Evaluate Taylor series
+        s = _mm256_mul_ps(
+            _mm256_fmadd_ps(
+                _mm256_fmsub_ps(
+                    _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
+                s,
+                cp1),
+            s);
+
+        for (i = 0; i < 3; i++)
+            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+        s = _mm256_div_ps(s, ftwos);
+
+        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+        cosine = _mm256_sub_ps(fones, s);
+
+        // if(((q+1)&2) != 0) { cosine=sine;}
+        condition1.int_vec =
+            _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
+        condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
+
+        // if(((q+2)&4) != 0) { cosine = -cosine;}
+        condition3.int_vec = _mm256_cmpeq_epi32(
+            _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
+        condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
+
+        cosine = _mm256_add_ps(
+            cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
+        cosine = _mm256_sub_ps(cosine,
+                               _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
+                                             condition3.float_vec));
+        _mm256_storeu_ps(bPtr, cosine);
+        aPtr += 8;
+        bPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *bPtr++ = cos(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
@@ -439,86 +512,109 @@ static inline void
  #include <immintrin.h>
  
  static inline void
- volk_32f_cos_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
+volk_32f_cos_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int eighthPoints = num_points / 8;
-  unsigned int i = 0;
-
-  __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
-  __m256 sine, cosine;
-  __m256i q, ones, twos, fours;
-
-  m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
-  pio4A = _mm256_set1_ps(0.7853981554508209228515625);
-  pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
-  pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
-  ffours = _mm256_set1_ps(4.0);
-  ftwos = _mm256_set1_ps(2.0);
-  fones = _mm256_set1_ps(1.0);
-  fzeroes = _mm256_setzero_ps();
-  __m256i zeroes = _mm256_set1_epi32(0);
-  ones = _mm256_set1_epi32(1);
-  __m256i allones = _mm256_set1_epi32(0xffffffff);
-  twos = _mm256_set1_epi32(2);
-  fours = _mm256_set1_epi32(4);
-
-  cp1 = _mm256_set1_ps(1.0);
-  cp2 = _mm256_set1_ps(0.08333333333333333);
-  cp3 = _mm256_set1_ps(0.002777777777777778);
-  cp4 = _mm256_set1_ps(4.96031746031746e-05);
-  cp5 = _mm256_set1_ps(5.511463844797178e-07);
-  union bit256 condition1;
-  union bit256 condition3;
-
-  for(;number < eighthPoints; number++){
-
-    aVal = _mm256_loadu_ps(aPtr);
-    // s = fabs(aVal)
-    s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
-    // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
-    q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
-    // r = q + q&1, q indicates quadrant, r gives
-    r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
-
-    s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4A));
-    s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4B));
-    s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4C));
-
-    s = _mm256_div_ps(s, _mm256_set1_ps(8.0));    // The constant is 2^N, for 3 times argument reduction
-    s = _mm256_mul_ps(s, s);
-    // Evaluate Taylor series
-    s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-
-    for(i = 0; i < 3; i++)
-      s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
-    s = _mm256_div_ps(s, ftwos);
-
-    sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
-    cosine = _mm256_sub_ps(fones, s);
-
-    // if(((q+1)&2) != 0) { cosine=sine;}
-    condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
-    condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
-
-    // if(((q+2)&4) != 0) { cosine = -cosine;}
-    condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
-    condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
-
-    cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
-    cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec));
-    _mm256_storeu_ps(bPtr, cosine);
-    aPtr += 8;
-    bPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *bPtr++ = cos(*aPtr++);
-  }
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int eighthPoints = num_points / 8;
+    unsigned int i = 0;
+
+    __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
+        fones, fzeroes;
+    __m256 sine, cosine;
+    __m256i q, ones, twos, fours;
+
+    m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
+    pio4A = _mm256_set1_ps(0.7853981554508209228515625);
+    pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
+    pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
+    ffours = _mm256_set1_ps(4.0);
+    ftwos = _mm256_set1_ps(2.0);
+    fones = _mm256_set1_ps(1.0);
+    fzeroes = _mm256_setzero_ps();
+    __m256i zeroes = _mm256_set1_epi32(0);
+    ones = _mm256_set1_epi32(1);
+    __m256i allones = _mm256_set1_epi32(0xffffffff);
+    twos = _mm256_set1_epi32(2);
+    fours = _mm256_set1_epi32(4);
+
+    cp1 = _mm256_set1_ps(1.0);
+    cp2 = _mm256_set1_ps(0.08333333333333333);
+    cp3 = _mm256_set1_ps(0.002777777777777778);
+    cp4 = _mm256_set1_ps(4.96031746031746e-05);
+    cp5 = _mm256_set1_ps(5.511463844797178e-07);
+    union bit256 condition1;
+    union bit256 condition3;
+
+    for (; number < eighthPoints; number++) {
+
+        aVal = _mm256_loadu_ps(aPtr);
+        // s = fabs(aVal)
+        s = _mm256_sub_ps(aVal,
+                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+        // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
+        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+        // r = q + q&1, q indicates quadrant, r gives
+        r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
+
+        s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
+        s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
+        s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
+
+        s = _mm256_div_ps(
+            s,
+            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+        s = _mm256_mul_ps(s, s);
+        // Evaluate Taylor series
+        s = _mm256_mul_ps(
+            _mm256_add_ps(
+                _mm256_mul_ps(
+                    _mm256_sub_ps(
+                        _mm256_mul_ps(
+                            _mm256_add_ps(
+                                _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
+                                              s),
+                                cp3),
+                            s),
+                        cp2),
+                    s),
+                cp1),
+            s);
+
+        for (i = 0; i < 3; i++)
+            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+        s = _mm256_div_ps(s, ftwos);
+
+        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+        cosine = _mm256_sub_ps(fones, s);
+
+        // if(((q+1)&2) != 0) { cosine=sine;}
+        condition1.int_vec =
+            _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
+        condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
+
+        // if(((q+2)&4) != 0) { cosine = -cosine;}
+        condition3.int_vec = _mm256_cmpeq_epi32(
+            _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
+        condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
+
+        cosine = _mm256_add_ps(
+            cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
+        cosine = _mm256_sub_ps(cosine,
+                               _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
+                                             condition3.float_vec));
+        _mm256_storeu_ps(bPtr, cosine);
+        aPtr += 8;
+        bPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *bPtr++ = cos(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX2 for unaligned */
@@ -529,71 +625,88 @@ static inline void
  static inline void
  volk_32f_cos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int quarterPoints = num_points / 4;
-  unsigned int i = 0;
-
-  __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
-  __m128 sine, cosine, condition1, condition3;
-  __m128i q, r, ones, twos, fours;
-
-  m4pi = _mm_set1_ps(1.273239545);
-  pio4A = _mm_set1_ps(0.78515625);
-  pio4B = _mm_set1_ps(0.241876e-3);
-  ffours = _mm_set1_ps(4.0);
-  ftwos = _mm_set1_ps(2.0);
-  fones = _mm_set1_ps(1.0);
-  fzeroes = _mm_setzero_ps();
-  ones = _mm_set1_epi32(1);
-  twos = _mm_set1_epi32(2);
-  fours = _mm_set1_epi32(4);
-
-  cp1 = _mm_set1_ps(1.0);
-  cp2 = _mm_set1_ps(0.83333333e-1);
-  cp3 = _mm_set1_ps(0.2777778e-2);
-  cp4 = _mm_set1_ps(0.49603e-4);
-  cp5 = _mm_set1_ps(0.551e-6);
-
-  for(;number < quarterPoints; number++){
-    aVal = _mm_loadu_ps(aPtr);
-    s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
-    q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
-    r = _mm_add_epi32(q, _mm_and_si128(q, ones));
-
-    s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
-    s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
-
-    s = _mm_div_ps(s, _mm_set1_ps(8.0));    // The constant is 2^N, for 3 times argument reduction
-    s = _mm_mul_ps(s, s);
-    // Evaluate Taylor series
-    s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-
-    for(i = 0; i < 3; i++){
-      s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
-    }
-    s = _mm_div_ps(s, ftwos);
-
-    sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
-    cosine = _mm_sub_ps(fones, s);
-
-    condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
  
-    condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
-
-    cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
-    cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
-    _mm_storeu_ps(bPtr, cosine);
-    aPtr += 4;
-    bPtr += 4;
-  }
+    unsigned int number = 0;
+    unsigned int quarterPoints = num_points / 4;
+    unsigned int i = 0;
+
+    __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+        fzeroes;
+    __m128 sine, cosine, condition1, condition3;
+    __m128i q, r, ones, twos, fours;
+
+    m4pi = _mm_set1_ps(1.273239545);
+    pio4A = _mm_set1_ps(0.78515625);
+    pio4B = _mm_set1_ps(0.241876e-3);
+    ffours = _mm_set1_ps(4.0);
+    ftwos = _mm_set1_ps(2.0);
+    fones = _mm_set1_ps(1.0);
+    fzeroes = _mm_setzero_ps();
+    ones = _mm_set1_epi32(1);
+    twos = _mm_set1_epi32(2);
+    fours = _mm_set1_epi32(4);
+
+    cp1 = _mm_set1_ps(1.0);
+    cp2 = _mm_set1_ps(0.83333333e-1);
+    cp3 = _mm_set1_ps(0.2777778e-2);
+    cp4 = _mm_set1_ps(0.49603e-4);
+    cp5 = _mm_set1_ps(0.551e-6);
+
+    for (; number < quarterPoints; number++) {
+        aVal = _mm_loadu_ps(aPtr);
+        s = _mm_sub_ps(aVal,
+                       _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
+        q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
+        r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+        s = _mm_div_ps(
+            s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+        s = _mm_mul_ps(s, s);
+        // Evaluate Taylor series
+        s = _mm_mul_ps(
+            _mm_add_ps(
+                _mm_mul_ps(
+                    _mm_sub_ps(
+                        _mm_mul_ps(
+                            _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
+                                       cp3),
+                            s),
+                        cp2),
+                    s),
+                cp1),
+            s);
+
+        for (i = 0; i < 3; i++) {
+            s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+        }
+        s = _mm_div_ps(s, ftwos);
+
+        sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+        cosine = _mm_sub_ps(fones, s);
+
+        condition1 = _mm_cmpneq_ps(
+            _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
+
+        condition3 = _mm_cmpneq_ps(
+            _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
+
+        cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
+        cosine = _mm_sub_ps(
+            cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
+        _mm_storeu_ps(bPtr, cosine);
+        aPtr += 4;
+        bPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    *bPtr++ = cosf(*aPtr++);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *bPtr++ = cosf(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_SSE4_1 for unaligned */
@@ -606,52 +719,55 @@ volk_32f_cos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num
   * Shibata, Naoki, "Efficient evaluation methods of elementary functions
   * suitable for SIMD computation," in Springer-Verlag 2010
   */
-static inline void
-volk_32f_cos_32f_generic_fast(float* bVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_cos_32f_generic_fast(float* bVector,
+                                                 const float* aVector,
+                                                 unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  float m4pi = 1.273239544735162542821171882678754627704620361328125;
-  float pio4A = 0.7853981554508209228515625;
-  float pio4B = 0.794662735614792836713604629039764404296875e-8;
-  float pio4C = 0.306161699786838294306516483068750264552437361480769e-16;
-  int N = 3; // order of argument reduction
-
-  unsigned int number;
-  for(number = 0; number < num_points; number++){
-      float s = fabs(*aPtr);
-      int q = (int)(s * m4pi);
-      int r = q + (q&1);
-      s -= r * pio4A;
-      s -= r * pio4B;
-      s -= r * pio4C;
-
-      s = s * 0.125; // 2^-N (<--3)
-      s = s*s;
-      s = ((((s/1814400. - 1.0/20160.0)*s + 1.0/360.0)*s - 1.0/12.0)*s + 1.0)*s;
-
-      int i;
-      for(i=0; i < N; ++i) {
-          s = (4.0-s)*s;
-      }
-      s = s/2.0;
-
-      float sine = sqrt((2.0-s)*s);
-      float cosine = 1-s;
-
-      if (((q+1) & 2) != 0) {
-          s = cosine;
-          cosine = sine;
-          sine = s;
-      }
-      if (((q+2) & 4) != 0) {
-          cosine = -cosine;
-      }
-      *bPtr = cosine;
-      bPtr++;
-      aPtr++;
-  }
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    float m4pi = 1.273239544735162542821171882678754627704620361328125;
+    float pio4A = 0.7853981554508209228515625;
+    float pio4B = 0.794662735614792836713604629039764404296875e-8;
+    float pio4C = 0.306161699786838294306516483068750264552437361480769e-16;
+    int N = 3; // order of argument reduction
+
+    unsigned int number;
+    for (number = 0; number < num_points; number++) {
+        float s = fabs(*aPtr);
+        int q = (int)(s * m4pi);
+        int r = q + (q & 1);
+        s -= r * pio4A;
+        s -= r * pio4B;
+        s -= r * pio4C;
+
+        s = s * 0.125; // 2^-N (<--3)
+        s = s * s;
+        s = ((((s / 1814400. - 1.0 / 20160.0) * s + 1.0 / 360.0) * s - 1.0 / 12.0) * s +
+             1.0) *
+            s;
+
+        int i;
+        for (i = 0; i < N; ++i) {
+            s = (4.0 - s) * s;
+        }
+        s = s / 2.0;
+
+        float sine = sqrt((2.0 - s) * s);
+        float cosine = 1 - s;
+
+        if (((q + 1) & 2) != 0) {
+            s = cosine;
+            cosine = sine;
+            sine = s;
+        }
+        if (((q + 2) & 4) != 0) {
+            cosine = -cosine;
+        }
+        *bPtr = cosine;
+        bPtr++;
+        aPtr++;
+    }
  }
  
  #endif /* LV_HAVE_GENERIC */
@@ -662,13 +778,13 @@ volk_32f_cos_32f_generic_fast(float* bVector, const float* aVector, unsigned int
  static inline void
  volk_32f_cos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-  unsigned int number = 0;
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
  
-  for(; number < num_points; number++){
-    *bPtr++ = cosf(*aPtr++);
-  }
+    for (; number < num_points; number++) {
+        *bPtr++ = cosf(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_GENERIC */
@@ -679,30 +795,29 @@ volk_32f_cos_32f_generic(float* bVector, const float* aVector, unsigned int num_
  #include <volk/volk_neon_intrinsics.h>
  
  static inline void
-volk_32f_cos_32f_neon(float* bVector, const float* aVector,
-                      unsigned int num_points)
+volk_32f_cos_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
  {
      unsigned int number = 0;
      unsigned int quarter_points = num_points / 4;
      float* bVectorPtr = bVector;
      const float* aVectorPtr = aVector;
-    
+
      float32x4_t b_vec;
      float32x4_t a_vec;
-    
-    for(number = 0; number < quarter_points; number++) {
+
+    for (number = 0; number < quarter_points; number++) {
          a_vec = vld1q_f32(aVectorPtr);
          // Prefetch next one, speeds things up
-        __VOLK_PREFETCH(aVectorPtr+4);
+        __VOLK_PREFETCH(aVectorPtr + 4);
          b_vec = _vcosq_f32(a_vec);
          vst1q_f32(bVectorPtr, b_vec);
          // move pointers ahead
-        bVectorPtr+=4;
-        aVectorPtr+=4;
+        bVectorPtr += 4;
+        aVectorPtr += 4;
      }
-    
+
      // Deal with the rest
-    for(number = quarter_points * 4; number < num_points; number++) {
+    for (number = quarter_points * 4; number < num_points; number++) {
          *bVectorPtr++ = cosf(*aVectorPtr++);
      }
  }
diff --git a/kernels/volk/volk_32f_expfast_32f.h b/kernels/volk/volk_32f_expfast_32f.h

index ecb4914665e20a49b239245f29b0370a660d7ec8..45de3f9081155f119c794ae9dfa4cf125849fd80 100644 (file)
--- a/kernels/volk/volk_32f_expfast_32f.h
+++ b/kernels/volk/volk_32f_expfast_32f.h
@@ -30,8 +30,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32f_expfast_32f(float* bVector, const float* aVector, unsigned int num_points)
- * \endcode
+ * void volk_32f_expfast_32f(float* bVector, const float* aVector, unsigned int
+ * num_points) \endcode
   *
   * \b Inputs
   * \li aVector: Input vector of floats.
@@ -62,9 +62,9 @@
   * \endcode
   */
  
-#include <stdio.h>
-#include <math.h>
  #include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
  
  #define Mln2 0.6931471805f
  #define A 8388608.0f
@@ -79,34 +79,35 @@
  
  #include <immintrin.h>
  
-static inline void
- volk_32f_expfast_32f_a_avx_fma(float* bVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_expfast_32f_a_avx_fma(float* bVector,
+                                                  const float* aVector,
+                                                  unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-
-  __m256 aVal, bVal, a, b;
-  __m256i exp;
-  a = _mm256_set1_ps(A/Mln2);
-  b = _mm256_set1_ps(B-C);
-
-  for(;number < eighthPoints; number++){
-    aVal = _mm256_load_ps(aPtr);
-    exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a,aVal, b));
-    bVal = _mm256_castsi256_ps(exp);
-
-    _mm256_store_ps(bPtr, bVal);
-    aPtr += 8;
-    bPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *bPtr++ = expf(*aPtr++);
-  }
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    __m256 aVal, bVal, a, b;
+    __m256i exp;
+    a = _mm256_set1_ps(A / Mln2);
+    b = _mm256_set1_ps(B - C);
+
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_load_ps(aPtr);
+        exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
+        bVal = _mm256_castsi256_ps(exp);
+
+        _mm256_store_ps(bPtr, bVal);
+        aPtr += 8;
+        bPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *bPtr++ = expf(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned */
@@ -116,33 +117,33 @@ static inline void
  #include <immintrin.h>
  
  static inline void
- volk_32f_expfast_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
+volk_32f_expfast_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-
-  __m256 aVal, bVal, a, b;
-  __m256i exp;
-  a = _mm256_set1_ps(A/Mln2);
-  b = _mm256_set1_ps(B-C);
-
-  for(;number < eighthPoints; number++){
-    aVal = _mm256_load_ps(aPtr);
-    exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a,aVal), b));
-    bVal = _mm256_castsi256_ps(exp);
-
-    _mm256_store_ps(bPtr, bVal);
-    aPtr += 8;
-    bPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *bPtr++ = expf(*aPtr++);
-  }
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    __m256 aVal, bVal, a, b;
+    __m256i exp;
+    a = _mm256_set1_ps(A / Mln2);
+    b = _mm256_set1_ps(B - C);
+
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_load_ps(aPtr);
+        exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
+        bVal = _mm256_castsi256_ps(exp);
+
+        _mm256_store_ps(bPtr, bVal);
+        aPtr += 8;
+        bPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *bPtr++ = expf(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX for aligned */
@@ -150,34 +151,35 @@ static inline void
  #ifdef LV_HAVE_SSE4_1
  #include <smmintrin.h>
  
-static inline void
-volk_32f_expfast_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_expfast_32f_a_sse4_1(float* bVector,
+                                                 const float* aVector,
+                                                 unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  __m128 aVal, bVal, a, b;
-  __m128i exp;
-  a = _mm_set1_ps(A/Mln2);
-  b = _mm_set1_ps(B-C);
-
-  for(;number < quarterPoints; number++){
-    aVal = _mm_load_ps(aPtr);
-    exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a,aVal), b));
-    bVal = _mm_castsi128_ps(exp);
-
-    _mm_store_ps(bPtr, bVal);
-    aPtr += 4;
-    bPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    *bPtr++ = expf(*aPtr++);
-  }
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    __m128 aVal, bVal, a, b;
+    __m128i exp;
+    a = _mm_set1_ps(A / Mln2);
+    b = _mm_set1_ps(B - C);
+
+    for (; number < quarterPoints; number++) {
+        aVal = _mm_load_ps(aPtr);
+        exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
+        bVal = _mm_castsi128_ps(exp);
+
+        _mm_store_ps(bPtr, bVal);
+        aPtr += 4;
+        bPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *bPtr++ = expf(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_SSE4_1 for aligned */
@@ -190,34 +192,35 @@ volk_32f_expfast_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int
  #if LV_HAVE_AVX && LV_HAVE_FMA
  #include <immintrin.h>
  
-static inline void
-volk_32f_expfast_32f_u_avx_fma(float* bVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_expfast_32f_u_avx_fma(float* bVector,
+                                                  const float* aVector,
+                                                  unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-
-  __m256 aVal, bVal, a, b;
-  __m256i exp;
-  a = _mm256_set1_ps(A/Mln2);
-  b = _mm256_set1_ps(B-C);
-
-  for(;number < eighthPoints; number++){
-    aVal = _mm256_loadu_ps(aPtr);
-    exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a,aVal, b));
-    bVal = _mm256_castsi256_ps(exp);
-
-    _mm256_storeu_ps(bPtr, bVal);
-    aPtr += 8;
-    bPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *bPtr++ = expf(*aPtr++);
-  }
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    __m256 aVal, bVal, a, b;
+    __m256i exp;
+    a = _mm256_set1_ps(A / Mln2);
+    b = _mm256_set1_ps(B - C);
+
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_loadu_ps(aPtr);
+        exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
+        bVal = _mm256_castsi256_ps(exp);
+
+        _mm256_storeu_ps(bPtr, bVal);
+        aPtr += 8;
+        bPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *bPtr++ = expf(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX && LV_HAVE_FMA for unaligned */
@@ -228,31 +231,31 @@ volk_32f_expfast_32f_u_avx_fma(float* bVector, const float* aVector, unsigned in
  static inline void
  volk_32f_expfast_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-
-  __m256 aVal, bVal, a, b;
-  __m256i exp;
-  a = _mm256_set1_ps(A/Mln2);
-  b = _mm256_set1_ps(B-C);
-
-  for(;number < eighthPoints; number++){
-    aVal = _mm256_loadu_ps(aPtr);
-    exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a,aVal), b));
-    bVal = _mm256_castsi256_ps(exp);
-
-    _mm256_storeu_ps(bPtr, bVal);
-    aPtr += 8;
-    bPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *bPtr++ = expf(*aPtr++);
-  }
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    __m256 aVal, bVal, a, b;
+    __m256i exp;
+    a = _mm256_set1_ps(A / Mln2);
+    b = _mm256_set1_ps(B - C);
+
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_loadu_ps(aPtr);
+        exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
+        bVal = _mm256_castsi256_ps(exp);
+
+        _mm256_storeu_ps(bPtr, bVal);
+        aPtr += 8;
+        bPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *bPtr++ = expf(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX for unaligned */
@@ -261,34 +264,35 @@ volk_32f_expfast_32f_u_avx(float* bVector, const float* aVector, unsigned int nu
  #ifdef LV_HAVE_SSE4_1
  #include <smmintrin.h>
  
-static inline void
-volk_32f_expfast_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_expfast_32f_u_sse4_1(float* bVector,
+                                                 const float* aVector,
+                                                 unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  __m128 aVal, bVal, a, b;
-  __m128i exp;
-  a = _mm_set1_ps(A/Mln2);
-  b = _mm_set1_ps(B-C);
-
-  for(;number < quarterPoints; number++){
-    aVal = _mm_loadu_ps(aPtr);
-    exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a,aVal), b));
-    bVal = _mm_castsi128_ps(exp);
-
-    _mm_storeu_ps(bPtr, bVal);
-    aPtr += 4;
-    bPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    *bPtr++ = expf(*aPtr++);
-  }
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    __m128 aVal, bVal, a, b;
+    __m128i exp;
+    a = _mm_set1_ps(A / Mln2);
+    b = _mm_set1_ps(B - C);
+
+    for (; number < quarterPoints; number++) {
+        aVal = _mm_loadu_ps(aPtr);
+        exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
+        bVal = _mm_castsi128_ps(exp);
+
+        _mm_storeu_ps(bPtr, bVal);
+        aPtr += 4;
+        bPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *bPtr++ = expf(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_SSE4_1 for unaligned */
@@ -296,16 +300,17 @@ volk_32f_expfast_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32f_expfast_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_expfast_32f_generic(float* bVector,
+                                                const float* aVector,
+                                                unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-  unsigned int number = 0;
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
  
-  for(number = 0; number < num_points; number++){
-    *bPtr++ = expf(*aPtr++);
-  }
+    for (number = 0; number < num_points; number++) {
+        *bPtr++ = expf(*aPtr++);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
diff --git a/kernels/volk/volk_32f_index_max_16u.h b/kernels/volk/volk_32f_index_max_16u.h

index 7ca692838ececf25edd56813cc570368d4d95b38..3ee10f4a66c5aea1a0d84d731f8079edceccd33c 100644 (file)
--- a/kernels/volk/volk_32f_index_max_16u.h
+++ b/kernels/volk/volk_32f_index_max_16u.h
@@ -71,72 +71,71 @@
  #ifndef INCLUDED_volk_32f_index_max_16u_a_H
  #define INCLUDED_volk_32f_index_max_16u_a_H
  
-#include <volk/volk_common.h>
-#include <volk/volk_common.h>
  #include <inttypes.h>
  #include <limits.h>
  #include <stdio.h>
+#include <volk/volk_common.h>
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
  static inline void
-volk_32f_index_max_16u_a_avx(uint16_t* target, const float* src0,
-                             uint32_t num_points)
+volk_32f_index_max_16u_a_avx(uint16_t* target, const float* src0, uint32_t num_points)
  {
-  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
-
-  uint32_t number = 0;
-  const uint32_t eighthPoints = num_points / 8;
+    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
  
-  float* inputPtr = (float*)src0;
+    uint32_t number = 0;
+    const uint32_t eighthPoints = num_points / 8;
  
-  __m256 indexIncrementValues = _mm256_set1_ps(8);
-  __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
+    float* inputPtr = (float*)src0;
  
-  float max = src0[0];
-  float index = 0;
-  __m256 maxValues = _mm256_set1_ps(max);
-  __m256 maxValuesIndex = _mm256_setzero_ps();
-  __m256 compareResults;
-  __m256 currentValues;
+    __m256 indexIncrementValues = _mm256_set1_ps(8);
+    __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
  
-  __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
-  __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
+    float max = src0[0];
+    float index = 0;
+    __m256 maxValues = _mm256_set1_ps(max);
+    __m256 maxValuesIndex = _mm256_setzero_ps();
+    __m256 compareResults;
+    __m256 currentValues;
  
-  for(;number < eighthPoints; number++){
+    __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
+    __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
  
-    currentValues  = _mm256_load_ps(inputPtr); inputPtr += 8;
-    currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
+    for (; number < eighthPoints; number++) {
  
-    compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
+        currentValues = _mm256_load_ps(inputPtr);
+        inputPtr += 8;
+        currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
  
-    maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
-    maxValues      = _mm256_blendv_ps(maxValues, currentValues, compareResults);
-  }
+        compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
  
-  // Calculate the largest value from the remaining 4 points
-  _mm256_store_ps(maxValuesBuffer, maxValues);
-  _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
+        maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
+        maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
+    }
  
-  for(number = 0; number < 8; number++){
-    if(maxValuesBuffer[number] > max){
-      index = maxIndexesBuffer[number];
-      max = maxValuesBuffer[number];
-    } else if(maxValuesBuffer[number] == max){
-      if (index > maxIndexesBuffer[number])
-        index = maxIndexesBuffer[number];
+    // Calculate the largest value from the remaining 4 points
+    _mm256_store_ps(maxValuesBuffer, maxValues);
+    _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+    for (number = 0; number < 8; number++) {
+        if (maxValuesBuffer[number] > max) {
+            index = maxIndexesBuffer[number];
+            max = maxValuesBuffer[number];
+        } else if (maxValuesBuffer[number] == max) {
+            if (index > maxIndexesBuffer[number])
+                index = maxIndexesBuffer[number];
+        }
      }
-  }
  
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    if(src0[number] > max){
-      index = number;
-      max = src0[number];
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        if (src0[number] > max) {
+            index = number;
+            max = src0[number];
+        }
      }
-  }
-  target[0] = (uint16_t)index;
+    target[0] = (uint16_t)index;
  }
  
  #endif /*LV_HAVE_AVX*/
@@ -145,62 +144,62 @@ volk_32f_index_max_16u_a_avx(uint16_t* target, const float* src0,
  #include <smmintrin.h>
  
  static inline void
-volk_32f_index_max_16u_a_sse4_1(uint16_t* target, const float* src0,
-                                uint32_t num_points)
+volk_32f_index_max_16u_a_sse4_1(uint16_t* target, const float* src0, uint32_t num_points)
  {
-  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
-
-  uint32_t number = 0;
-  const uint32_t quarterPoints = num_points / 4;
+    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
  
-  float* inputPtr = (float*)src0;
+    uint32_t number = 0;
+    const uint32_t quarterPoints = num_points / 4;
  
-  __m128 indexIncrementValues = _mm_set1_ps(4);
-  __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
+    float* inputPtr = (float*)src0;
  
-  float max = src0[0];
-  float index = 0;
-  __m128 maxValues = _mm_set1_ps(max);
-  __m128 maxValuesIndex = _mm_setzero_ps();
-  __m128 compareResults;
-  __m128 currentValues;
+    __m128 indexIncrementValues = _mm_set1_ps(4);
+    __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
  
-  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
-  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+    float max = src0[0];
+    float index = 0;
+    __m128 maxValues = _mm_set1_ps(max);
+    __m128 maxValuesIndex = _mm_setzero_ps();
+    __m128 compareResults;
+    __m128 currentValues;
  
-  for(;number < quarterPoints; number++){
+    __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+    __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
  
-    currentValues  = _mm_load_ps(inputPtr); inputPtr += 4;
-    currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+    for (; number < quarterPoints; number++) {
  
-    compareResults = _mm_cmpgt_ps(currentValues, maxValues);
+        currentValues = _mm_load_ps(inputPtr);
+        inputPtr += 4;
+        currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
  
-    maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
-    maxValues      = _mm_blendv_ps(maxValues, currentValues, compareResults);
-  }
+        compareResults = _mm_cmpgt_ps(currentValues, maxValues);
  
-  // Calculate the largest value from the remaining 4 points
-  _mm_store_ps(maxValuesBuffer, maxValues);
-  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+        maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
+        maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
+    }
  
-  for(number = 0; number < 4; number++){
-    if(maxValuesBuffer[number] > max){
-      index = maxIndexesBuffer[number];
-      max = maxValuesBuffer[number];
-    } else if(maxValuesBuffer[number] == max){
-      if (index > maxIndexesBuffer[number])
-        index = maxIndexesBuffer[number];
+    // Calculate the largest value from the remaining 4 points
+    _mm_store_ps(maxValuesBuffer, maxValues);
+    _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+    for (number = 0; number < 4; number++) {
+        if (maxValuesBuffer[number] > max) {
+            index = maxIndexesBuffer[number];
+            max = maxValuesBuffer[number];
+        } else if (maxValuesBuffer[number] == max) {
+            if (index > maxIndexesBuffer[number])
+                index = maxIndexesBuffer[number];
+        }
      }
-  }
  
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    if(src0[number] > max){
-      index = number;
-      max = src0[number];
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        if (src0[number] > max) {
+            index = number;
+            max = src0[number];
+        }
      }
-  }
-  target[0] = (uint16_t)index;
+    target[0] = (uint16_t)index;
  }
  
  #endif /*LV_HAVE_SSE4_1*/
@@ -211,64 +210,64 @@ volk_32f_index_max_16u_a_sse4_1(uint16_t* target, const float* src0,
  #include <xmmintrin.h>
  
  static inline void
-volk_32f_index_max_16u_a_sse(uint16_t* target, const float* src0,
-                             uint32_t num_points)
+volk_32f_index_max_16u_a_sse(uint16_t* target, const float* src0, uint32_t num_points)
  {
-  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
-
-  uint32_t number = 0;
-  const uint32_t quarterPoints = num_points / 4;
+    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
  
-  float* inputPtr = (float*)src0;
+    uint32_t number = 0;
+    const uint32_t quarterPoints = num_points / 4;
  
-  __m128 indexIncrementValues = _mm_set1_ps(4);
-  __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
+    float* inputPtr = (float*)src0;
  
-  float max = src0[0];
-  float index = 0;
-  __m128 maxValues = _mm_set1_ps(max);
-  __m128 maxValuesIndex = _mm_setzero_ps();
-  __m128 compareResults;
-  __m128 currentValues;
+    __m128 indexIncrementValues = _mm_set1_ps(4);
+    __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
  
-  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
-  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+    float max = src0[0];
+    float index = 0;
+    __m128 maxValues = _mm_set1_ps(max);
+    __m128 maxValuesIndex = _mm_setzero_ps();
+    __m128 compareResults;
+    __m128 currentValues;
  
-  for(;number < quarterPoints; number++){
+    __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+    __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
  
-    currentValues  = _mm_load_ps(inputPtr); inputPtr += 4;
-    currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+    for (; number < quarterPoints; number++) {
  
-    compareResults = _mm_cmpgt_ps(currentValues, maxValues);
+        currentValues = _mm_load_ps(inputPtr);
+        inputPtr += 4;
+        currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
  
-    maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
-                               _mm_andnot_ps(compareResults, maxValuesIndex));
-    maxValues      = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
-                               _mm_andnot_ps(compareResults, maxValues));
-  }
+        compareResults = _mm_cmpgt_ps(currentValues, maxValues);
  
-  // Calculate the largest value from the remaining 4 points
-  _mm_store_ps(maxValuesBuffer, maxValues);
-  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+        maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
+                                   _mm_andnot_ps(compareResults, maxValuesIndex));
+        maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
+                              _mm_andnot_ps(compareResults, maxValues));
+    }
  
-  for(number = 0; number < 4; number++){
-    if(maxValuesBuffer[number] > max){
-      index = maxIndexesBuffer[number];
-      max = maxValuesBuffer[number];
-    } else if(maxValuesBuffer[number] == max){
-      if (index > maxIndexesBuffer[number])
-        index = maxIndexesBuffer[number];
+    // Calculate the largest value from the remaining 4 points
+    _mm_store_ps(maxValuesBuffer, maxValues);
+    _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+    for (number = 0; number < 4; number++) {
+        if (maxValuesBuffer[number] > max) {
+            index = maxIndexesBuffer[number];
+            max = maxValuesBuffer[number];
+        } else if (maxValuesBuffer[number] == max) {
+            if (index > maxIndexesBuffer[number])
+                index = maxIndexesBuffer[number];
+        }
      }
-  }
  
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    if(src0[number] > max){
-      index = number;
-      max = src0[number];
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        if (src0[number] > max) {
+            index = number;
+            max = src0[number];
+        }
      }
-  }
-  target[0] = (uint16_t)index;
+    target[0] = (uint16_t)index;
  }
  
  #endif /*LV_HAVE_SSE*/
@@ -277,23 +276,22 @@ volk_32f_index_max_16u_a_sse(uint16_t* target, const float* src0,
  #ifdef LV_HAVE_GENERIC
  
  static inline void
-volk_32f_index_max_16u_generic(uint16_t* target, const float* src0,
-                               uint32_t num_points)
+volk_32f_index_max_16u_generic(uint16_t* target, const float* src0, uint32_t num_points)
  {
-  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
  
-  float max = src0[0];
-  uint16_t index = 0;
+    float max = src0[0];
+    uint16_t index = 0;
  
-  uint32_t i = 1;
+    uint32_t i = 1;
  
-  for(; i < num_points; ++i) {
-    if(src0[i] > max) {
-      index = i;
-      max = src0[i];
+    for (; i < num_points; ++i) {
+        if (src0[i] > max) {
+            index = i;
+            max = src0[i];
+        }
      }
-  }
-  target[0] = index;
+    target[0] = index;
  }
  
  #endif /*LV_HAVE_GENERIC*/
@@ -302,76 +300,74 @@ volk_32f_index_max_16u_generic(uint16_t* target, const float* src0,
  #endif /*INCLUDED_volk_32f_index_max_16u_a_H*/
  
  
-
  #ifndef INCLUDED_volk_32f_index_max_16u_u_H
  #define INCLUDED_volk_32f_index_max_16u_u_H
  
-#include <volk/volk_common.h>
-#include <volk/volk_common.h>
  #include <inttypes.h>
  #include <limits.h>
  #include <stdio.h>
+#include <volk/volk_common.h>
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
  static inline void
-volk_32f_index_max_16u_u_avx(uint16_t* target, const float* src0,
-                                uint32_t num_points)
+volk_32f_index_max_16u_u_avx(uint16_t* target, const float* src0, uint32_t num_points)
  {
-  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
-
-  uint32_t number = 0;
-  const uint32_t eighthPoints = num_points / 8;
+    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
  
-  float* inputPtr = (float*)src0;
+    uint32_t number = 0;
+    const uint32_t eighthPoints = num_points / 8;
  
-  __m256 indexIncrementValues = _mm256_set1_ps(8);
-  __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
+    float* inputPtr = (float*)src0;
  
-  float max = src0[0];
-  float index = 0;
-  __m256 maxValues = _mm256_set1_ps(max);
-  __m256 maxValuesIndex = _mm256_setzero_ps();
-  __m256 compareResults;
-  __m256 currentValues;
+    __m256 indexIncrementValues = _mm256_set1_ps(8);
+    __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
  
-  __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
-  __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
+    float max = src0[0];
+    float index = 0;
+    __m256 maxValues = _mm256_set1_ps(max);
+    __m256 maxValuesIndex = _mm256_setzero_ps();
+    __m256 compareResults;
+    __m256 currentValues;
  
-  for(;number < eighthPoints; number++){
+    __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
+    __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
  
-    currentValues  = _mm256_loadu_ps(inputPtr); inputPtr += 8;
-    currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
+    for (; number < eighthPoints; number++) {
  
-    compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
+        currentValues = _mm256_loadu_ps(inputPtr);
+        inputPtr += 8;
+        currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
  
-    maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
-    maxValues      = _mm256_blendv_ps(maxValues, currentValues, compareResults);
-  }
+        compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
  
-  // Calculate the largest value from the remaining 4 points
-  _mm256_storeu_ps(maxValuesBuffer, maxValues);
-  _mm256_storeu_ps(maxIndexesBuffer, maxValuesIndex);
+        maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
+        maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
+    }
  
-  for(number = 0; number < 8; number++){
-    if(maxValuesBuffer[number] > max){
-      index = maxIndexesBuffer[number];
-      max = maxValuesBuffer[number];
-    } else if(maxValuesBuffer[number] == max){
-      if (index > maxIndexesBuffer[number])
-        index = maxIndexesBuffer[number];
+    // Calculate the largest value from the remaining 4 points
+    _mm256_storeu_ps(maxValuesBuffer, maxValues);
+    _mm256_storeu_ps(maxIndexesBuffer, maxValuesIndex);
+
+    for (number = 0; number < 8; number++) {
+        if (maxValuesBuffer[number] > max) {
+            index = maxIndexesBuffer[number];
+            max = maxValuesBuffer[number];
+        } else if (maxValuesBuffer[number] == max) {
+            if (index > maxIndexesBuffer[number])
+                index = maxIndexesBuffer[number];
+        }
      }
-  }
  
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    if(src0[number] > max){
-      index = number;
-      max = src0[number];
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        if (src0[number] > max) {
+            index = number;
+            max = src0[number];
+        }
      }
-  }
-  target[0] = (uint16_t)index;
+    target[0] = (uint16_t)index;
  }
  
  #endif /*LV_HAVE_AVX*/
diff --git a/kernels/volk/volk_32f_index_max_32u.h b/kernels/volk/volk_32f_index_max_32u.h

index 318c8e4ec06f34713282a7d22ec11164b2c9d4cd..315531d958c63767ca892fb00806081e4f875869 100644 (file)
--- a/kernels/volk/volk_32f_index_max_32u.h
+++ b/kernels/volk/volk_32f_index_max_32u.h
@@ -25,7 +25,8 @@
   *
   * \b Overview
   *
- * Returns Argmax_i x[i]. Finds and returns the index which contains the first maximum value in the given vector.
+ * Returns Argmax_i x[i]. Finds and returns the index which contains the first maximum
+ * value in the given vector.
   *
   * <b>Dispatcher Prototype</b>
   * \code
@@ -64,70 +65,71 @@
  #ifndef INCLUDED_volk_32f_index_max_32u_a_H
  #define INCLUDED_volk_32f_index_max_32u_a_H
  
-#include <volk/volk_common.h>
-#include <volk/volk_common.h>
  #include <inttypes.h>
  #include <stdio.h>
+#include <volk/volk_common.h>
  
  #ifdef LV_HAVE_SSE4_1
-#include<smmintrin.h>
+#include <smmintrin.h>
  
  static inline void
  volk_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
  {
-  if(num_points > 0){
-    uint32_t number = 0;
-    const uint32_t quarterPoints = num_points / 4;
+    if (num_points > 0) {
+        uint32_t number = 0;
+        const uint32_t quarterPoints = num_points / 4;
  
-    float* inputPtr = (float*)src0;
+        float* inputPtr = (float*)src0;
  
-    __m128 indexIncrementValues = _mm_set1_ps(4);
-    __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
+        __m128 indexIncrementValues = _mm_set1_ps(4);
+        __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
  
-    float max = src0[0];
-    float index = 0;
-    __m128 maxValues = _mm_set1_ps(max);
-    __m128 maxValuesIndex = _mm_setzero_ps();
-    __m128 compareResults;
-    __m128 currentValues;
+        float max = src0[0];
+        float index = 0;
+        __m128 maxValues = _mm_set1_ps(max);
+        __m128 maxValuesIndex = _mm_setzero_ps();
+        __m128 compareResults;
+        __m128 currentValues;
  
-    __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
-    __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+        __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+        __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
  
-    for(;number < quarterPoints; number++){
+        for (; number < quarterPoints; number++) {
  
-      currentValues  = _mm_load_ps(inputPtr); inputPtr += 4;
-      currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+            currentValues = _mm_load_ps(inputPtr);
+            inputPtr += 4;
+            currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
  
-      compareResults = _mm_cmpgt_ps(currentValues, maxValues);
+            compareResults = _mm_cmpgt_ps(currentValues, maxValues);
  
-      maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
-      maxValues      = _mm_blendv_ps(maxValues, currentValues, compareResults);
-    }
+            maxValuesIndex =
+                _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
+            maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
+        }
  
-    // Calculate the largest value from the remaining 4 points
-    _mm_store_ps(maxValuesBuffer, maxValues);
-    _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
-
-    for(number = 0; number < 4; number++){
-      if(maxValuesBuffer[number] > max){
-       index = maxIndexesBuffer[number];
-       max = maxValuesBuffer[number];
-      } else if(maxValuesBuffer[number] == max){
-        if (index > maxIndexesBuffer[number])
-          index = maxIndexesBuffer[number];
-      }
-    }
+        // Calculate the largest value from the remaining 4 points
+        _mm_store_ps(maxValuesBuffer, maxValues);
+        _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+        for (number = 0; number < 4; number++) {
+            if (maxValuesBuffer[number] > max) {
+                index = maxIndexesBuffer[number];
+                max = maxValuesBuffer[number];
+            } else if (maxValuesBuffer[number] == max) {
+                if (index > maxIndexesBuffer[number])
+                    index = maxIndexesBuffer[number];
+            }
+        }
  
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      if(src0[number] > max){
-       index = number;
-       max = src0[number];
-      }
+        number = quarterPoints * 4;
+        for (; number < num_points; number++) {
+            if (src0[number] > max) {
+                index = number;
+                max = src0[number];
+            }
+        }
+        target[0] = (uint32_t)index;
      }
-    target[0] = (uint32_t)index;
-  }
  }
  
  #endif /*LV_HAVE_SSE4_1*/
@@ -135,67 +137,68 @@ volk_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t nu
  
  #ifdef LV_HAVE_SSE
  
-#include<xmmintrin.h>
+#include <xmmintrin.h>
  
  static inline void
  volk_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points)
  {
-  if(num_points > 0){
-    uint32_t number = 0;
-    const uint32_t quarterPoints = num_points / 4;
+    if (num_points > 0) {
+        uint32_t number = 0;
+        const uint32_t quarterPoints = num_points / 4;
  
-    float* inputPtr = (float*)src0;
+        float* inputPtr = (float*)src0;
  
-    __m128 indexIncrementValues = _mm_set1_ps(4);
-    __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
+        __m128 indexIncrementValues = _mm_set1_ps(4);
+        __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
  
-    float max = src0[0];
-    float index = 0;
-    __m128 maxValues = _mm_set1_ps(max);
-    __m128 maxValuesIndex = _mm_setzero_ps();
-    __m128 compareResults;
-    __m128 currentValues;
+        float max = src0[0];
+        float index = 0;
+        __m128 maxValues = _mm_set1_ps(max);
+        __m128 maxValuesIndex = _mm_setzero_ps();
+        __m128 compareResults;
+        __m128 currentValues;
  
-    __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
-    __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+        __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+        __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
  
-    for(;number < quarterPoints; number++){
+        for (; number < quarterPoints; number++) {
  
-      currentValues  = _mm_load_ps(inputPtr); inputPtr += 4;
-      currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+            currentValues = _mm_load_ps(inputPtr);
+            inputPtr += 4;
+            currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
  
-      compareResults = _mm_cmpgt_ps(currentValues, maxValues);
+            compareResults = _mm_cmpgt_ps(currentValues, maxValues);
  
-      maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
-                                 _mm_andnot_ps(compareResults, maxValuesIndex));
+            maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
+                                       _mm_andnot_ps(compareResults, maxValuesIndex));
  
-      maxValues      = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
-                                 _mm_andnot_ps(compareResults, maxValues));
-    }
+            maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
+                                  _mm_andnot_ps(compareResults, maxValues));
+        }
  
-    // Calculate the largest value from the remaining 4 points
-    _mm_store_ps(maxValuesBuffer, maxValues);
-    _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
-
-    for(number = 0; number < 4; number++){
-      if(maxValuesBuffer[number] > max){
-       index = maxIndexesBuffer[number];
-       max = maxValuesBuffer[number];
-      } else if(maxValuesBuffer[number] == max){
-        if (index > maxIndexesBuffer[number])
-          index = maxIndexesBuffer[number];
-      }
-    }
+        // Calculate the largest value from the remaining 4 points
+        _mm_store_ps(maxValuesBuffer, maxValues);
+        _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+        for (number = 0; number < 4; number++) {
+            if (maxValuesBuffer[number] > max) {
+                index = maxIndexesBuffer[number];
+                max = maxValuesBuffer[number];
+            } else if (maxValuesBuffer[number] == max) {
+                if (index > maxIndexesBuffer[number])
+                    index = maxIndexesBuffer[number];
+            }
+        }
  
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      if(src0[number] > max){
-       index = number;
-       max = src0[number];
-      }
+        number = quarterPoints * 4;
+        for (; number < num_points; number++) {
+            if (src0[number] > max) {
+                index = number;
+                max = src0[number];
+            }
+        }
+        target[0] = (uint32_t)index;
      }
-    target[0] = (uint32_t)index;
-  }
  }
  
  #endif /*LV_HAVE_SSE*/
@@ -204,65 +207,61 @@ volk_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_p
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void volk_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points)
+static inline void
+volk_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points)
  {
-    if(num_points > 0)
-        {
-            uint32_t number = 0;
-            const uint32_t quarterPoints = num_points / 8;
-
-            float* inputPtr = (float*)src0;
-
-            __m256 indexIncrementValues = _mm256_set1_ps(8);
-            __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
-
-            float max = src0[0];
-            float index = 0;
-            __m256 maxValues = _mm256_set1_ps(max);
-            __m256 maxValuesIndex = _mm256_setzero_ps();
-            __m256 compareResults;
-            __m256 currentValues;
-
-            __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
-            __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
-
-            for(;number < quarterPoints; number++)
-                {
-                    currentValues  = _mm256_load_ps(inputPtr); inputPtr += 8;
-                    currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
-                    compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
-                    maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
-                    maxValues      = _mm256_blendv_ps(maxValues, currentValues, compareResults);
-                }
-
-            // Calculate the largest value from the remaining 8 points
-            _mm256_store_ps(maxValuesBuffer, maxValues);
-            _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
-
-            for(number = 0; number < 8; number++)
-                {
-                    if(maxValuesBuffer[number] > max)
-                        {
-                            index = maxIndexesBuffer[number];
-                            max = maxValuesBuffer[number];
-                        }
-                    else if(maxValuesBuffer[number] == max){
-                      if (index > maxIndexesBuffer[number])
-                        index = maxIndexesBuffer[number];
-                    }
-                }
-
-            number = quarterPoints * 8;
-            for(;number < num_points; number++)
-                {
-                    if(src0[number] > max)
-                        {
-                            index = number;
-                            max = src0[number];
-                        }
-                }
-            target[0] = (uint32_t)index;
+    if (num_points > 0) {
+        uint32_t number = 0;
+        const uint32_t quarterPoints = num_points / 8;
+
+        float* inputPtr = (float*)src0;
+
+        __m256 indexIncrementValues = _mm256_set1_ps(8);
+        __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
+
+        float max = src0[0];
+        float index = 0;
+        __m256 maxValues = _mm256_set1_ps(max);
+        __m256 maxValuesIndex = _mm256_setzero_ps();
+        __m256 compareResults;
+        __m256 currentValues;
+
+        __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
+        __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
+
+        for (; number < quarterPoints; number++) {
+            currentValues = _mm256_load_ps(inputPtr);
+            inputPtr += 8;
+            currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
+            compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
+            maxValuesIndex =
+                _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
+            maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
+        }
+
+        // Calculate the largest value from the remaining 8 points
+        _mm256_store_ps(maxValuesBuffer, maxValues);
+        _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+        for (number = 0; number < 8; number++) {
+            if (maxValuesBuffer[number] > max) {
+                index = maxIndexesBuffer[number];
+                max = maxValuesBuffer[number];
+            } else if (maxValuesBuffer[number] == max) {
+                if (index > maxIndexesBuffer[number])
+                    index = maxIndexesBuffer[number];
+            }
+        }
+
+        number = quarterPoints * 8;
+        for (; number < num_points; number++) {
+            if (src0[number] > max) {
+                index = number;
+                max = src0[number];
+            }
          }
+        target[0] = (uint32_t)index;
+    }
  }
  
  #endif /*LV_HAVE_AVX*/
@@ -271,66 +270,63 @@ static inline void volk_32f_index_max_32u_a_avx(uint32_t* target, const float* s
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void volk_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points)
+static inline void
+volk_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points)
  {
-    if(num_points > 0)
-        {
-            uint32_t number = 0;
-            const uint32_t quarterPoints = num_points / 4;
-
-            float* inputPtr = (float*)src0;
-            float32x4_t indexIncrementValues = vdupq_n_f32(4);
-            __VOLK_ATTR_ALIGNED(16) float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f };
-            float32x4_t currentIndexes = vld1q_f32(currentIndexes_float);
-
-            float max = src0[0];
-            float index = 0;
-            float32x4_t maxValues = vdupq_n_f32(max);
-            uint32x4_t maxValuesIndex = vmovq_n_u32(0);
-            uint32x4_t compareResults;
-            uint32x4_t currentIndexes_u;
-            float32x4_t currentValues;
-
-            __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
-            __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
-
-            for(;number < quarterPoints; number++)
-                {
-                    currentValues    = vld1q_f32(inputPtr); inputPtr += 4;
-                    currentIndexes   = vaddq_f32(currentIndexes, indexIncrementValues);
-                    currentIndexes_u = vcvtq_u32_f32(currentIndexes);
-                    compareResults   = vcleq_f32(currentValues, maxValues);
-                    maxValuesIndex   = vorrq_u32( vandq_u32( compareResults, maxValuesIndex ), vbicq_u32(currentIndexes_u, compareResults) );
-                    maxValues        = vmaxq_f32(currentValues, maxValues);
-                }
-
-            // Calculate the largest value from the remaining 4 points
-            vst1q_f32(maxValuesBuffer, maxValues);
-            vst1q_f32(maxIndexesBuffer, vcvtq_f32_u32(maxValuesIndex));
-            for(number = 0; number < 4; number++)
-                {
-                    if(maxValuesBuffer[number] > max)
-                        {
-                            index = maxIndexesBuffer[number];
-                            max = maxValuesBuffer[number];
-                        }
-                    else if(maxValues[number] == max){
-                      if (index > maxIndexesBuffer[number])
-                        index = maxIndexesBuffer[number];
-                    }
-                }
-
-            number = quarterPoints * 4;
-            for(;number < num_points; number++)
-                {
-                    if(src0[number] > max)
-                        {
-                            index = number;
-                            max = src0[number];
-                        }
-                }
-            target[0] = (uint32_t)index;
+    if (num_points > 0) {
+        uint32_t number = 0;
+        const uint32_t quarterPoints = num_points / 4;
+
+        float* inputPtr = (float*)src0;
+        float32x4_t indexIncrementValues = vdupq_n_f32(4);
+        __VOLK_ATTR_ALIGNED(16)
+        float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f };
+        float32x4_t currentIndexes = vld1q_f32(currentIndexes_float);
+
+        float max = src0[0];
+        float index = 0;
+        float32x4_t maxValues = vdupq_n_f32(max);
+        uint32x4_t maxValuesIndex = vmovq_n_u32(0);
+        uint32x4_t compareResults;
+        uint32x4_t currentIndexes_u;
+        float32x4_t currentValues;
+
+        __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+        __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+        for (; number < quarterPoints; number++) {
+            currentValues = vld1q_f32(inputPtr);
+            inputPtr += 4;
+            currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues);
+            currentIndexes_u = vcvtq_u32_f32(currentIndexes);
+            compareResults = vcleq_f32(currentValues, maxValues);
+            maxValuesIndex = vorrq_u32(vandq_u32(compareResults, maxValuesIndex),
+                                       vbicq_u32(currentIndexes_u, compareResults));
+            maxValues = vmaxq_f32(currentValues, maxValues);
+        }
+
+        // Calculate the largest value from the remaining 4 points
+        vst1q_f32(maxValuesBuffer, maxValues);
+        vst1q_f32(maxIndexesBuffer, vcvtq_f32_u32(maxValuesIndex));
+        for (number = 0; number < 4; number++) {
+            if (maxValuesBuffer[number] > max) {
+                index = maxIndexesBuffer[number];
+                max = maxValuesBuffer[number];
+            } else if (maxValues[number] == max) {
+                if (index > maxIndexesBuffer[number])
+                    index = maxIndexesBuffer[number];
+            }
+        }
+
+        number = quarterPoints * 4;
+        for (; number < num_points; number++) {
+            if (src0[number] > max) {
+                index = number;
+                max = src0[number];
+            }
          }
+        target[0] = (uint32_t)index;
+    }
  }
  
  #endif /*LV_HAVE_NEON*/
@@ -341,20 +337,20 @@ static inline void volk_32f_index_max_32u_neon(uint32_t* target, const float* sr
  static inline void
  volk_32f_index_max_32u_generic(uint32_t* target, const float* src0, uint32_t num_points)
  {
-  if(num_points > 0){
-    float max = src0[0];
-    uint32_t index = 0;
+    if (num_points > 0) {
+        float max = src0[0];
+        uint32_t index = 0;
  
-    uint32_t i = 1;
+        uint32_t i = 1;
  
-    for(; i < num_points; ++i) {
-      if(src0[i] > max){
-        index = i;
-        max = src0[i];
-      }
+        for (; i < num_points; ++i) {
+            if (src0[i] > max) {
+                index = i;
+                max = src0[i];
+            }
+        }
+        target[0] = index;
      }
-    target[0] = index;
-  }
  }
  
  #endif /*LV_HAVE_GENERIC*/
@@ -366,209 +362,195 @@ volk_32f_index_max_32u_generic(uint32_t* target, const float* src0, uint32_t num
  #ifndef INCLUDED_volk_32f_index_max_32u_u_H
  #define INCLUDED_volk_32f_index_max_32u_u_H
  
-#include <volk/volk_common.h>
-#include <volk/volk_common.h>
  #include <inttypes.h>
  #include <stdio.h>
+#include <volk/volk_common.h>
  
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void volk_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points)
+static inline void
+volk_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points)
  {
-    if(num_points > 0)
-        {
-            uint32_t number = 0;
-            const uint32_t quarterPoints = num_points / 8;
-
-            float* inputPtr = (float*)src0;
-
-            __m256 indexIncrementValues = _mm256_set1_ps(8);
-            __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
-
-            float max = src0[0];
-            float index = 0;
-            __m256 maxValues = _mm256_set1_ps(max);
-            __m256 maxValuesIndex = _mm256_setzero_ps();
-            __m256 compareResults;
-            __m256 currentValues;
-
-            __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
-            __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
-
-            for(;number < quarterPoints; number++)
-                {
-                    currentValues  = _mm256_loadu_ps(inputPtr); inputPtr += 8;
-                    currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
-                    compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
-                    maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
-                    maxValues      = _mm256_blendv_ps(maxValues, currentValues, compareResults);
-                }
-
-            // Calculate the largest value from the remaining 8 points
-            _mm256_store_ps(maxValuesBuffer, maxValues);
-            _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
-
-            for(number = 0; number < 8; number++)
-                {
-                    if(maxValuesBuffer[number] > max)
-                        {
-                            index = maxIndexesBuffer[number];
-                            max = maxValuesBuffer[number];
-                        }
-                    else if(maxValuesBuffer[number] == max){
-                      if (index > maxIndexesBuffer[number])
-                        index = maxIndexesBuffer[number];
-                    }
-                }
-
-            number = quarterPoints * 8;
-            for(;number < num_points; number++)
-                {
-                    if(src0[number] > max)
-                        {
-                            index = number;
-                            max = src0[number];
-                        }
-                }
-            target[0] = (uint32_t)index;
+    if (num_points > 0) {
+        uint32_t number = 0;
+        const uint32_t quarterPoints = num_points / 8;
+
+        float* inputPtr = (float*)src0;
+
+        __m256 indexIncrementValues = _mm256_set1_ps(8);
+        __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
+
+        float max = src0[0];
+        float index = 0;
+        __m256 maxValues = _mm256_set1_ps(max);
+        __m256 maxValuesIndex = _mm256_setzero_ps();
+        __m256 compareResults;
+        __m256 currentValues;
+
+        __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
+        __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
+
+        for (; number < quarterPoints; number++) {
+            currentValues = _mm256_loadu_ps(inputPtr);
+            inputPtr += 8;
+            currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
+            compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
+            maxValuesIndex =
+                _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
+            maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
          }
+
+        // Calculate the largest value from the remaining 8 points
+        _mm256_store_ps(maxValuesBuffer, maxValues);
+        _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+        for (number = 0; number < 8; number++) {
+            if (maxValuesBuffer[number] > max) {
+                index = maxIndexesBuffer[number];
+                max = maxValuesBuffer[number];
+            } else if (maxValuesBuffer[number] == max) {
+                if (index > maxIndexesBuffer[number])
+                    index = maxIndexesBuffer[number];
+            }
+        }
+
+        number = quarterPoints * 8;
+        for (; number < num_points; number++) {
+            if (src0[number] > max) {
+                index = number;
+                max = src0[number];
+            }
+        }
+        target[0] = (uint32_t)index;
+    }
  }
  
  #endif /*LV_HAVE_AVX*/
  
  
  #ifdef LV_HAVE_SSE4_1
-#include<smmintrin.h>
+#include <smmintrin.h>
  
-static inline void volk_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
+static inline void
+volk_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
  {
-    if(num_points > 0)
-        {
-            uint32_t number = 0;
-            const uint32_t quarterPoints = num_points / 4;
-
-            float* inputPtr = (float*)src0;
-
-            __m128 indexIncrementValues = _mm_set1_ps(4);
-            __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
-
-            float max = src0[0];
-            float index = 0;
-            __m128 maxValues = _mm_set1_ps(max);
-            __m128 maxValuesIndex = _mm_setzero_ps();
-            __m128 compareResults;
-            __m128 currentValues;
-
-            __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
-            __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
-
-            for(;number < quarterPoints; number++)
-                {
-                    currentValues  = _mm_loadu_ps(inputPtr); inputPtr += 4;
-                    currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
-                    compareResults = _mm_cmpgt_ps(currentValues, maxValues);
-                    maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
-                    maxValues      = _mm_blendv_ps(maxValues, currentValues, compareResults);
-                }
-
-            // Calculate the largest value from the remaining 4 points
-            _mm_store_ps(maxValuesBuffer, maxValues);
-            _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
-
-            for(number = 0; number < 4; number++)
-                {
-                    if(maxValuesBuffer[number] > max)
-                        {
-                            index = maxIndexesBuffer[number];
-                            max = maxValuesBuffer[number];
-                        }
-                    else if(maxValuesBuffer[number] == max){
-                      if (index > maxIndexesBuffer[number])
-                        index = maxIndexesBuffer[number];
-                    }
-                }
-
-            number = quarterPoints * 4;
-            for(;number < num_points; number++)
-                {
-                    if(src0[number] > max)
-                        {
-                            index = number;
-                            max = src0[number];
-                        }
-                }
-            target[0] = (uint32_t)index;
+    if (num_points > 0) {
+        uint32_t number = 0;
+        const uint32_t quarterPoints = num_points / 4;
+
+        float* inputPtr = (float*)src0;
+
+        __m128 indexIncrementValues = _mm_set1_ps(4);
+        __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
+
+        float max = src0[0];
+        float index = 0;
+        __m128 maxValues = _mm_set1_ps(max);
+        __m128 maxValuesIndex = _mm_setzero_ps();
+        __m128 compareResults;
+        __m128 currentValues;
+
+        __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+        __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+        for (; number < quarterPoints; number++) {
+            currentValues = _mm_loadu_ps(inputPtr);
+            inputPtr += 4;
+            currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+            compareResults = _mm_cmpgt_ps(currentValues, maxValues);
+            maxValuesIndex =
+                _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
+            maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
          }
+
+        // Calculate the largest value from the remaining 4 points
+        _mm_store_ps(maxValuesBuffer, maxValues);
+        _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+        for (number = 0; number < 4; number++) {
+            if (maxValuesBuffer[number] > max) {
+                index = maxIndexesBuffer[number];
+                max = maxValuesBuffer[number];
+            } else if (maxValuesBuffer[number] == max) {
+                if (index > maxIndexesBuffer[number])
+                    index = maxIndexesBuffer[number];
+            }
+        }
+
+        number = quarterPoints * 4;
+        for (; number < num_points; number++) {
+            if (src0[number] > max) {
+                index = number;
+                max = src0[number];
+            }
+        }
+        target[0] = (uint32_t)index;
+    }
  }
  
  #endif /*LV_HAVE_SSE4_1*/
  
  #ifdef LV_HAVE_SSE
-#include<xmmintrin.h>
+#include <xmmintrin.h>
  
-static inline void volk_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points)
+static inline void
+volk_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points)
  {
-    if(num_points > 0)
-        {
-            uint32_t number = 0;
-            const uint32_t quarterPoints = num_points / 4;
-
-            float* inputPtr = (float*)src0;
-
-            __m128 indexIncrementValues = _mm_set1_ps(4);
-            __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
-
-            float max = src0[0];
-            float index = 0;
-            __m128 maxValues = _mm_set1_ps(max);
-            __m128 maxValuesIndex = _mm_setzero_ps();
-            __m128 compareResults;
-            __m128 currentValues;
-
-            __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
-            __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
-
-            for(;number < quarterPoints; number++)
-                {
-                    currentValues  = _mm_loadu_ps(inputPtr); inputPtr += 4;
-                    currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
-                    compareResults = _mm_cmpgt_ps(currentValues, maxValues);
-                    maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
-                                               _mm_andnot_ps(compareResults, maxValuesIndex));
-                    maxValues      = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
-                                               _mm_andnot_ps(compareResults, maxValues));
-                }
-
-            // Calculate the largest value from the remaining 4 points
-            _mm_store_ps(maxValuesBuffer, maxValues);
-            _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
-
-            for(number = 0; number < 4; number++)
-                {
-                    if(maxValuesBuffer[number] > max)
-                        {
-                            index = maxIndexesBuffer[number];
-                            max = maxValuesBuffer[number];
-                        }
-                    else if(maxValuesBuffer[number] == max){
-                      if (index > maxIndexesBuffer[number])
-                        index = maxIndexesBuffer[number];
-                    }
-                }
-
-            number = quarterPoints * 4;
-            for(;number < num_points; number++)
-                {
-                    if(src0[number] > max)
-                        {
-                            index = number;
-                            max = src0[number];
-                        }
-                }
-            target[0] = (uint32_t)index;
+    if (num_points > 0) {
+        uint32_t number = 0;
+        const uint32_t quarterPoints = num_points / 4;
+
+        float* inputPtr = (float*)src0;
+
+        __m128 indexIncrementValues = _mm_set1_ps(4);
+        __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
+
+        float max = src0[0];
+        float index = 0;
+        __m128 maxValues = _mm_set1_ps(max);
+        __m128 maxValuesIndex = _mm_setzero_ps();
+        __m128 compareResults;
+        __m128 currentValues;
+
+        __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+        __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+        for (; number < quarterPoints; number++) {
+            currentValues = _mm_loadu_ps(inputPtr);
+            inputPtr += 4;
+            currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+            compareResults = _mm_cmpgt_ps(currentValues, maxValues);
+            maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
+                                       _mm_andnot_ps(compareResults, maxValuesIndex));
+            maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
+                                  _mm_andnot_ps(compareResults, maxValues));
          }
+
+        // Calculate the largest value from the remaining 4 points
+        _mm_store_ps(maxValuesBuffer, maxValues);
+        _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+        for (number = 0; number < 4; number++) {
+            if (maxValuesBuffer[number] > max) {
+                index = maxIndexesBuffer[number];
+                max = maxValuesBuffer[number];
+            } else if (maxValuesBuffer[number] == max) {
+                if (index > maxIndexesBuffer[number])
+                    index = maxIndexesBuffer[number];
+            }
+        }
+
+        number = quarterPoints * 4;
+        for (; number < num_points; number++) {
+            if (src0[number] > max) {
+                index = number;
+                max = src0[number];
+            }
+        }
+        target[0] = (uint32_t)index;
+    }
  }
  
  #endif /*LV_HAVE_SSE*/
diff --git a/kernels/volk/volk_32f_invsqrt_32f.h b/kernels/volk/volk_32f_invsqrt_32f.h

index e416321feced72ee75c13296f93d947f16b646c5..e545515936e4f73ea8dcc47ca8f330fcd8d87732 100644 (file)
--- a/kernels/volk/volk_32f_invsqrt_32f.h
+++ b/kernels/volk/volk_32f_invsqrt_32f.h
@@ -30,8 +30,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32f_invsqrt_32f(float* cVector, const float* aVector, unsigned int num_points)
- * \endcode
+ * void volk_32f_invsqrt_32f(float* cVector, const float* aVector, unsigned int
+ * num_points) \endcode
   *
   * \b Inputs
   * \li aVector: the input vector of floats.
@@ -66,27 +66,27 @@
  #define INCLUDED_volk_32f_invsqrt_32f_a_H
  
  #include <inttypes.h>
-#include <stdio.h>
  #include <math.h>
+#include <stdio.h>
  #include <string.h>
  
-static inline float
-Q_rsqrt(float number)
+static inline float Q_rsqrt(float number)
  {
-  float x2;
-  const float threehalfs = 1.5F;
-  union f32_to_i32 {
-    int32_t i;
-    float f;
-  } u;
-
-  x2 = number * 0.5F;
-  u.f = number;
-  u.i = 0x5f3759df - ( u.i >> 1 );                   // what the fuck?
-  u.f = u.f * ( threehalfs - ( x2 * u.f * u.f ) );   // 1st iteration
-  //u.f  = u.f * ( threehalfs - ( x2 * u.f * u.f ) );   // 2nd iteration, this can be removed
-
-  return u.f;
+    float x2;
+    const float threehalfs = 1.5F;
+    union f32_to_i32 {
+        int32_t i;
+        float f;
+    } u;
+
+    x2 = number * 0.5F;
+    u.f = number;
+    u.i = 0x5f3759df - (u.i >> 1);               // what the fuck?
+    u.f = u.f * (threehalfs - (x2 * u.f * u.f)); // 1st iteration
+    // u.f  = u.f * ( threehalfs - ( x2 * u.f * u.f ) );   // 2nd iteration, this can be
+    // removed
+
+    return u.f;
  }
  
  #ifdef LV_HAVE_AVX
@@ -95,24 +95,23 @@ Q_rsqrt(float number)
  static inline void
  volk_32f_invsqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  __m256 aVal, cVal;
-  for (; number < eighthPoints; number++) {
-    aVal = _mm256_load_ps(aPtr);
-    cVal = _mm256_rsqrt_ps(aVal);
-    _mm256_store_ps(cPtr, cVal);
-    aPtr += 8;
-    cPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++)
-    *cPtr++ = Q_rsqrt(*aPtr++);
-
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    __m256 aVal, cVal;
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_load_ps(aPtr);
+        cVal = _mm256_rsqrt_ps(aVal);
+        _mm256_store_ps(cPtr, cVal);
+        aPtr += 8;
+        cPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++)
+        *cPtr++ = Q_rsqrt(*aPtr++);
  }
  #endif /* LV_HAVE_AVX */
  
@@ -123,29 +122,29 @@ volk_32f_invsqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int nu
  static inline void
  volk_32f_invsqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
  
-  __m128 aVal, cVal;
-  for(;number < quarterPoints; number++){
+    __m128 aVal, cVal;
+    for (; number < quarterPoints; number++) {
  
-    aVal = _mm_load_ps(aPtr);
+        aVal = _mm_load_ps(aPtr);
  
-    cVal = _mm_rsqrt_ps(aVal);
+        cVal = _mm_rsqrt_ps(aVal);
  
-    _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+        _mm_store_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 4;
-    cPtr += 4;
-  }
+        aPtr += 4;
+        cPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(;number < num_points; number++) {
-    *cPtr++ = Q_rsqrt(*aPtr++);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *cPtr++ = Q_rsqrt(*aPtr++);
+    }
  }
  #endif /* LV_HAVE_SSE */
  
@@ -156,37 +155,38 @@ volk_32f_invsqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int nu
  static inline void
  volk_32f_invsqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points)
  {
-  unsigned int number;
-  const unsigned int quarter_points = num_points / 4;
-
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  float32x4_t a_val, c_val;
-  for (number = 0; number < quarter_points; ++number) {
-    a_val = vld1q_f32(aPtr);
-    c_val = vrsqrteq_f32(a_val);
-    vst1q_f32(cPtr, c_val);
-    aPtr += 4;
-    cPtr += 4;
-  }
-
-  for(number=quarter_points * 4;number < num_points; number++)
-    *cPtr++ = Q_rsqrt(*aPtr++);
+    unsigned int number;
+    const unsigned int quarter_points = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    float32x4_t a_val, c_val;
+    for (number = 0; number < quarter_points; ++number) {
+        a_val = vld1q_f32(aPtr);
+        c_val = vrsqrteq_f32(a_val);
+        vst1q_f32(cPtr, c_val);
+        aPtr += 4;
+        cPtr += 4;
+    }
+
+    for (number = quarter_points * 4; number < num_points; number++)
+        *cPtr++ = Q_rsqrt(*aPtr++);
  }
  #endif /* LV_HAVE_NEON */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32f_invsqrt_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_invsqrt_32f_generic(float* cVector,
+                                                const float* aVector,
+                                                unsigned int num_points)
  {
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  unsigned int number = 0;
-  for(number = 0; number < num_points; number++) {
-    *cPtr++ = Q_rsqrt(*aPtr++);
-  }
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
+    for (number = 0; number < num_points; number++) {
+        *cPtr++ = Q_rsqrt(*aPtr++);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -196,24 +196,23 @@ volk_32f_invsqrt_32f_generic(float* cVector, const float* aVector, unsigned int
  static inline void
  volk_32f_invsqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  __m256 aVal, cVal;
-  for (; number < eighthPoints; number++) {
-    aVal = _mm256_loadu_ps(aPtr);
-    cVal = _mm256_rsqrt_ps(aVal);
-    _mm256_storeu_ps(cPtr, cVal);
-    aPtr += 8;
-    cPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++)
-    *cPtr++ = Q_rsqrt(*aPtr++);
-
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    __m256 aVal, cVal;
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_loadu_ps(aPtr);
+        cVal = _mm256_rsqrt_ps(aVal);
+        _mm256_storeu_ps(cPtr, cVal);
+        aPtr += 8;
+        cPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++)
+        *cPtr++ = Q_rsqrt(*aPtr++);
  }
  #endif /* LV_HAVE_AVX */
  
diff --git a/kernels/volk/volk_32f_log2_32f.h b/kernels/volk/volk_32f_log2_32f.h

index 740f89d8c27db4c40e8ebfccba5cc2be3ed6c457..47276d40a191ce40537599ed5376f63893f161d9 100644 (file)
--- a/kernels/volk/volk_32f_log2_32f.h
+++ b/kernels/volk/volk_32f_log2_32f.h
@@ -92,17 +92,18 @@
  #ifndef INCLUDED_volk_32f_log2_32f_a_H
  #define INCLUDED_volk_32f_log2_32f_a_H
  
-#include <stdio.h>
-#include <stdlib.h>
  #include <inttypes.h>
  #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
  
  #define LOG_POLY_DEGREE 6
  
  // +-Inf -> +-127.0f in order to match the behaviour of the SIMD kernels
-static inline float log2f_non_ieee(float f) {
-  float const result = log2f(f);
-  return isinf(result) ? copysignf(127.0f, result) : result;
+static inline float log2f_non_ieee(float f)
+{
+    float const result = log2f(f);
+    return isinf(result) ? copysignf(127.0f, result) : result;
  }
  
  #ifdef LV_HAVE_GENERIC
@@ -110,12 +111,12 @@ static inline float log2f_non_ieee(float f) {
  static inline void
  volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-  unsigned int number = 0;
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
  
-  for(number = 0; number < num_points; number++)
-    *bPtr++ = log2f_non_ieee(*aPtr++);
+    for (number = 0; number < num_points; number++)
+        *bPtr++ = log2f_non_ieee(*aPtr++);
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -123,56 +124,86 @@ volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num
  #include <immintrin.h>
  
  #define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
-#define POLY1_FMAAVX2(x, c0, c1) _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
-#define POLY2_FMAAVX2(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
-#define POLY3_FMAAVX2(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
-#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
-#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
-
-static inline void
-volk_32f_log2_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+#define POLY1_FMAAVX2(x, c0, c1) \
+    _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
+#define POLY2_FMAAVX2(x, c0, c1, c2) \
+    _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
+#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
+    _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
+#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
+    _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
+#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
+    _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
+
+static inline void volk_32f_log2_32f_a_avx2_fma(float* bVector,
+                                                const float* aVector,
+                                                unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
  
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  __m256 aVal, bVal, mantissa, frac, leadingOne;
-  __m256i bias, exp;
+    __m256 aVal, bVal, mantissa, frac, leadingOne;
+    __m256i bias, exp;
  
-  for(;number < eighthPoints; number++){
+    for (; number < eighthPoints; number++) {
  
-    aVal = _mm256_load_ps(aPtr);
-    bias = _mm256_set1_epi32(127);
-    leadingOne = _mm256_set1_ps(1.0f);
-    exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
-    bVal = _mm256_cvtepi32_ps(exp);
+        aVal = _mm256_load_ps(aPtr);
+        bias = _mm256_set1_epi32(127);
+        leadingOne = _mm256_set1_ps(1.0f);
+        exp = _mm256_sub_epi32(
+            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
+                                               _mm256_set1_epi32(0x7f800000)),
+                              23),
+            bias);
+        bVal = _mm256_cvtepi32_ps(exp);
  
-    // Now to extract mantissa
-    frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
+        // Now to extract mantissa
+        frac = _mm256_or_ps(
+            leadingOne,
+            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
  
  #if LOG_POLY_DEGREE == 6
-    mantissa = POLY5_FMAAVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f,  3.1821337e-1f, -3.4436006e-2f);
+        mantissa = POLY5_FMAAVX2(frac,
+                                 3.1157899f,
+                                 -3.3241990f,
+                                 2.5988452f,
+                                 -1.2315303f,
+                                 3.1821337e-1f,
+                                 -3.4436006e-2f);
  #elif LOG_POLY_DEGREE == 5
-    mantissa = POLY4_FMAAVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+        mantissa = POLY4_FMAAVX2(frac,
+                                 2.8882704548164776201f,
+                                 -2.52074962577807006663f,
+                                 1.48116647521213171641f,
+                                 -0.465725644288844778798f,
+                                 0.0596515482674574969533f);
  #elif LOG_POLY_DEGREE == 4
-    mantissa = POLY3_FMAAVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+        mantissa = POLY3_FMAAVX2(frac,
+                                 2.61761038894603480148f,
+                                 -1.75647175389045657003f,
+                                 0.688243882994381274313f,
+                                 -0.107254423828329604454f);
  #elif LOG_POLY_DEGREE == 3
-    mantissa = POLY2_FMAAVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+        mantissa = POLY2_FMAAVX2(frac,
+                                 2.28330284476918490682f,
+                                 -1.04913055217340124191f,
+                                 0.204446009836232697516f);
  #else
  #error
  #endif
  
-    bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
-    _mm256_store_ps(bPtr, bVal);
+        bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
+        _mm256_store_ps(bPtr, bVal);
  
-    aPtr += 8;
-    bPtr += 8;
-  }
+        aPtr += 8;
+        bPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  volk_32f_log2_32f_generic(bPtr, aPtr, num_points-number);
+    number = eighthPoints * 8;
+    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
  }
  
  #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
@@ -181,56 +212,86 @@ volk_32f_log2_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int
  #include <immintrin.h>
  
  #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
-#define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
-#define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
-#define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
-#define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
-#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
+#define POLY1_AVX2(x, c0, c1) \
+    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
+#define POLY2_AVX2(x, c0, c1, c2) \
+    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
+#define POLY3_AVX2(x, c0, c1, c2, c3) \
+    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
+#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
+    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
+#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
+    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
  
  static inline void
  volk_32f_log2_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
  
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  __m256 aVal, bVal, mantissa, frac, leadingOne;
-  __m256i bias, exp;
+    __m256 aVal, bVal, mantissa, frac, leadingOne;
+    __m256i bias, exp;
  
-  for(;number < eighthPoints; number++){
+    for (; number < eighthPoints; number++) {
  
-    aVal = _mm256_load_ps(aPtr);
-    bias = _mm256_set1_epi32(127);
-    leadingOne = _mm256_set1_ps(1.0f);
-    exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
-    bVal = _mm256_cvtepi32_ps(exp);
+        aVal = _mm256_load_ps(aPtr);
+        bias = _mm256_set1_epi32(127);
+        leadingOne = _mm256_set1_ps(1.0f);
+        exp = _mm256_sub_epi32(
+            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
+                                               _mm256_set1_epi32(0x7f800000)),
+                              23),
+            bias);
+        bVal = _mm256_cvtepi32_ps(exp);
  
-    // Now to extract mantissa
-    frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
+        // Now to extract mantissa
+        frac = _mm256_or_ps(
+            leadingOne,
+            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
  
  #if LOG_POLY_DEGREE == 6
-    mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f,  3.1821337e-1f, -3.4436006e-2f);
+        mantissa = POLY5_AVX2(frac,
+                              3.1157899f,
+                              -3.3241990f,
+                              2.5988452f,
+                              -1.2315303f,
+                              3.1821337e-1f,
+                              -3.4436006e-2f);
  #elif LOG_POLY_DEGREE == 5
-    mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+        mantissa = POLY4_AVX2(frac,
+                              2.8882704548164776201f,
+                              -2.52074962577807006663f,
+                              1.48116647521213171641f,
+                              -0.465725644288844778798f,
+                              0.0596515482674574969533f);
  #elif LOG_POLY_DEGREE == 4
-    mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+        mantissa = POLY3_AVX2(frac,
+                              2.61761038894603480148f,
+                              -1.75647175389045657003f,
+                              0.688243882994381274313f,
+                              -0.107254423828329604454f);
  #elif LOG_POLY_DEGREE == 3
-    mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+        mantissa = POLY2_AVX2(frac,
+                              2.28330284476918490682f,
+                              -1.04913055217340124191f,
+                              0.204446009836232697516f);
  #else
  #error
  #endif
  
-    bVal = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
-    _mm256_store_ps(bPtr, bVal);
+        bVal =
+            _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
+        _mm256_store_ps(bPtr, bVal);
  
-    aPtr += 8;
-    bPtr += 8;
-  }
+        aPtr += 8;
+        bPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  volk_32f_log2_32f_generic(bPtr, aPtr, num_points-number);
+    number = eighthPoints * 8;
+    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
  }
  
  #endif /* LV_HAVE_AVX2 for aligned */
@@ -241,54 +302,79 @@ volk_32f_log2_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_
  #define POLY0(x, c0) _mm_set1_ps(c0)
  #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
  #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
-#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
-#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
-#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
+#define POLY3(x, c0, c1, c2, c3) \
+    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
+#define POLY4(x, c0, c1, c2, c3, c4) \
+    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) \
+    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
  
  static inline void
  volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
  
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  __m128 aVal, bVal, mantissa, frac, leadingOne;
-  __m128i bias, exp;
+    __m128 aVal, bVal, mantissa, frac, leadingOne;
+    __m128i bias, exp;
  
-  for(;number < quarterPoints; number++){
+    for (; number < quarterPoints; number++) {
  
-    aVal = _mm_load_ps(aPtr);
-    bias = _mm_set1_epi32(127);
-    leadingOne = _mm_set1_ps(1.0f);
-    exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
-    bVal = _mm_cvtepi32_ps(exp);
+        aVal = _mm_load_ps(aPtr);
+        bias = _mm_set1_epi32(127);
+        leadingOne = _mm_set1_ps(1.0f);
+        exp = _mm_sub_epi32(
+            _mm_srli_epi32(
+                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
+            bias);
+        bVal = _mm_cvtepi32_ps(exp);
  
-    // Now to extract mantissa
-    frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
+        // Now to extract mantissa
+        frac = _mm_or_ps(leadingOne,
+                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
  
  #if LOG_POLY_DEGREE == 6
-    mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f,  3.1821337e-1f, -3.4436006e-2f);
+        mantissa = POLY5(frac,
+                         3.1157899f,
+                         -3.3241990f,
+                         2.5988452f,
+                         -1.2315303f,
+                         3.1821337e-1f,
+                         -3.4436006e-2f);
  #elif LOG_POLY_DEGREE == 5
-    mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+        mantissa = POLY4(frac,
+                         2.8882704548164776201f,
+                         -2.52074962577807006663f,
+                         1.48116647521213171641f,
+                         -0.465725644288844778798f,
+                         0.0596515482674574969533f);
  #elif LOG_POLY_DEGREE == 4
-    mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+        mantissa = POLY3(frac,
+                         2.61761038894603480148f,
+                         -1.75647175389045657003f,
+                         0.688243882994381274313f,
+                         -0.107254423828329604454f);
  #elif LOG_POLY_DEGREE == 3
-    mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+        mantissa = POLY2(frac,
+                         2.28330284476918490682f,
+                         -1.04913055217340124191f,
+                         0.204446009836232697516f);
  #else
  #error
  #endif
  
-    bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
-    _mm_store_ps(bPtr, bVal);
+        bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
+        _mm_store_ps(bPtr, bVal);
  
-    aPtr += 4;
-    bPtr += 4;
-  }
+        aPtr += 4;
+        bPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  volk_32f_log2_32f_generic(bPtr, aPtr, num_points-number);
+    number = quarterPoints * 4;
+    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
  }
  
  #endif /* LV_HAVE_SSE4_1 for aligned */
@@ -297,91 +383,91 @@ volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int nu
  #include <arm_neon.h>
  
  /* these macros allow us to embed logs in other kernels */
-#define VLOG2Q_NEON_PREAMBLE()                                  \
-  int32x4_t one = vdupq_n_s32(0x000800000);                     \
-  /* minimax polynomial */                                      \
-  float32x4_t p0 = vdupq_n_f32(-3.0400402727048585);            \
-  float32x4_t p1 = vdupq_n_f32(6.1129631282966113);             \
-  float32x4_t p2 = vdupq_n_f32(-5.3419892024633207);            \
-  float32x4_t p3 = vdupq_n_f32(3.2865287703753912);             \
-  float32x4_t p4 = vdupq_n_f32(-1.2669182593441635);            \
-  float32x4_t p5 = vdupq_n_f32(0.2751487703421256);             \
-  float32x4_t p6 = vdupq_n_f32(-0.0256910888150985);            \
-  int32x4_t exp_mask = vdupq_n_s32(0x7f800000);                 \
-  int32x4_t sig_mask = vdupq_n_s32(0x007fffff);                 \
-  int32x4_t exp_bias = vdupq_n_s32(127);
-
-
-#define VLOG2Q_NEON_F32(log2_approx, aval)                              \
-  int32x4_t exponent_i = vandq_s32(aval, exp_mask);                     \
-  int32x4_t significand_i = vandq_s32(aval, sig_mask);                  \
-  exponent_i = vshrq_n_s32(exponent_i, 23);                             \
-                                                                        \
-  /* extract the exponent and significand                               \
-     we can treat this as fixed point to save ~9% on the                \
-     conversion + float add */                                          \
-  significand_i = vorrq_s32(one, significand_i);                        \
-  float32x4_t significand_f = vcvtq_n_f32_s32(significand_i,23);        \
-  /* debias the exponent and convert to float */                        \
-  exponent_i = vsubq_s32(exponent_i, exp_bias);                         \
-  float32x4_t exponent_f = vcvtq_f32_s32(exponent_i);                   \
-                                                                        \
-  /* put the significand through a polynomial fit of log2(x) [1,2]      \
-     add the result to the exponent */                                  \
-  log2_approx = vaddq_f32(exponent_f, p0); /* p0 */                     \
-  float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */         \
-  log2_approx = vaddq_f32(log2_approx, tmp1);                           \
-  float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */ \
-  tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */                           \
-  log2_approx = vaddq_f32(log2_approx, tmp1);                           \
-                                                                        \
-  float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */        \
-  tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */                           \
-  log2_approx = vaddq_f32(log2_approx, tmp1);                           \
-  float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */                \
-  tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */                           \
-  log2_approx = vaddq_f32(log2_approx, tmp1);                           \
-  float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */                \
-  tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */                           \
-  log2_approx = vaddq_f32(log2_approx, tmp1);                           \
-  float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */                \
-  tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */                           \
-  log2_approx = vaddq_f32(log2_approx, tmp1);
+#define VLOG2Q_NEON_PREAMBLE()                         \
+    int32x4_t one = vdupq_n_s32(0x000800000);          \
+    /* minimax polynomial */                           \
+    float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \
+    float32x4_t p1 = vdupq_n_f32(6.1129631282966113);  \
+    float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \
+    float32x4_t p3 = vdupq_n_f32(3.2865287703753912);  \
+    float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \
+    float32x4_t p5 = vdupq_n_f32(0.2751487703421256);  \
+    float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \
+    int32x4_t exp_mask = vdupq_n_s32(0x7f800000);      \
+    int32x4_t sig_mask = vdupq_n_s32(0x007fffff);      \
+    int32x4_t exp_bias = vdupq_n_s32(127);
+
+
+#define VLOG2Q_NEON_F32(log2_approx, aval)                                      \
+    int32x4_t exponent_i = vandq_s32(aval, exp_mask);                           \
+    int32x4_t significand_i = vandq_s32(aval, sig_mask);                        \
+    exponent_i = vshrq_n_s32(exponent_i, 23);                                   \
+                                                                                \
+    /* extract the exponent and significand                                     \
+       we can treat this as fixed point to save ~9% on the                      \
+       conversion + float add */                                                \
+    significand_i = vorrq_s32(one, significand_i);                              \
+    float32x4_t significand_f = vcvtq_n_f32_s32(significand_i, 23);             \
+    /* debias the exponent and convert to float */                              \
+    exponent_i = vsubq_s32(exponent_i, exp_bias);                               \
+    float32x4_t exponent_f = vcvtq_f32_s32(exponent_i);                         \
+                                                                                \
+    /* put the significand through a polynomial fit of log2(x) [1,2]            \
+       add the result to the exponent */                                        \
+    log2_approx = vaddq_f32(exponent_f, p0);         /* p0 */                   \
+    float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */               \
+    log2_approx = vaddq_f32(log2_approx, tmp1);                                 \
+    float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */      \
+    tmp1 = vmulq_f32(sig_2, p2);                                 /* p2 * x^2 */ \
+    log2_approx = vaddq_f32(log2_approx, tmp1);                                 \
+                                                                                \
+    float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */              \
+    tmp1 = vmulq_f32(sig_3, p3);                         /* p3 * x^3 */         \
+    log2_approx = vaddq_f32(log2_approx, tmp1);                                 \
+    float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */                      \
+    tmp1 = vmulq_f32(sig_4, p4);                 /* p4 * x^4 */                 \
+    log2_approx = vaddq_f32(log2_approx, tmp1);                                 \
+    float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */                      \
+    tmp1 = vmulq_f32(sig_5, p5);                 /* p5 * x^5 */                 \
+    log2_approx = vaddq_f32(log2_approx, tmp1);                                 \
+    float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */                      \
+    tmp1 = vmulq_f32(sig_6, p6);                 /* p6 * x^6 */                 \
+    log2_approx = vaddq_f32(log2_approx, tmp1);
  
  static inline void
  volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-  unsigned int number;
-  const unsigned int quarterPoints = num_points / 4;
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+    unsigned int number;
+    const unsigned int quarterPoints = num_points / 4;
  
-  int32x4_t aval;
-  float32x4_t log2_approx;
+    int32x4_t aval;
+    float32x4_t log2_approx;
  
-  VLOG2Q_NEON_PREAMBLE()
-  // lms
-  //p0 = vdupq_n_f32(-1.649132280361871);
-  //p1 = vdupq_n_f32(1.995047138579499);
-  //p2 = vdupq_n_f32(-0.336914839219728);
+    VLOG2Q_NEON_PREAMBLE()
+    // lms
+    // p0 = vdupq_n_f32(-1.649132280361871);
+    // p1 = vdupq_n_f32(1.995047138579499);
+    // p2 = vdupq_n_f32(-0.336914839219728);
  
-  // keep in mind a single precision float is represented as
-  //   (-1)^sign * 2^exp * 1.significand, so the log2 is
-  // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23)
-  for(number = 0; number < quarterPoints; ++number){
-    // load float in to an int register without conversion
-    aval = vld1q_s32((int*)aPtr);
+    // keep in mind a single precision float is represented as
+    //   (-1)^sign * 2^exp * 1.significand, so the log2 is
+    // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23)
+    for (number = 0; number < quarterPoints; ++number) {
+        // load float in to an int register without conversion
+        aval = vld1q_s32((int*)aPtr);
  
-    VLOG2Q_NEON_F32(log2_approx, aval)
+        VLOG2Q_NEON_F32(log2_approx, aval)
  
-      vst1q_f32(bPtr, log2_approx);
+        vst1q_f32(bPtr, log2_approx);
  
-    aPtr += 4;
-    bPtr += 4;
-  }
+        aPtr += 4;
+        bPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  volk_32f_log2_32f_generic(bPtr, aPtr, num_points-number);
+    number = quarterPoints * 4;
+    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
  }
  
  #endif /* LV_HAVE_NEON */
@@ -398,14 +484,14 @@ volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_po
  static inline void
  volk_32f_log2_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    float const result = log2f(*aPtr++);
-    *bPtr++ = isinf(result) ? -127.0f : result;
-  }
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        float const result = log2f(*aPtr++);
+        *bPtr++ = isinf(result) ? -127.0f : result;
+    }
  }
  
  #endif /* LV_HAVE_GENERIC */
@@ -417,54 +503,79 @@ volk_32f_log2_32f_u_generic(float* bVector, const float* aVector, unsigned int n
  #define POLY0(x, c0) _mm_set1_ps(c0)
  #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
  #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
-#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
-#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
-#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
+#define POLY3(x, c0, c1, c2, c3) \
+    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
+#define POLY4(x, c0, c1, c2, c3, c4) \
+    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) \
+    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
  
  static inline void
  volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
  
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  __m128 aVal, bVal, mantissa, frac, leadingOne;
-  __m128i bias, exp;
+    __m128 aVal, bVal, mantissa, frac, leadingOne;
+    __m128i bias, exp;
  
-  for(;number < quarterPoints; number++){
+    for (; number < quarterPoints; number++) {
  
-    aVal = _mm_loadu_ps(aPtr);
-    bias = _mm_set1_epi32(127);
-    leadingOne = _mm_set1_ps(1.0f);
-    exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
-    bVal = _mm_cvtepi32_ps(exp);
+        aVal = _mm_loadu_ps(aPtr);
+        bias = _mm_set1_epi32(127);
+        leadingOne = _mm_set1_ps(1.0f);
+        exp = _mm_sub_epi32(
+            _mm_srli_epi32(
+                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
+            bias);
+        bVal = _mm_cvtepi32_ps(exp);
  
-    // Now to extract mantissa
-    frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
+        // Now to extract mantissa
+        frac = _mm_or_ps(leadingOne,
+                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
  
  #if LOG_POLY_DEGREE == 6
-    mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f,  3.1821337e-1f, -3.4436006e-2f);
+        mantissa = POLY5(frac,
+                         3.1157899f,
+                         -3.3241990f,
+                         2.5988452f,
+                         -1.2315303f,
+                         3.1821337e-1f,
+                         -3.4436006e-2f);
  #elif LOG_POLY_DEGREE == 5
-    mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+        mantissa = POLY4(frac,
+                         2.8882704548164776201f,
+                         -2.52074962577807006663f,
+                         1.48116647521213171641f,
+                         -0.465725644288844778798f,
+                         0.0596515482674574969533f);
  #elif LOG_POLY_DEGREE == 4
-    mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+        mantissa = POLY3(frac,
+                         2.61761038894603480148f,
+                         -1.75647175389045657003f,
+                         0.688243882994381274313f,
+                         -0.107254423828329604454f);
  #elif LOG_POLY_DEGREE == 3
-    mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+        mantissa = POLY2(frac,
+                         2.28330284476918490682f,
+                         -1.04913055217340124191f,
+                         0.204446009836232697516f);
  #else
  #error
  #endif
  
-    bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
-    _mm_storeu_ps(bPtr, bVal);
+        bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
+        _mm_storeu_ps(bPtr, bVal);
  
-    aPtr += 4;
-    bPtr += 4;
-  }
+        aPtr += 4;
+        bPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points-number);
+    number = quarterPoints * 4;
+    volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
  }
  
  #endif /* LV_HAVE_SSE4_1 for unaligned */
@@ -473,56 +584,86 @@ volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int nu
  #include <immintrin.h>
  
  #define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
-#define POLY1_FMAAVX2(x, c0, c1) _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
-#define POLY2_FMAAVX2(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
-#define POLY3_FMAAVX2(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
-#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
-#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
-
-static inline void
-volk_32f_log2_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+#define POLY1_FMAAVX2(x, c0, c1) \
+    _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
+#define POLY2_FMAAVX2(x, c0, c1, c2) \
+    _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
+#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
+    _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
+#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
+    _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
+#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
+    _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
+
+static inline void volk_32f_log2_32f_u_avx2_fma(float* bVector,
+                                                const float* aVector,
+                                                unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
  
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  __m256 aVal, bVal, mantissa, frac, leadingOne;
-  __m256i bias, exp;
+    __m256 aVal, bVal, mantissa, frac, leadingOne;
+    __m256i bias, exp;
  
-  for(;number < eighthPoints; number++){
+    for (; number < eighthPoints; number++) {
  
-    aVal = _mm256_loadu_ps(aPtr);
-    bias = _mm256_set1_epi32(127);
-    leadingOne = _mm256_set1_ps(1.0f);
-    exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
-    bVal = _mm256_cvtepi32_ps(exp);
+        aVal = _mm256_loadu_ps(aPtr);
+        bias = _mm256_set1_epi32(127);
+        leadingOne = _mm256_set1_ps(1.0f);
+        exp = _mm256_sub_epi32(
+            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
+                                               _mm256_set1_epi32(0x7f800000)),
+                              23),
+            bias);
+        bVal = _mm256_cvtepi32_ps(exp);
  
-    // Now to extract mantissa
-    frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
+        // Now to extract mantissa
+        frac = _mm256_or_ps(
+            leadingOne,
+            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
  
  #if LOG_POLY_DEGREE == 6
-    mantissa = POLY5_FMAAVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f,  3.1821337e-1f, -3.4436006e-2f);
+        mantissa = POLY5_FMAAVX2(frac,
+                                 3.1157899f,
+                                 -3.3241990f,
+                                 2.5988452f,
+                                 -1.2315303f,
+                                 3.1821337e-1f,
+                                 -3.4436006e-2f);
  #elif LOG_POLY_DEGREE == 5
-    mantissa = POLY4_FMAAVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+        mantissa = POLY4_FMAAVX2(frac,
+                                 2.8882704548164776201f,
+                                 -2.52074962577807006663f,
+                                 1.48116647521213171641f,
+                                 -0.465725644288844778798f,
+                                 0.0596515482674574969533f);
  #elif LOG_POLY_DEGREE == 4
-    mantissa = POLY3_FMAAVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+        mantissa = POLY3_FMAAVX2(frac,
+                                 2.61761038894603480148f,
+                                 -1.75647175389045657003f,
+                                 0.688243882994381274313f,
+                                 -0.107254423828329604454f);
  #elif LOG_POLY_DEGREE == 3
-    mantissa = POLY2_FMAAVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+        mantissa = POLY2_FMAAVX2(frac,
+                                 2.28330284476918490682f,
+                                 -1.04913055217340124191f,
+                                 0.204446009836232697516f);
  #else
  #error
  #endif
  
-    bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
-    _mm256_storeu_ps(bPtr, bVal);
+        bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
+        _mm256_storeu_ps(bPtr, bVal);
  
-    aPtr += 8;
-    bPtr += 8;
-  }
+        aPtr += 8;
+        bPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points-number);
+    number = eighthPoints * 8;
+    volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
  }
  
  #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
@@ -531,56 +672,86 @@ volk_32f_log2_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int
  #include <immintrin.h>
  
  #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
-#define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
-#define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
-#define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
-#define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
-#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
+#define POLY1_AVX2(x, c0, c1) \
+    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
+#define POLY2_AVX2(x, c0, c1, c2) \
+    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
+#define POLY3_AVX2(x, c0, c1, c2, c3) \
+    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
+#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
+    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
+#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
+    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
  
  static inline void
  volk_32f_log2_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
  
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  __m256 aVal, bVal, mantissa, frac, leadingOne;
-  __m256i bias, exp;
+    __m256 aVal, bVal, mantissa, frac, leadingOne;
+    __m256i bias, exp;
  
-  for(;number < eighthPoints; number++){
+    for (; number < eighthPoints; number++) {
  
-    aVal = _mm256_loadu_ps(aPtr);
-    bias = _mm256_set1_epi32(127);
-    leadingOne = _mm256_set1_ps(1.0f);
-    exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
-    bVal = _mm256_cvtepi32_ps(exp);
+        aVal = _mm256_loadu_ps(aPtr);
+        bias = _mm256_set1_epi32(127);
+        leadingOne = _mm256_set1_ps(1.0f);
+        exp = _mm256_sub_epi32(
+            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
+                                               _mm256_set1_epi32(0x7f800000)),
+                              23),
+            bias);
+        bVal = _mm256_cvtepi32_ps(exp);
  
-    // Now to extract mantissa
-    frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
+        // Now to extract mantissa
+        frac = _mm256_or_ps(
+            leadingOne,
+            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
  
  #if LOG_POLY_DEGREE == 6
-    mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f,  3.1821337e-1f, -3.4436006e-2f);
+        mantissa = POLY5_AVX2(frac,
+                              3.1157899f,
+                              -3.3241990f,
+                              2.5988452f,
+                              -1.2315303f,
+                              3.1821337e-1f,
+                              -3.4436006e-2f);
  #elif LOG_POLY_DEGREE == 5
-    mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+        mantissa = POLY4_AVX2(frac,
+                              2.8882704548164776201f,
+                              -2.52074962577807006663f,
+                              1.48116647521213171641f,
+                              -0.465725644288844778798f,
+                              0.0596515482674574969533f);
  #elif LOG_POLY_DEGREE == 4
-    mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+        mantissa = POLY3_AVX2(frac,
+                              2.61761038894603480148f,
+                              -1.75647175389045657003f,
+                              0.688243882994381274313f,
+                              -0.107254423828329604454f);
  #elif LOG_POLY_DEGREE == 3
-    mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+        mantissa = POLY2_AVX2(frac,
+                              2.28330284476918490682f,
+                              -1.04913055217340124191f,
+                              0.204446009836232697516f);
  #else
  #error
  #endif
  
-    bVal = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
-    _mm256_storeu_ps(bPtr, bVal);
+        bVal =
+            _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
+        _mm256_storeu_ps(bPtr, bVal);
  
-    aPtr += 8;
-    bPtr += 8;
-  }
+        aPtr += 8;
+        bPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points-number);
+    number = eighthPoints * 8;
+    volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
  }
  
  #endif /* LV_HAVE_AVX2 for unaligned */
diff --git a/kernels/volk/volk_32f_null_32f.h b/kernels/volk/volk_32f_null_32f.h

index 95e8d1ad4978e130b7b026c2b6a3ceca15163d79..cbed229c0ada57630ffdc6650a0fb8670986861a 100644 (file)
--- a/kernels/volk/volk_32f_null_32f.h
+++ b/kernels/volk/volk_32f_null_32f.h
@@ -20,9 +20,9 @@
   * Boston, MA 02110-1301, USA.
   */
  
-#include <stdio.h>
-#include <math.h>
  #include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
  
  #ifndef INCLUDED_volk_32f_null_32f_a_H
  #define INCLUDED_volk_32f_null_32f_a_H
@@ -32,13 +32,13 @@
  static inline void
  volk_32f_null_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-  unsigned int number;
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+    unsigned int number;
  
-  for(number = 0; number < num_points; number++){
-    *bPtr++ = *aPtr++;
-  }
+    for (number = 0; number < num_points; number++) {
+        *bPtr++ = *aPtr++;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
diff --git a/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h b/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h

index 987995998891a9f15258e9efdbee7f8f9a38796a..3bf7aea095474f9df2c30c37c6cadcb5d63a2e8a 100644 (file)
--- a/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h
+++ b/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h
@@ -30,14 +30,15 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32f_s32f_32f_fm_detect_32f(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points)
- * \endcode
+ * void volk_32f_s32f_32f_fm_detect_32f(float* outputVector, const float* inputVector,
+ * const float bound, float* saveValue, unsigned int num_points) \endcode
   *
   * \b Inputs
- * \li inputVector: The input vector containing phase data (must be on the interval (-bound, bound]).
- * \li bound: The interval that the input phase data is in, which is used to modulo the differentiation.
- * \li saveValue: A pointer to a float which contains the phase value of the sample before the first input sample.
- * \li num_points The number of data points.
+ * \li inputVector: The input vector containing phase data (must be on the interval
+ * (-bound, bound]). \li bound: The interval that the input phase data is in, which is
+ * used to modulo the differentiation. \li saveValue: A pointer to a float which contains
+ * the phase value of the sample before the first input sample. \li num_points The number
+ * of data points.
   *
   * \b Outputs
   * \li outputVector: The vector where the results will be stored.
@@ -62,67 +63,79 @@
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void volk_32f_s32f_32f_fm_detect_32f_a_avx(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
-  if (num_points < 1) {
-    return;
-  }
-  unsigned int number = 1;
-  unsigned int j = 0;
-  // num_points-1 keeps Fedora 7's gcc from crashing...
-  // num_points won't work.  :(
-  const unsigned int eighthPoints = (num_points-1) / 8;
-
-  float* outPtr = outputVector;
-  const float* inPtr = inputVector;
-  __m256 upperBound = _mm256_set1_ps(bound);
-  __m256 lowerBound = _mm256_set1_ps(-bound);
-  __m256 next3old1;
-  __m256 next4;
-  __m256 boundAdjust;
-  __m256 posBoundAdjust = _mm256_set1_ps(-2*bound); // Subtract when we're above.
-  __m256 negBoundAdjust = _mm256_set1_ps(2*bound); // Add when we're below.
-  // Do the first 8 by hand since we're going in from the saveValue:
-  *outPtr = *inPtr - *saveValue;
-  if (*outPtr >  bound) *outPtr -= 2*bound;
-  if (*outPtr < -bound) *outPtr += 2*bound;
-  inPtr++;
-  outPtr++;
-  for (j = 1; j < ( (8 < num_points) ? 8 : num_points); j++) {
-    *outPtr = *(inPtr) - *(inPtr-1);
-    if (*outPtr >  bound) *outPtr -= 2*bound;
-    if (*outPtr < -bound) *outPtr += 2*bound;
-    inPtr++;
-    outPtr++;
-  }
-
-  for (; number < eighthPoints; number++) {
-    // Load data
-    next3old1 = _mm256_loadu_ps((float*) (inPtr-1));
-    next4 = _mm256_load_ps(inPtr);
-    inPtr += 8;
-    // Subtract and store:
-    next3old1 = _mm256_sub_ps(next4, next3old1);
-    // Bound:
-    boundAdjust = _mm256_cmp_ps(next3old1, upperBound, _CMP_GT_OS);
-    boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust);
-    next4 = _mm256_cmp_ps(next3old1, lowerBound, _CMP_LT_OS);
-    next4 = _mm256_and_ps(next4, negBoundAdjust);
-    boundAdjust = _mm256_or_ps(next4, boundAdjust);
-    // Make sure we're in the bounding interval:
-    next3old1 = _mm256_add_ps(next3old1, boundAdjust);
-    _mm256_store_ps(outPtr,next3old1); // Store the results back into the output
-    outPtr += 8;
-  }
-
-  for (number = (8 > (eighthPoints*8) ? 8 : (8 * eighthPoints)); number < num_points; number++) {
-    *outPtr = *(inPtr) - *(inPtr-1);
-    if (*outPtr >  bound) *outPtr -= 2*bound;
-    if (*outPtr < -bound) *outPtr += 2*bound;
+static inline void volk_32f_s32f_32f_fm_detect_32f_a_avx(float* outputVector,
+                                                         const float* inputVector,
+                                                         const float bound,
+                                                         float* saveValue,
+                                                         unsigned int num_points)
+{
+    if (num_points < 1) {
+        return;
+    }
+    unsigned int number = 1;
+    unsigned int j = 0;
+    // num_points-1 keeps Fedora 7's gcc from crashing...
+    // num_points won't work.  :(
+    const unsigned int eighthPoints = (num_points - 1) / 8;
+
+    float* outPtr = outputVector;
+    const float* inPtr = inputVector;
+    __m256 upperBound = _mm256_set1_ps(bound);
+    __m256 lowerBound = _mm256_set1_ps(-bound);
+    __m256 next3old1;
+    __m256 next4;
+    __m256 boundAdjust;
+    __m256 posBoundAdjust = _mm256_set1_ps(-2 * bound); // Subtract when we're above.
+    __m256 negBoundAdjust = _mm256_set1_ps(2 * bound);  // Add when we're below.
+    // Do the first 8 by hand since we're going in from the saveValue:
+    *outPtr = *inPtr - *saveValue;
+    if (*outPtr > bound)
+        *outPtr -= 2 * bound;
+    if (*outPtr < -bound)
+        *outPtr += 2 * bound;
      inPtr++;
      outPtr++;
-  }
-
-  *saveValue = inputVector[num_points-1];
+    for (j = 1; j < ((8 < num_points) ? 8 : num_points); j++) {
+        *outPtr = *(inPtr) - *(inPtr - 1);
+        if (*outPtr > bound)
+            *outPtr -= 2 * bound;
+        if (*outPtr < -bound)
+            *outPtr += 2 * bound;
+        inPtr++;
+        outPtr++;
+    }
+
+    for (; number < eighthPoints; number++) {
+        // Load data
+        next3old1 = _mm256_loadu_ps((float*)(inPtr - 1));
+        next4 = _mm256_load_ps(inPtr);
+        inPtr += 8;
+        // Subtract and store:
+        next3old1 = _mm256_sub_ps(next4, next3old1);
+        // Bound:
+        boundAdjust = _mm256_cmp_ps(next3old1, upperBound, _CMP_GT_OS);
+        boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust);
+        next4 = _mm256_cmp_ps(next3old1, lowerBound, _CMP_LT_OS);
+        next4 = _mm256_and_ps(next4, negBoundAdjust);
+        boundAdjust = _mm256_or_ps(next4, boundAdjust);
+        // Make sure we're in the bounding interval:
+        next3old1 = _mm256_add_ps(next3old1, boundAdjust);
+        _mm256_store_ps(outPtr, next3old1); // Store the results back into the output
+        outPtr += 8;
+    }
+
+    for (number = (8 > (eighthPoints * 8) ? 8 : (8 * eighthPoints)); number < num_points;
+         number++) {
+        *outPtr = *(inPtr) - *(inPtr - 1);
+        if (*outPtr > bound)
+            *outPtr -= 2 * bound;
+        if (*outPtr < -bound)
+            *outPtr += 2 * bound;
+        inPtr++;
+        outPtr++;
+    }
+
+    *saveValue = inputVector[num_points - 1];
  }
  #endif /* LV_HAVE_AVX */
  
@@ -130,102 +143,122 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_a_avx(float* outputVector, co
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
-  if (num_points < 1) {
-    return;
-  }
-  unsigned int number = 1;
-  unsigned int j = 0;
-  // num_points-1 keeps Fedora 7's gcc from crashing...
-  // num_points won't work.  :(
-  const unsigned int quarterPoints = (num_points-1) / 4;
-
-  float* outPtr = outputVector;
-  const float* inPtr = inputVector;
-  __m128 upperBound = _mm_set_ps1(bound);
-  __m128 lowerBound = _mm_set_ps1(-bound);
-  __m128 next3old1;
-  __m128 next4;
-  __m128 boundAdjust;
-  __m128 posBoundAdjust = _mm_set_ps1(-2*bound); // Subtract when we're above.
-  __m128 negBoundAdjust = _mm_set_ps1(2*bound); // Add when we're below.
-  // Do the first 4 by hand since we're going in from the saveValue:
-  *outPtr = *inPtr - *saveValue;
-  if (*outPtr >  bound) *outPtr -= 2*bound;
-  if (*outPtr < -bound) *outPtr += 2*bound;
-  inPtr++;
-  outPtr++;
-  for (j = 1; j < ( (4 < num_points) ? 4 : num_points); j++) {
-    *outPtr = *(inPtr) - *(inPtr-1);
-    if (*outPtr >  bound) *outPtr -= 2*bound;
-    if (*outPtr < -bound) *outPtr += 2*bound;
-    inPtr++;
-    outPtr++;
-  }
-
-  for (; number < quarterPoints; number++) {
-    // Load data
-    next3old1 = _mm_loadu_ps((float*) (inPtr-1));
-    next4 = _mm_load_ps(inPtr);
-    inPtr += 4;
-    // Subtract and store:
-    next3old1 = _mm_sub_ps(next4, next3old1);
-    // Bound:
-    boundAdjust = _mm_cmpgt_ps(next3old1, upperBound);
-    boundAdjust = _mm_and_ps(boundAdjust, posBoundAdjust);
-    next4 = _mm_cmplt_ps(next3old1, lowerBound);
-    next4 = _mm_and_ps(next4, negBoundAdjust);
-    boundAdjust = _mm_or_ps(next4, boundAdjust);
-    // Make sure we're in the bounding interval:
-    next3old1 = _mm_add_ps(next3old1, boundAdjust);
-    _mm_store_ps(outPtr,next3old1); // Store the results back into the output
-    outPtr += 4;
-  }
-
-  for (number = (4 > (quarterPoints*4) ? 4 : (4 * quarterPoints)); number < num_points; number++) {
-    *outPtr = *(inPtr) - *(inPtr-1);
-    if (*outPtr >  bound) *outPtr -= 2*bound;
-    if (*outPtr < -bound) *outPtr += 2*bound;
+static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector,
+                                                         const float* inputVector,
+                                                         const float bound,
+                                                         float* saveValue,
+                                                         unsigned int num_points)
+{
+    if (num_points < 1) {
+        return;
+    }
+    unsigned int number = 1;
+    unsigned int j = 0;
+    // num_points-1 keeps Fedora 7's gcc from crashing...
+    // num_points won't work.  :(
+    const unsigned int quarterPoints = (num_points - 1) / 4;
+
+    float* outPtr = outputVector;
+    const float* inPtr = inputVector;
+    __m128 upperBound = _mm_set_ps1(bound);
+    __m128 lowerBound = _mm_set_ps1(-bound);
+    __m128 next3old1;
+    __m128 next4;
+    __m128 boundAdjust;
+    __m128 posBoundAdjust = _mm_set_ps1(-2 * bound); // Subtract when we're above.
+    __m128 negBoundAdjust = _mm_set_ps1(2 * bound);  // Add when we're below.
+    // Do the first 4 by hand since we're going in from the saveValue:
+    *outPtr = *inPtr - *saveValue;
+    if (*outPtr > bound)
+        *outPtr -= 2 * bound;
+    if (*outPtr < -bound)
+        *outPtr += 2 * bound;
      inPtr++;
      outPtr++;
-  }
-
-  *saveValue = inputVector[num_points-1];
+    for (j = 1; j < ((4 < num_points) ? 4 : num_points); j++) {
+        *outPtr = *(inPtr) - *(inPtr - 1);
+        if (*outPtr > bound)
+            *outPtr -= 2 * bound;
+        if (*outPtr < -bound)
+            *outPtr += 2 * bound;
+        inPtr++;
+        outPtr++;
+    }
+
+    for (; number < quarterPoints; number++) {
+        // Load data
+        next3old1 = _mm_loadu_ps((float*)(inPtr - 1));
+        next4 = _mm_load_ps(inPtr);
+        inPtr += 4;
+        // Subtract and store:
+        next3old1 = _mm_sub_ps(next4, next3old1);
+        // Bound:
+        boundAdjust = _mm_cmpgt_ps(next3old1, upperBound);
+        boundAdjust = _mm_and_ps(boundAdjust, posBoundAdjust);
+        next4 = _mm_cmplt_ps(next3old1, lowerBound);
+        next4 = _mm_and_ps(next4, negBoundAdjust);
+        boundAdjust = _mm_or_ps(next4, boundAdjust);
+        // Make sure we're in the bounding interval:
+        next3old1 = _mm_add_ps(next3old1, boundAdjust);
+        _mm_store_ps(outPtr, next3old1); // Store the results back into the output
+        outPtr += 4;
+    }
+
+    for (number = (4 > (quarterPoints * 4) ? 4 : (4 * quarterPoints));
+         number < num_points;
+         number++) {
+        *outPtr = *(inPtr) - *(inPtr - 1);
+        if (*outPtr > bound)
+            *outPtr -= 2 * bound;
+        if (*outPtr < -bound)
+            *outPtr += 2 * bound;
+        inPtr++;
+        outPtr++;
+    }
+
+    *saveValue = inputVector[num_points - 1];
  }
  #endif /* LV_HAVE_SSE */
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
-  if (num_points < 1) {
-    return;
-  }
-  unsigned int number = 0;
-  float* outPtr = outputVector;
-  const float* inPtr = inputVector;
-
-  // Do the first 1 by hand since we're going in from the saveValue:
-  *outPtr = *inPtr - *saveValue;
-  if (*outPtr >  bound) *outPtr -= 2*bound;
-  if (*outPtr < -bound) *outPtr += 2*bound;
-  inPtr++;
-  outPtr++;
-
-  for (number = 1; number < num_points; number++) {
-    *outPtr = *(inPtr) - *(inPtr-1);
-    if (*outPtr >  bound) *outPtr -= 2*bound;
-    if (*outPtr < -bound) *outPtr += 2*bound;
+static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector,
+                                                           const float* inputVector,
+                                                           const float bound,
+                                                           float* saveValue,
+                                                           unsigned int num_points)
+{
+    if (num_points < 1) {
+        return;
+    }
+    unsigned int number = 0;
+    float* outPtr = outputVector;
+    const float* inPtr = inputVector;
+
+    // Do the first 1 by hand since we're going in from the saveValue:
+    *outPtr = *inPtr - *saveValue;
+    if (*outPtr > bound)
+        *outPtr -= 2 * bound;
+    if (*outPtr < -bound)
+        *outPtr += 2 * bound;
      inPtr++;
      outPtr++;
-  }
  
-  *saveValue = inputVector[num_points-1];
+    for (number = 1; number < num_points; number++) {
+        *outPtr = *(inPtr) - *(inPtr - 1);
+        if (*outPtr > bound)
+            *outPtr -= 2 * bound;
+        if (*outPtr < -bound)
+            *outPtr += 2 * bound;
+        inPtr++;
+        outPtr++;
+    }
+
+    *saveValue = inputVector[num_points - 1];
  }
  #endif /* LV_HAVE_GENERIC */
  
  
-
-
  #endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H */
  
  
@@ -238,67 +271,79 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector,
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void volk_32f_s32f_32f_fm_detect_32f_u_avx(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
-  if (num_points < 1) {
-    return;
-  }
-  unsigned int number = 1;
-  unsigned int j = 0;
-  // num_points-1 keeps Fedora 7's gcc from crashing...
-  // num_points won't work.  :(
-  const unsigned int eighthPoints = (num_points-1) / 8;
-
-  float* outPtr = outputVector;
-  const float* inPtr = inputVector;
-  __m256 upperBound = _mm256_set1_ps(bound);
-  __m256 lowerBound = _mm256_set1_ps(-bound);
-  __m256 next3old1;
-  __m256 next4;
-  __m256 boundAdjust;
-  __m256 posBoundAdjust = _mm256_set1_ps(-2*bound); // Subtract when we're above.
-  __m256 negBoundAdjust = _mm256_set1_ps(2*bound); // Add when we're below.
-  // Do the first 8 by hand since we're going in from the saveValue:
-  *outPtr = *inPtr - *saveValue;
-  if (*outPtr >  bound) *outPtr -= 2*bound;
-  if (*outPtr < -bound) *outPtr += 2*bound;
-  inPtr++;
-  outPtr++;
-  for (j = 1; j < ( (8 < num_points) ? 8 : num_points); j++) {
-    *outPtr = *(inPtr) - *(inPtr-1);
-    if (*outPtr >  bound) *outPtr -= 2*bound;
-    if (*outPtr < -bound) *outPtr += 2*bound;
+static inline void volk_32f_s32f_32f_fm_detect_32f_u_avx(float* outputVector,
+                                                         const float* inputVector,
+                                                         const float bound,
+                                                         float* saveValue,
+                                                         unsigned int num_points)
+{
+    if (num_points < 1) {
+        return;
+    }
+    unsigned int number = 1;
+    unsigned int j = 0;
+    // num_points-1 keeps Fedora 7's gcc from crashing...
+    // num_points won't work.  :(
+    const unsigned int eighthPoints = (num_points - 1) / 8;
+
+    float* outPtr = outputVector;
+    const float* inPtr = inputVector;
+    __m256 upperBound = _mm256_set1_ps(bound);
+    __m256 lowerBound = _mm256_set1_ps(-bound);
+    __m256 next3old1;
+    __m256 next4;
+    __m256 boundAdjust;
+    __m256 posBoundAdjust = _mm256_set1_ps(-2 * bound); // Subtract when we're above.
+    __m256 negBoundAdjust = _mm256_set1_ps(2 * bound);  // Add when we're below.
+    // Do the first 8 by hand since we're going in from the saveValue:
+    *outPtr = *inPtr - *saveValue;
+    if (*outPtr > bound)
+        *outPtr -= 2 * bound;
+    if (*outPtr < -bound)
+        *outPtr += 2 * bound;
      inPtr++;
      outPtr++;
-  }
-
-  for (; number < eighthPoints; number++) {
-    // Load data
-    next3old1 = _mm256_loadu_ps((float*) (inPtr-1));
-    next4 = _mm256_loadu_ps(inPtr);
-    inPtr += 8;
-    // Subtract and store:
-    next3old1 = _mm256_sub_ps(next4, next3old1);
-    // Bound:
-    boundAdjust = _mm256_cmp_ps(next3old1, upperBound, _CMP_GT_OS);
-    boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust);
-    next4 = _mm256_cmp_ps(next3old1, lowerBound, _CMP_LT_OS);
-    next4 = _mm256_and_ps(next4, negBoundAdjust);
-    boundAdjust = _mm256_or_ps(next4, boundAdjust);
-    // Make sure we're in the bounding interval:
-    next3old1 = _mm256_add_ps(next3old1, boundAdjust);
-    _mm256_storeu_ps(outPtr,next3old1); // Store the results back into the output
-    outPtr += 8;
-  }
-
-  for (number = (8 > (eighthPoints*8) ? 8 : (8 * eighthPoints)); number < num_points; number++) {
-    *outPtr = *(inPtr) - *(inPtr-1);
-    if (*outPtr >  bound) *outPtr -= 2*bound;
-    if (*outPtr < -bound) *outPtr += 2*bound;
-    inPtr++;
-    outPtr++;
-  }
-
-  *saveValue = inputVector[num_points-1];
+    for (j = 1; j < ((8 < num_points) ? 8 : num_points); j++) {
+        *outPtr = *(inPtr) - *(inPtr - 1);
+        if (*outPtr > bound)
+            *outPtr -= 2 * bound;
+        if (*outPtr < -bound)
+            *outPtr += 2 * bound;
+        inPtr++;
+        outPtr++;
+    }
+
+    for (; number < eighthPoints; number++) {
+        // Load data
+        next3old1 = _mm256_loadu_ps((float*)(inPtr - 1));
+        next4 = _mm256_loadu_ps(inPtr);
+        inPtr += 8;
+        // Subtract and store:
+        next3old1 = _mm256_sub_ps(next4, next3old1);
+        // Bound:
+        boundAdjust = _mm256_cmp_ps(next3old1, upperBound, _CMP_GT_OS);
+        boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust);
+        next4 = _mm256_cmp_ps(next3old1, lowerBound, _CMP_LT_OS);
+        next4 = _mm256_and_ps(next4, negBoundAdjust);
+        boundAdjust = _mm256_or_ps(next4, boundAdjust);
+        // Make sure we're in the bounding interval:
+        next3old1 = _mm256_add_ps(next3old1, boundAdjust);
+        _mm256_storeu_ps(outPtr, next3old1); // Store the results back into the output
+        outPtr += 8;
+    }
+
+    for (number = (8 > (eighthPoints * 8) ? 8 : (8 * eighthPoints)); number < num_points;
+         number++) {
+        *outPtr = *(inPtr) - *(inPtr - 1);
+        if (*outPtr > bound)
+            *outPtr -= 2 * bound;
+        if (*outPtr < -bound)
+            *outPtr += 2 * bound;
+        inPtr++;
+        outPtr++;
+    }
+
+    *saveValue = inputVector[num_points - 1];
  }
  #endif /* LV_HAVE_AVX */
  
diff --git a/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h b/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h

index ae371a20828de3e781081973bd6cde17b83f8558..e7e581facbcb9c929031eeff9d17c76b5d40fdd1 100644 (file)
--- a/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h
+++ b/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h
@@ -35,13 +35,15 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32f_s32f_calc_spectral_noise_floor_32f(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points)
- * \endcode
+ * void volk_32f_s32f_calc_spectral_noise_floor_32f(float* noiseFloorAmplitude, const
+ * float* realDataPoints, const float spectralExclusionValue, const unsigned int
+ * num_points) \endcode
   *
   * \b Inputs
   * \li realDataPoints: The input power spectrum.
- * \li spectralExclusionValue: The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20.
- * \li num_points: The number of data points.
+ * \li spectralExclusionValue: The number of dB above the noise floor that a data point
+ * must be to be excluded from the noise floor calculation - default value is 20. \li
+ * num_points: The number of data points.
   *
   * \b Outputs
   * \li noiseFloorAmplitude: The noise floor of the input spectrum, in dB.
@@ -59,9 +61,9 @@
  #ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a_H
  #define INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a_H
  
-#include <volk/volk_common.h>
  #include <inttypes.h>
  #include <stdio.h>
+#include <volk/volk_common.h>
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
@@ -72,114 +74,117 @@ volk_32f_s32f_calc_spectral_noise_floor_32f_a_avx(float* noiseFloorAmplitude,
                                                    const float spectralExclusionValue,
                                                    const unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-
-  const float* dataPointsPtr = realDataPoints;
-  __VOLK_ATTR_ALIGNED(32) float avgPointsVector[8];
-
-  __m256 dataPointsVal;
-  __m256 avgPointsVal = _mm256_setzero_ps();
-  // Calculate the sum (for mean) for all points
-  for(; number < eighthPoints; number++){
-
-    dataPointsVal = _mm256_load_ps(dataPointsPtr);
-
-    dataPointsPtr += 8;
-
-    avgPointsVal = _mm256_add_ps(avgPointsVal, dataPointsVal);
-  }
-
-  _mm256_store_ps(avgPointsVector, avgPointsVal);
-
-  float sumMean = 0.0;
-  sumMean += avgPointsVector[0];
-  sumMean += avgPointsVector[1];
-  sumMean += avgPointsVector[2];
-  sumMean += avgPointsVector[3];
-  sumMean += avgPointsVector[4];
-  sumMean += avgPointsVector[5];
-  sumMean += avgPointsVector[6];
-  sumMean += avgPointsVector[7];
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    sumMean += realDataPoints[number];
-  }
-
-  // calculate the spectral mean
-  // +20 because for the comparison below we only want to throw out bins
-  // that are significantly higher (and would, thus, affect the mean more
-  const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
-
-  dataPointsPtr = realDataPoints; // Reset the dataPointsPtr
-  __m256 vMeanAmplitudeVector = _mm256_set1_ps(meanAmplitude);
-  __m256 vOnesVector = _mm256_set1_ps(1.0);
-  __m256 vValidBinCount = _mm256_setzero_ps();
-  avgPointsVal = _mm256_setzero_ps();
-  __m256 compareMask;
-  number = 0;
-  // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude
-  for(; number < eighthPoints; number++){
-
-    dataPointsVal = _mm256_load_ps(dataPointsPtr);
-
-    dataPointsPtr += 8;
-
-    // Identify which items do not exceed the mean amplitude
-    compareMask = _mm256_cmp_ps(dataPointsVal, vMeanAmplitudeVector, _CMP_LE_OQ);
-
-    // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude
-    avgPointsVal = _mm256_add_ps(avgPointsVal, _mm256_and_ps(compareMask, dataPointsVal));
-
-    // Count the number of bins which do not exceed the mean amplitude
-    vValidBinCount = _mm256_add_ps(vValidBinCount, _mm256_and_ps(compareMask, vOnesVector));
-  }
-
-  // Calculate the mean from the remaining data points
-  _mm256_store_ps(avgPointsVector, avgPointsVal);
-
-  sumMean = 0.0;
-  sumMean += avgPointsVector[0];
-  sumMean += avgPointsVector[1];
-  sumMean += avgPointsVector[2];
-  sumMean += avgPointsVector[3];
-  sumMean += avgPointsVector[4];
-  sumMean += avgPointsVector[5];
-  sumMean += avgPointsVector[6];
-  sumMean += avgPointsVector[7];
-
-  // Calculate the number of valid bins from the remaining count
-  __VOLK_ATTR_ALIGNED(32) float validBinCountVector[8];
-  _mm256_store_ps(validBinCountVector, vValidBinCount);
-
-  float validBinCount = 0;
-  validBinCount += validBinCountVector[0];
-  validBinCount += validBinCountVector[1];
-  validBinCount += validBinCountVector[2];
-  validBinCount += validBinCountVector[3];
-  validBinCount += validBinCountVector[4];
-  validBinCount += validBinCountVector[5];
-  validBinCount += validBinCountVector[6];
-  validBinCount += validBinCountVector[7];
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    if(realDataPoints[number] <= meanAmplitude){
-      sumMean += realDataPoints[number];
-      validBinCount += 1.0;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    const float* dataPointsPtr = realDataPoints;
+    __VOLK_ATTR_ALIGNED(32) float avgPointsVector[8];
+
+    __m256 dataPointsVal;
+    __m256 avgPointsVal = _mm256_setzero_ps();
+    // Calculate the sum (for mean) for all points
+    for (; number < eighthPoints; number++) {
+
+        dataPointsVal = _mm256_load_ps(dataPointsPtr);
+
+        dataPointsPtr += 8;
+
+        avgPointsVal = _mm256_add_ps(avgPointsVal, dataPointsVal);
      }
-  }
  
-  float localNoiseFloorAmplitude = 0;
-  if(validBinCount > 0.0){
-    localNoiseFloorAmplitude = sumMean / validBinCount;
-  }
-  else{
-    localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal...
-  }
+    _mm256_store_ps(avgPointsVector, avgPointsVal);
+
+    float sumMean = 0.0;
+    sumMean += avgPointsVector[0];
+    sumMean += avgPointsVector[1];
+    sumMean += avgPointsVector[2];
+    sumMean += avgPointsVector[3];
+    sumMean += avgPointsVector[4];
+    sumMean += avgPointsVector[5];
+    sumMean += avgPointsVector[6];
+    sumMean += avgPointsVector[7];
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        sumMean += realDataPoints[number];
+    }
+
+    // calculate the spectral mean
+    // +20 because for the comparison below we only want to throw out bins
+    // that are significantly higher (and would, thus, affect the mean more
+    const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
+
+    dataPointsPtr = realDataPoints; // Reset the dataPointsPtr
+    __m256 vMeanAmplitudeVector = _mm256_set1_ps(meanAmplitude);
+    __m256 vOnesVector = _mm256_set1_ps(1.0);
+    __m256 vValidBinCount = _mm256_setzero_ps();
+    avgPointsVal = _mm256_setzero_ps();
+    __m256 compareMask;
+    number = 0;
+    // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude
+    for (; number < eighthPoints; number++) {
+
+        dataPointsVal = _mm256_load_ps(dataPointsPtr);
+
+        dataPointsPtr += 8;
+
+        // Identify which items do not exceed the mean amplitude
+        compareMask = _mm256_cmp_ps(dataPointsVal, vMeanAmplitudeVector, _CMP_LE_OQ);
+
+        // Mask off the items that exceed the mean amplitude and add the avg Points that
+        // do not exceed the mean amplitude
+        avgPointsVal =
+            _mm256_add_ps(avgPointsVal, _mm256_and_ps(compareMask, dataPointsVal));
+
+        // Count the number of bins which do not exceed the mean amplitude
+        vValidBinCount =
+            _mm256_add_ps(vValidBinCount, _mm256_and_ps(compareMask, vOnesVector));
+    }
  
-  *noiseFloorAmplitude = localNoiseFloorAmplitude;
+    // Calculate the mean from the remaining data points
+    _mm256_store_ps(avgPointsVector, avgPointsVal);
+
+    sumMean = 0.0;
+    sumMean += avgPointsVector[0];
+    sumMean += avgPointsVector[1];
+    sumMean += avgPointsVector[2];
+    sumMean += avgPointsVector[3];
+    sumMean += avgPointsVector[4];
+    sumMean += avgPointsVector[5];
+    sumMean += avgPointsVector[6];
+    sumMean += avgPointsVector[7];
+
+    // Calculate the number of valid bins from the remaining count
+    __VOLK_ATTR_ALIGNED(32) float validBinCountVector[8];
+    _mm256_store_ps(validBinCountVector, vValidBinCount);
+
+    float validBinCount = 0;
+    validBinCount += validBinCountVector[0];
+    validBinCount += validBinCountVector[1];
+    validBinCount += validBinCountVector[2];
+    validBinCount += validBinCountVector[3];
+    validBinCount += validBinCountVector[4];
+    validBinCount += validBinCountVector[5];
+    validBinCount += validBinCountVector[6];
+    validBinCount += validBinCountVector[7];
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        if (realDataPoints[number] <= meanAmplitude) {
+            sumMean += realDataPoints[number];
+            validBinCount += 1.0;
+        }
+    }
+
+    float localNoiseFloorAmplitude = 0;
+    if (validBinCount > 0.0) {
+        localNoiseFloorAmplitude = sumMean / validBinCount;
+    } else {
+        localNoiseFloorAmplitude =
+            meanAmplitude; // For the odd case that all the amplitudes are equal...
+    }
+
+    *noiseFloorAmplitude = localNoiseFloorAmplitude;
  }
  #endif /* LV_HAVE_AVX */
  
@@ -192,102 +197,103 @@ volk_32f_s32f_calc_spectral_noise_floor_32f_a_sse(float* noiseFloorAmplitude,
                                                    const float spectralExclusionValue,
                                                    const unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* dataPointsPtr = realDataPoints;
-  __VOLK_ATTR_ALIGNED(16) float avgPointsVector[4];
-
-  __m128 dataPointsVal;
-  __m128 avgPointsVal = _mm_setzero_ps();
-  // Calculate the sum (for mean) for all points
-  for(; number < quarterPoints; number++){
-
-    dataPointsVal = _mm_load_ps(dataPointsPtr);
-
-    dataPointsPtr += 4;
-
-    avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal);
-  }
-
-  _mm_store_ps(avgPointsVector, avgPointsVal);
-
-  float sumMean = 0.0;
-  sumMean += avgPointsVector[0];
-  sumMean += avgPointsVector[1];
-  sumMean += avgPointsVector[2];
-  sumMean += avgPointsVector[3];
-
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    sumMean += realDataPoints[number];
-  }
-
-  // calculate the spectral mean
-  // +20 because for the comparison below we only want to throw out bins
-  // that are significantly higher (and would, thus, affect the mean more
-  const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
-
-  dataPointsPtr = realDataPoints; // Reset the dataPointsPtr
-  __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude);
-  __m128 vOnesVector = _mm_set_ps1(1.0);
-  __m128 vValidBinCount = _mm_setzero_ps();
-  avgPointsVal = _mm_setzero_ps();
-  __m128 compareMask;
-  number = 0;
-  // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude
-  for(; number < quarterPoints; number++){
-
-    dataPointsVal = _mm_load_ps(dataPointsPtr);
-
-    dataPointsPtr += 4;
-
-    // Identify which items do not exceed the mean amplitude
-    compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector);
-
-    // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude
-    avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal));
-
-    // Count the number of bins which do not exceed the mean amplitude
-    vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector));
-  }
-
-  // Calculate the mean from the remaining data points
-  _mm_store_ps(avgPointsVector, avgPointsVal);
-
-  sumMean = 0.0;
-  sumMean += avgPointsVector[0];
-  sumMean += avgPointsVector[1];
-  sumMean += avgPointsVector[2];
-  sumMean += avgPointsVector[3];
-
-  // Calculate the number of valid bins from the remaining count
-  __VOLK_ATTR_ALIGNED(16) float validBinCountVector[4];
-  _mm_store_ps(validBinCountVector, vValidBinCount);
-
-  float validBinCount = 0;
-  validBinCount += validBinCountVector[0];
-  validBinCount += validBinCountVector[1];
-  validBinCount += validBinCountVector[2];
-  validBinCount += validBinCountVector[3];
-
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    if(realDataPoints[number] <= meanAmplitude){
-      sumMean += realDataPoints[number];
-      validBinCount += 1.0;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    const float* dataPointsPtr = realDataPoints;
+    __VOLK_ATTR_ALIGNED(16) float avgPointsVector[4];
+
+    __m128 dataPointsVal;
+    __m128 avgPointsVal = _mm_setzero_ps();
+    // Calculate the sum (for mean) for all points
+    for (; number < quarterPoints; number++) {
+
+        dataPointsVal = _mm_load_ps(dataPointsPtr);
+
+        dataPointsPtr += 4;
+
+        avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal);
+    }
+
+    _mm_store_ps(avgPointsVector, avgPointsVal);
+
+    float sumMean = 0.0;
+    sumMean += avgPointsVector[0];
+    sumMean += avgPointsVector[1];
+    sumMean += avgPointsVector[2];
+    sumMean += avgPointsVector[3];
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        sumMean += realDataPoints[number];
+    }
+
+    // calculate the spectral mean
+    // +20 because for the comparison below we only want to throw out bins
+    // that are significantly higher (and would, thus, affect the mean more
+    const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
+
+    dataPointsPtr = realDataPoints; // Reset the dataPointsPtr
+    __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude);
+    __m128 vOnesVector = _mm_set_ps1(1.0);
+    __m128 vValidBinCount = _mm_setzero_ps();
+    avgPointsVal = _mm_setzero_ps();
+    __m128 compareMask;
+    number = 0;
+    // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude
+    for (; number < quarterPoints; number++) {
+
+        dataPointsVal = _mm_load_ps(dataPointsPtr);
+
+        dataPointsPtr += 4;
+
+        // Identify which items do not exceed the mean amplitude
+        compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector);
+
+        // Mask off the items that exceed the mean amplitude and add the avg Points that
+        // do not exceed the mean amplitude
+        avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal));
+
+        // Count the number of bins which do not exceed the mean amplitude
+        vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector));
      }
-  }
  
-  float localNoiseFloorAmplitude = 0;
-  if(validBinCount > 0.0){
-    localNoiseFloorAmplitude = sumMean / validBinCount;
-  }
-  else{
-    localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal...
-  }
+    // Calculate the mean from the remaining data points
+    _mm_store_ps(avgPointsVector, avgPointsVal);
+
+    sumMean = 0.0;
+    sumMean += avgPointsVector[0];
+    sumMean += avgPointsVector[1];
+    sumMean += avgPointsVector[2];
+    sumMean += avgPointsVector[3];
+
+    // Calculate the number of valid bins from the remaining count
+    __VOLK_ATTR_ALIGNED(16) float validBinCountVector[4];
+    _mm_store_ps(validBinCountVector, vValidBinCount);
+
+    float validBinCount = 0;
+    validBinCount += validBinCountVector[0];
+    validBinCount += validBinCountVector[1];
+    validBinCount += validBinCountVector[2];
+    validBinCount += validBinCountVector[3];
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        if (realDataPoints[number] <= meanAmplitude) {
+            sumMean += realDataPoints[number];
+            validBinCount += 1.0;
+        }
+    }
+
+    float localNoiseFloorAmplitude = 0;
+    if (validBinCount > 0.0) {
+        localNoiseFloorAmplitude = sumMean / validBinCount;
+    } else {
+        localNoiseFloorAmplitude =
+            meanAmplitude; // For the odd case that all the amplitudes are equal...
+    }
  
-  *noiseFloorAmplitude = localNoiseFloorAmplitude;
+    *noiseFloorAmplitude = localNoiseFloorAmplitude;
  }
  #endif /* LV_HAVE_SSE */
  
@@ -300,36 +306,36 @@ volk_32f_s32f_calc_spectral_noise_floor_32f_generic(float* noiseFloorAmplitude,
                                                      const float spectralExclusionValue,
                                                      const unsigned int num_points)
  {
-  float sumMean = 0.0;
-  unsigned int number;
-  // find the sum (for mean), etc
-  for(number = 0; number < num_points; number++){
-    // sum (for mean)
-    sumMean += realDataPoints[number];
-  }
-
-  // calculate the spectral mean
-  // +20 because for the comparison below we only want to throw out bins
-  // that are significantly higher (and would, thus, affect the mean more)
-  const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue;
-
-  // now throw out any bins higher than the mean
-  sumMean = 0.0;
-  unsigned int newNumDataPoints = num_points;
-  for(number = 0; number < num_points; number++){
-    if (realDataPoints[number] <= meanAmplitude)
-      sumMean += realDataPoints[number];
-    else
-      newNumDataPoints--;
-  }
+    float sumMean = 0.0;
+    unsigned int number;
+    // find the sum (for mean), etc
+    for (number = 0; number < num_points; number++) {
+        // sum (for mean)
+        sumMean += realDataPoints[number];
+    }
+
+    // calculate the spectral mean
+    // +20 because for the comparison below we only want to throw out bins
+    // that are significantly higher (and would, thus, affect the mean more)
+    const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue;
+
+    // now throw out any bins higher than the mean
+    sumMean = 0.0;
+    unsigned int newNumDataPoints = num_points;
+    for (number = 0; number < num_points; number++) {
+        if (realDataPoints[number] <= meanAmplitude)
+            sumMean += realDataPoints[number];
+        else
+            newNumDataPoints--;
+    }
  
-  float localNoiseFloorAmplitude = 0.0;
-  if (newNumDataPoints == 0)             // in the odd case that all
-    localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal!
-  else
-    localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints);
+    float localNoiseFloorAmplitude = 0.0;
+    if (newNumDataPoints == 0)                    // in the odd case that all
+        localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal!
+    else
+        localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints);
  
-  *noiseFloorAmplitude = localNoiseFloorAmplitude;
+    *noiseFloorAmplitude = localNoiseFloorAmplitude;
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -339,9 +345,9 @@ volk_32f_s32f_calc_spectral_noise_floor_32f_generic(float* noiseFloorAmplitude,
  #ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_u_H
  #define INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_u_H
  
-#include <volk/volk_common.h>
  #include <inttypes.h>
  #include <stdio.h>
+#include <volk/volk_common.h>
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
@@ -352,114 +358,117 @@ volk_32f_s32f_calc_spectral_noise_floor_32f_u_avx(float* noiseFloorAmplitude,
                                                    const float spectralExclusionValue,
                                                    const unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-
-  const float* dataPointsPtr = realDataPoints;
-  __VOLK_ATTR_ALIGNED(16) float avgPointsVector[8];
-
-  __m256 dataPointsVal;
-  __m256 avgPointsVal = _mm256_setzero_ps();
-  // Calculate the sum (for mean) for all points
-  for(; number < eighthPoints; number++){
-
-    dataPointsVal = _mm256_loadu_ps(dataPointsPtr);
-
-    dataPointsPtr += 8;
-
-    avgPointsVal = _mm256_add_ps(avgPointsVal, dataPointsVal);
-  }
-
-  _mm256_storeu_ps(avgPointsVector, avgPointsVal);
-
-  float sumMean = 0.0;
-  sumMean += avgPointsVector[0];
-  sumMean += avgPointsVector[1];
-  sumMean += avgPointsVector[2];
-  sumMean += avgPointsVector[3];
-  sumMean += avgPointsVector[4];
-  sumMean += avgPointsVector[5];
-  sumMean += avgPointsVector[6];
-  sumMean += avgPointsVector[7];
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    sumMean += realDataPoints[number];
-  }
-
-  // calculate the spectral mean
-  // +20 because for the comparison below we only want to throw out bins
-  // that are significantly higher (and would, thus, affect the mean more
-  const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
-
-  dataPointsPtr = realDataPoints; // Reset the dataPointsPtr
-  __m256 vMeanAmplitudeVector = _mm256_set1_ps(meanAmplitude);
-  __m256 vOnesVector = _mm256_set1_ps(1.0);
-  __m256 vValidBinCount = _mm256_setzero_ps();
-  avgPointsVal = _mm256_setzero_ps();
-  __m256 compareMask;
-  number = 0;
-  // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude
-  for(; number < eighthPoints; number++){
-
-    dataPointsVal = _mm256_loadu_ps(dataPointsPtr);
-
-    dataPointsPtr += 8;
-
-    // Identify which items do not exceed the mean amplitude
-    compareMask = _mm256_cmp_ps(dataPointsVal, vMeanAmplitudeVector, _CMP_LE_OQ);
-
-    // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude
-    avgPointsVal = _mm256_add_ps(avgPointsVal, _mm256_and_ps(compareMask, dataPointsVal));
-
-    // Count the number of bins which do not exceed the mean amplitude
-    vValidBinCount = _mm256_add_ps(vValidBinCount, _mm256_and_ps(compareMask, vOnesVector));
-  }
-
-  // Calculate the mean from the remaining data points
-  _mm256_storeu_ps(avgPointsVector, avgPointsVal);
-
-  sumMean = 0.0;
-  sumMean += avgPointsVector[0];
-  sumMean += avgPointsVector[1];
-  sumMean += avgPointsVector[2];
-  sumMean += avgPointsVector[3];
-  sumMean += avgPointsVector[4];
-  sumMean += avgPointsVector[5];
-  sumMean += avgPointsVector[6];
-  sumMean += avgPointsVector[7];
-
-  // Calculate the number of valid bins from the remaining count
-  __VOLK_ATTR_ALIGNED(16) float validBinCountVector[8];
-  _mm256_storeu_ps(validBinCountVector, vValidBinCount);
-
-  float validBinCount = 0;
-  validBinCount += validBinCountVector[0];
-  validBinCount += validBinCountVector[1];
-  validBinCount += validBinCountVector[2];
-  validBinCount += validBinCountVector[3];
-  validBinCount += validBinCountVector[4];
-  validBinCount += validBinCountVector[5];
-  validBinCount += validBinCountVector[6];
-  validBinCount += validBinCountVector[7];
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    if(realDataPoints[number] <= meanAmplitude){
-      sumMean += realDataPoints[number];
-      validBinCount += 1.0;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    const float* dataPointsPtr = realDataPoints;
+    __VOLK_ATTR_ALIGNED(16) float avgPointsVector[8];
+
+    __m256 dataPointsVal;
+    __m256 avgPointsVal = _mm256_setzero_ps();
+    // Calculate the sum (for mean) for all points
+    for (; number < eighthPoints; number++) {
+
+        dataPointsVal = _mm256_loadu_ps(dataPointsPtr);
+
+        dataPointsPtr += 8;
+
+        avgPointsVal = _mm256_add_ps(avgPointsVal, dataPointsVal);
+    }
+
+    _mm256_storeu_ps(avgPointsVector, avgPointsVal);
+
+    float sumMean = 0.0;
+    sumMean += avgPointsVector[0];
+    sumMean += avgPointsVector[1];
+    sumMean += avgPointsVector[2];
+    sumMean += avgPointsVector[3];
+    sumMean += avgPointsVector[4];
+    sumMean += avgPointsVector[5];
+    sumMean += avgPointsVector[6];
+    sumMean += avgPointsVector[7];
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        sumMean += realDataPoints[number];
+    }
+
+    // calculate the spectral mean
+    // +20 because for the comparison below we only want to throw out bins
+    // that are significantly higher (and would, thus, affect the mean more
+    const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
+
+    dataPointsPtr = realDataPoints; // Reset the dataPointsPtr
+    __m256 vMeanAmplitudeVector = _mm256_set1_ps(meanAmplitude);
+    __m256 vOnesVector = _mm256_set1_ps(1.0);
+    __m256 vValidBinCount = _mm256_setzero_ps();
+    avgPointsVal = _mm256_setzero_ps();
+    __m256 compareMask;
+    number = 0;
+    // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude
+    for (; number < eighthPoints; number++) {
+
+        dataPointsVal = _mm256_loadu_ps(dataPointsPtr);
+
+        dataPointsPtr += 8;
+
+        // Identify which items do not exceed the mean amplitude
+        compareMask = _mm256_cmp_ps(dataPointsVal, vMeanAmplitudeVector, _CMP_LE_OQ);
+
+        // Mask off the items that exceed the mean amplitude and add the avg Points that
+        // do not exceed the mean amplitude
+        avgPointsVal =
+            _mm256_add_ps(avgPointsVal, _mm256_and_ps(compareMask, dataPointsVal));
+
+        // Count the number of bins which do not exceed the mean amplitude
+        vValidBinCount =
+            _mm256_add_ps(vValidBinCount, _mm256_and_ps(compareMask, vOnesVector));
+    }
+
+    // Calculate the mean from the remaining data points
+    _mm256_storeu_ps(avgPointsVector, avgPointsVal);
+
+    sumMean = 0.0;
+    sumMean += avgPointsVector[0];
+    sumMean += avgPointsVector[1];
+    sumMean += avgPointsVector[2];
+    sumMean += avgPointsVector[3];
+    sumMean += avgPointsVector[4];
+    sumMean += avgPointsVector[5];
+    sumMean += avgPointsVector[6];
+    sumMean += avgPointsVector[7];
+
+    // Calculate the number of valid bins from the remaining count
+    __VOLK_ATTR_ALIGNED(16) float validBinCountVector[8];
+    _mm256_storeu_ps(validBinCountVector, vValidBinCount);
+
+    float validBinCount = 0;
+    validBinCount += validBinCountVector[0];
+    validBinCount += validBinCountVector[1];
+    validBinCount += validBinCountVector[2];
+    validBinCount += validBinCountVector[3];
+    validBinCount += validBinCountVector[4];
+    validBinCount += validBinCountVector[5];
+    validBinCount += validBinCountVector[6];
+    validBinCount += validBinCountVector[7];
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        if (realDataPoints[number] <= meanAmplitude) {
+            sumMean += realDataPoints[number];
+            validBinCount += 1.0;
+        }
      }
-  }
  
-  float localNoiseFloorAmplitude = 0;
-  if(validBinCount > 0.0){
-    localNoiseFloorAmplitude = sumMean / validBinCount;
-  }
-  else{
-    localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal...
-  }
+    float localNoiseFloorAmplitude = 0;
+    if (validBinCount > 0.0) {
+        localNoiseFloorAmplitude = sumMean / validBinCount;
+    } else {
+        localNoiseFloorAmplitude =
+            meanAmplitude; // For the odd case that all the amplitudes are equal...
+    }
  
-  *noiseFloorAmplitude = localNoiseFloorAmplitude;
+    *noiseFloorAmplitude = localNoiseFloorAmplitude;
  }
  #endif /* LV_HAVE_AVX */
  #endif /* INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_u_H */
diff --git a/kernels/volk/volk_32f_s32f_convert_16i.h b/kernels/volk/volk_32f_s32f_convert_16i.h

index 27ef4d9849c0d649355804960648ea10df0fb3df..c9469b79859be98a98a7e71dcf0b9870abcc5d52 100644 (file)
--- a/kernels/volk/volk_32f_s32f_convert_16i.h
+++ b/kernels/volk/volk_32f_s32f_convert_16i.h
@@ -30,8 +30,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32f_s32f_convert_16i(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points)
- * \endcode
+ * void volk_32f_s32f_convert_16i(int16_t* outputVector, const float* inputVector, const
+ * float scalar, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li inputVector: the input vector of floats.
@@ -42,11 +42,10 @@
   * \li outputVector: The output vector.
   *
   * \b Example
- * Convert floats from [-1,1] to 16-bit integers with a scale of 5 to maintain smallest delta
- *  int N = 10;
- *   unsigned int alignment = volk_get_alignment();
- *   float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
- *   int16_t* out = (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment);
+ * Convert floats from [-1,1] to 16-bit integers with a scale of 5 to maintain smallest
+ * delta int N = 10; unsigned int alignment = volk_get_alignment(); float* increasing =
+ * (float*)volk_malloc(sizeof(float)*N, alignment); int16_t* out =
+ * (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment);
   *
   *   for(unsigned int ii = 0; ii < N; ++ii){
   *       increasing[ii] = 2.f * ((float)ii / (float)N) - 1.f;
@@ -76,55 +75,60 @@
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector, const float* inputVector,
-                                 const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector,
+                                                    const float* inputVector,
+                                                    const float scalar,
+                                                    unsigned int num_points)
  {
-  unsigned int number = 0;
-
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  const float* inputVectorPtr = (const float*)inputVector;
-  int16_t* outputVectorPtr = outputVector;
-
-  float min_val = SHRT_MIN;
-  float max_val = SHRT_MAX;
-  float r;
-
-  __m256 vScalar = _mm256_set1_ps(scalar);
-  __m256 inputVal1, inputVal2;
-  __m256i intInputVal1, intInputVal2;
-  __m256 ret1, ret2;
-  __m256 vmin_val = _mm256_set1_ps(min_val);
-  __m256 vmax_val = _mm256_set1_ps(max_val);
-
-  for(;number < sixteenthPoints; number++){
-    inputVal1 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
-    inputVal2 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
-
-    // Scale and clip
-    ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
-    ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
-
-    intInputVal1 = _mm256_cvtps_epi32(ret1);
-    intInputVal2 = _mm256_cvtps_epi32(ret2);
-
-    intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
-    intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
-
-    _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 16;
-  }
-
-  number = sixteenthPoints * 16;
-  for(; number < num_points; number++){
-    r = inputVector[number] * scalar;
-    if(r > max_val)
-      r = max_val;
-    else if(r < min_val)
-      r = min_val;
-    outputVector[number] = (int16_t)rintf(r);
-  }
+    unsigned int number = 0;
+
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    const float* inputVectorPtr = (const float*)inputVector;
+    int16_t* outputVectorPtr = outputVector;
+
+    float min_val = SHRT_MIN;
+    float max_val = SHRT_MAX;
+    float r;
+
+    __m256 vScalar = _mm256_set1_ps(scalar);
+    __m256 inputVal1, inputVal2;
+    __m256i intInputVal1, intInputVal2;
+    __m256 ret1, ret2;
+    __m256 vmin_val = _mm256_set1_ps(min_val);
+    __m256 vmax_val = _mm256_set1_ps(max_val);
+
+    for (; number < sixteenthPoints; number++) {
+        inputVal1 = _mm256_loadu_ps(inputVectorPtr);
+        inputVectorPtr += 8;
+        inputVal2 = _mm256_loadu_ps(inputVectorPtr);
+        inputVectorPtr += 8;
+
+        // Scale and clip
+        ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val),
+                             vmin_val);
+        ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val),
+                             vmin_val);
+
+        intInputVal1 = _mm256_cvtps_epi32(ret1);
+        intInputVal2 = _mm256_cvtps_epi32(ret2);
+
+        intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
+        intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+
+        _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
+        outputVectorPtr += 16;
+    }
+
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        r = inputVector[number] * scalar;
+        if (r > max_val)
+            r = max_val;
+        else if (r < min_val)
+            r = min_val;
+        outputVector[number] = (int16_t)rintf(r);
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
@@ -132,54 +136,57 @@ volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector, const float* inputVector
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector, const float* inputVector,
-                                const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector,
+                                                   const float* inputVector,
+                                                   const float scalar,
+                                                   unsigned int num_points)
  {
-  unsigned int number = 0;
+    unsigned int number = 0;
  
-  const unsigned int eighthPoints = num_points / 8;
+    const unsigned int eighthPoints = num_points / 8;
  
-  const float* inputVectorPtr = (const float*)inputVector;
-  int16_t* outputVectorPtr = outputVector;
+    const float* inputVectorPtr = (const float*)inputVector;
+    int16_t* outputVectorPtr = outputVector;
  
-  float min_val = SHRT_MIN;
-  float max_val = SHRT_MAX;
-  float r;
+    float min_val = SHRT_MIN;
+    float max_val = SHRT_MAX;
+    float r;
  
-  __m256 vScalar = _mm256_set1_ps(scalar);
-  __m256 inputVal, ret;
-  __m256i intInputVal;
-  __m128i intInputVal1, intInputVal2;
-  __m256 vmin_val = _mm256_set1_ps(min_val);
-  __m256 vmax_val = _mm256_set1_ps(max_val);
+    __m256 vScalar = _mm256_set1_ps(scalar);
+    __m256 inputVal, ret;
+    __m256i intInputVal;
+    __m128i intInputVal1, intInputVal2;
+    __m256 vmin_val = _mm256_set1_ps(min_val);
+    __m256 vmax_val = _mm256_set1_ps(max_val);
  
-  for(;number < eighthPoints; number++){
-    inputVal = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
+    for (; number < eighthPoints; number++) {
+        inputVal = _mm256_loadu_ps(inputVectorPtr);
+        inputVectorPtr += 8;
  
-    // Scale and clip
-    ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), vmin_val);
+        // Scale and clip
+        ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val),
+                            vmin_val);
  
-    intInputVal = _mm256_cvtps_epi32(ret);
+        intInputVal = _mm256_cvtps_epi32(ret);
  
-    intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
-    intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
+        intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
+        intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
  
-    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
  
-    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 8;
-  }
+        _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+        outputVectorPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    r = inputVector[number] * scalar;
-    if(r > max_val)
-      r = max_val;
-    else if(r < min_val)
-      r = min_val;
-    outputVector[number] = (int16_t)rintf(r);
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        r = inputVector[number] * scalar;
+        if (r > max_val)
+            r = max_val;
+        else if (r < min_val)
+            r = min_val;
+        outputVector[number] = (int16_t)rintf(r);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
@@ -187,54 +194,57 @@ volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector, const float* inputVector,
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void
-volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector,
-                                 const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector,
+                                                    const float* inputVector,
+                                                    const float scalar,
+                                                    unsigned int num_points)
  {
-  unsigned int number = 0;
-
-  const unsigned int eighthPoints = num_points / 8;
-
-  const float* inputVectorPtr = (const float*)inputVector;
-  int16_t* outputVectorPtr = outputVector;
-
-  float min_val = SHRT_MIN;
-  float max_val = SHRT_MAX;
-  float r;
-
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 inputVal1, inputVal2;
-  __m128i intInputVal1, intInputVal2;
-  __m128 ret1, ret2;
-  __m128 vmin_val = _mm_set_ps1(min_val);
-  __m128 vmax_val = _mm_set_ps1(max_val);
-
-  for(;number < eighthPoints; number++){
-    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-
-    // Scale and clip
-    ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
-    ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
-
-    intInputVal1 = _mm_cvtps_epi32(ret1);
-    intInputVal2 = _mm_cvtps_epi32(ret2);
-
-    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
-
-    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    r = inputVector[number] * scalar;
-    if(r > max_val)
-      r = max_val;
-    else if(r < min_val)
-      r = min_val;
-    outputVector[number] = (int16_t)rintf(r);
-  }
+    unsigned int number = 0;
+
+    const unsigned int eighthPoints = num_points / 8;
+
+    const float* inputVectorPtr = (const float*)inputVector;
+    int16_t* outputVectorPtr = outputVector;
+
+    float min_val = SHRT_MIN;
+    float max_val = SHRT_MAX;
+    float r;
+
+    __m128 vScalar = _mm_set_ps1(scalar);
+    __m128 inputVal1, inputVal2;
+    __m128i intInputVal1, intInputVal2;
+    __m128 ret1, ret2;
+    __m128 vmin_val = _mm_set_ps1(min_val);
+    __m128 vmax_val = _mm_set_ps1(max_val);
+
+    for (; number < eighthPoints; number++) {
+        inputVal1 = _mm_loadu_ps(inputVectorPtr);
+        inputVectorPtr += 4;
+        inputVal2 = _mm_loadu_ps(inputVectorPtr);
+        inputVectorPtr += 4;
+
+        // Scale and clip
+        ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+        ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+
+        intInputVal1 = _mm_cvtps_epi32(ret1);
+        intInputVal2 = _mm_cvtps_epi32(ret2);
+
+        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+        _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+        outputVectorPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        r = inputVector[number] * scalar;
+        if (r > max_val)
+            r = max_val;
+        else if (r < min_val)
+            r = min_val;
+        outputVector[number] = (int16_t)rintf(r);
+    }
  }
  #endif /* LV_HAVE_SSE2 */
  
@@ -242,76 +252,78 @@ volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const float* inputVector,
-                                const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector,
+                                                   const float* inputVector,
+                                                   const float scalar,
+                                                   unsigned int num_points)
  {
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* inputVectorPtr = (const float*)inputVector;
-  int16_t* outputVectorPtr = outputVector;
-
-  float min_val = SHRT_MIN;
-  float max_val = SHRT_MAX;
-  float r;
-
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 ret;
-  __m128 vmin_val = _mm_set_ps1(min_val);
-  __m128 vmax_val = _mm_set_ps1(max_val);
-
-  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
-
-  for(;number < quarterPoints; number++){
-    ret = _mm_loadu_ps(inputVectorPtr);
-    inputVectorPtr += 4;
-
-    // Scale and clip
-    ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
-
-    _mm_store_ps(outputFloatBuffer, ret);
-    *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
-    *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
-    *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
-    *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    r = inputVector[number] * scalar;
-    if(r > max_val)
-      r = max_val;
-    else if(r < min_val)
-      r = min_val;
-    outputVector[number] = (int16_t)rintf(r);
-  }
+    unsigned int number = 0;
+
+    const unsigned int quarterPoints = num_points / 4;
+
+    const float* inputVectorPtr = (const float*)inputVector;
+    int16_t* outputVectorPtr = outputVector;
+
+    float min_val = SHRT_MIN;
+    float max_val = SHRT_MAX;
+    float r;
+
+    __m128 vScalar = _mm_set_ps1(scalar);
+    __m128 ret;
+    __m128 vmin_val = _mm_set_ps1(min_val);
+    __m128 vmax_val = _mm_set_ps1(max_val);
+
+    __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+    for (; number < quarterPoints; number++) {
+        ret = _mm_loadu_ps(inputVectorPtr);
+        inputVectorPtr += 4;
+
+        // Scale and clip
+        ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+
+        _mm_store_ps(outputFloatBuffer, ret);
+        *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
+        *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
+        *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
+        *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
+    }
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        r = inputVector[number] * scalar;
+        if (r > max_val)
+            r = max_val;
+        else if (r < min_val)
+            r = min_val;
+        outputVector[number] = (int16_t)rintf(r);
+    }
  }
  #endif /* LV_HAVE_SSE */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVector,
-                                  const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_16i_generic(int16_t* outputVector,
+                                                     const float* inputVector,
+                                                     const float scalar,
+                                                     unsigned int num_points)
  {
-  int16_t* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-  float min_val = SHRT_MIN;
-  float max_val = SHRT_MAX;
-  float r;
-
-  for(number = 0; number < num_points; number++){
-    r = *inputVectorPtr++  * scalar;
-    if(r > max_val)
-      r = max_val;
-    else if(r < min_val)
-      r = min_val;
-    *outputVectorPtr++ = (int16_t)rintf(r);
-  }
+    int16_t* outputVectorPtr = outputVector;
+    const float* inputVectorPtr = inputVector;
+    unsigned int number = 0;
+    float min_val = SHRT_MIN;
+    float max_val = SHRT_MAX;
+    float r;
+
+    for (number = 0; number < num_points; number++) {
+        r = *inputVectorPtr++ * scalar;
+        if (r > max_val)
+            r = max_val;
+        else if (r < min_val)
+            r = min_val;
+        *outputVectorPtr++ = (int16_t)rintf(r);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -320,63 +332,68 @@ volk_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVecto
  #ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H
  #define INCLUDED_volk_32f_s32f_convert_16i_a_H
  
-#include <volk/volk_common.h>
  #include <inttypes.h>
-#include <stdio.h>
  #include <math.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector, const float* inputVector,
-                                 const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector,
+                                                    const float* inputVector,
+                                                    const float scalar,
+                                                    unsigned int num_points)
  {
-  unsigned int number = 0;
-
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  const float* inputVectorPtr = (const float*)inputVector;
-  int16_t* outputVectorPtr = outputVector;
-
-  float min_val = SHRT_MIN;
-  float max_val = SHRT_MAX;
-  float r;
-
-  __m256 vScalar = _mm256_set1_ps(scalar);
-  __m256 inputVal1, inputVal2;
-  __m256i intInputVal1, intInputVal2;
-  __m256 ret1, ret2;
-  __m256 vmin_val = _mm256_set1_ps(min_val);
-  __m256 vmax_val = _mm256_set1_ps(max_val);
-
-  for(;number < sixteenthPoints; number++){
-    inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
-    inputVal2 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
-
-    // Scale and clip
-    ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
-    ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
-
-    intInputVal1 = _mm256_cvtps_epi32(ret1);
-    intInputVal2 = _mm256_cvtps_epi32(ret2);
-
-    intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
-    intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
-
-    _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 16;
-  }
-
-  number = sixteenthPoints * 16;
-  for(; number < num_points; number++){
-    r = inputVector[number] * scalar;
-    if(r > max_val)
-      r = max_val;
-    else if(r < min_val)
-      r = min_val;
-    outputVector[number] = (int16_t)rintf(r);
-  }
+    unsigned int number = 0;
+
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    const float* inputVectorPtr = (const float*)inputVector;
+    int16_t* outputVectorPtr = outputVector;
+
+    float min_val = SHRT_MIN;
+    float max_val = SHRT_MAX;
+    float r;
+
+    __m256 vScalar = _mm256_set1_ps(scalar);
+    __m256 inputVal1, inputVal2;
+    __m256i intInputVal1, intInputVal2;
+    __m256 ret1, ret2;
+    __m256 vmin_val = _mm256_set1_ps(min_val);
+    __m256 vmax_val = _mm256_set1_ps(max_val);
+
+    for (; number < sixteenthPoints; number++) {
+        inputVal1 = _mm256_load_ps(inputVectorPtr);
+        inputVectorPtr += 8;
+        inputVal2 = _mm256_load_ps(inputVectorPtr);
+        inputVectorPtr += 8;
+
+        // Scale and clip
+        ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val),
+                             vmin_val);
+        ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val),
+                             vmin_val);
+
+        intInputVal1 = _mm256_cvtps_epi32(ret1);
+        intInputVal2 = _mm256_cvtps_epi32(ret2);
+
+        intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
+        intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+
+        _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
+        outputVectorPtr += 16;
+    }
+
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        r = inputVector[number] * scalar;
+        if (r > max_val)
+            r = max_val;
+        else if (r < min_val)
+            r = min_val;
+        outputVector[number] = (int16_t)rintf(r);
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
@@ -384,108 +401,114 @@ volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector, const float* inputVector
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32f_s32f_convert_16i_a_avx(int16_t* outputVector, const float* inputVector,
-                                const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_16i_a_avx(int16_t* outputVector,
+                                                   const float* inputVector,
+                                                   const float scalar,
+                                                   unsigned int num_points)
  {
-  unsigned int number = 0;
+    unsigned int number = 0;
  
-  const unsigned int eighthPoints = num_points / 8;
+    const unsigned int eighthPoints = num_points / 8;
  
-  const float* inputVectorPtr = (const float*)inputVector;
-  int16_t* outputVectorPtr = outputVector;
+    const float* inputVectorPtr = (const float*)inputVector;
+    int16_t* outputVectorPtr = outputVector;
  
-  float min_val = SHRT_MIN;
-  float max_val = SHRT_MAX;
-  float r;
+    float min_val = SHRT_MIN;
+    float max_val = SHRT_MAX;
+    float r;
  
-  __m256 vScalar = _mm256_set1_ps(scalar);
-  __m256 inputVal, ret;
-  __m256i intInputVal;
-  __m128i intInputVal1, intInputVal2;
-  __m256 vmin_val = _mm256_set1_ps(min_val);
-  __m256 vmax_val = _mm256_set1_ps(max_val);
+    __m256 vScalar = _mm256_set1_ps(scalar);
+    __m256 inputVal, ret;
+    __m256i intInputVal;
+    __m128i intInputVal1, intInputVal2;
+    __m256 vmin_val = _mm256_set1_ps(min_val);
+    __m256 vmax_val = _mm256_set1_ps(max_val);
  
-  for(;number < eighthPoints; number++){
-    inputVal = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
+    for (; number < eighthPoints; number++) {
+        inputVal = _mm256_load_ps(inputVectorPtr);
+        inputVectorPtr += 8;
  
-    // Scale and clip
-    ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), vmin_val);
+        // Scale and clip
+        ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val),
+                            vmin_val);
  
-    intInputVal = _mm256_cvtps_epi32(ret);
+        intInputVal = _mm256_cvtps_epi32(ret);
  
-    intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
-    intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
+        intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
+        intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
  
-    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
  
-    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 8;
-  }
+        _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+        outputVectorPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    r = inputVector[number] * scalar;
-    if(r > max_val)
-      r = max_val;
-    else if(r < min_val)
-      r = min_val;
-    outputVector[number] = (int16_t)rintf(r);
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        r = inputVector[number] * scalar;
+        if (r > max_val)
+            r = max_val;
+        else if (r < min_val)
+            r = min_val;
+        outputVector[number] = (int16_t)rintf(r);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void
-volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector,
-                                 const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector,
+                                                    const float* inputVector,
+                                                    const float scalar,
+                                                    unsigned int num_points)
  {
-  unsigned int number = 0;
-
-  const unsigned int eighthPoints = num_points / 8;
-
-  const float* inputVectorPtr = (const float*)inputVector;
-  int16_t* outputVectorPtr = outputVector;
-
-  float min_val = SHRT_MIN;
-  float max_val = SHRT_MAX;
-  float r;
-
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 inputVal1, inputVal2;
-  __m128i intInputVal1, intInputVal2;
-  __m128 ret1, ret2;
-  __m128 vmin_val = _mm_set_ps1(min_val);
-  __m128 vmax_val = _mm_set_ps1(max_val);
-
-  for(;number < eighthPoints; number++){
-    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-
-    // Scale and clip
-    ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
-    ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
-
-    intInputVal1 = _mm_cvtps_epi32(ret1);
-    intInputVal2 = _mm_cvtps_epi32(ret2);
-
-    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
-
-    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    r = inputVector[number] * scalar;
-    if(r > max_val)
-      r = max_val;
-    else if(r < min_val)
-      r = min_val;
-    outputVector[number] = (int16_t)rintf(r);
-  }
+    unsigned int number = 0;
+
+    const unsigned int eighthPoints = num_points / 8;
+
+    const float* inputVectorPtr = (const float*)inputVector;
+    int16_t* outputVectorPtr = outputVector;
+
+    float min_val = SHRT_MIN;
+    float max_val = SHRT_MAX;
+    float r;
+
+    __m128 vScalar = _mm_set_ps1(scalar);
+    __m128 inputVal1, inputVal2;
+    __m128i intInputVal1, intInputVal2;
+    __m128 ret1, ret2;
+    __m128 vmin_val = _mm_set_ps1(min_val);
+    __m128 vmax_val = _mm_set_ps1(max_val);
+
+    for (; number < eighthPoints; number++) {
+        inputVal1 = _mm_load_ps(inputVectorPtr);
+        inputVectorPtr += 4;
+        inputVal2 = _mm_load_ps(inputVectorPtr);
+        inputVectorPtr += 4;
+
+        // Scale and clip
+        ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+        ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+
+        intInputVal1 = _mm_cvtps_epi32(ret1);
+        intInputVal2 = _mm_cvtps_epi32(ret2);
+
+        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+        _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+        outputVectorPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        r = inputVector[number] * scalar;
+        if (r > max_val)
+            r = max_val;
+        else if (r < min_val)
+            r = min_val;
+        outputVector[number] = (int16_t)rintf(r);
+    }
  }
  #endif /* LV_HAVE_SSE2 */
  
@@ -493,76 +516,78 @@ volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector,
-                                const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector,
+                                                   const float* inputVector,
+                                                   const float scalar,
+                                                   unsigned int num_points)
  {
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* inputVectorPtr = (const float*)inputVector;
-  int16_t* outputVectorPtr = outputVector;
-
-  float min_val = SHRT_MIN;
-  float max_val = SHRT_MAX;
-  float r;
-
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 ret;
-  __m128 vmin_val = _mm_set_ps1(min_val);
-  __m128 vmax_val = _mm_set_ps1(max_val);
-
-  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
-
-  for(;number < quarterPoints; number++){
-    ret = _mm_load_ps(inputVectorPtr);
-    inputVectorPtr += 4;
-
-    // Scale and clip
-    ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
-
-    _mm_store_ps(outputFloatBuffer, ret);
-    *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
-    *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
-    *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
-    *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    r = inputVector[number] * scalar;
-    if(r > max_val)
-      r = max_val;
-    else if(r < min_val)
-      r = min_val;
-    outputVector[number] = (int16_t)rintf(r);
-  }
+    unsigned int number = 0;
+
+    const unsigned int quarterPoints = num_points / 4;
+
+    const float* inputVectorPtr = (const float*)inputVector;
+    int16_t* outputVectorPtr = outputVector;
+
+    float min_val = SHRT_MIN;
+    float max_val = SHRT_MAX;
+    float r;
+
+    __m128 vScalar = _mm_set_ps1(scalar);
+    __m128 ret;
+    __m128 vmin_val = _mm_set_ps1(min_val);
+    __m128 vmax_val = _mm_set_ps1(max_val);
+
+    __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+    for (; number < quarterPoints; number++) {
+        ret = _mm_load_ps(inputVectorPtr);
+        inputVectorPtr += 4;
+
+        // Scale and clip
+        ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+
+        _mm_store_ps(outputFloatBuffer, ret);
+        *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
+        *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
+        *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
+        *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
+    }
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        r = inputVector[number] * scalar;
+        if (r > max_val)
+            r = max_val;
+        else if (r < min_val)
+            r = min_val;
+        outputVector[number] = (int16_t)rintf(r);
+    }
  }
  #endif /* LV_HAVE_SSE */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector,
-                                    const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector,
+                                                       const float* inputVector,
+                                                       const float scalar,
+                                                       unsigned int num_points)
  {
-  int16_t* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-  float min_val = SHRT_MIN;
-  float max_val = SHRT_MAX;
-  float r;
-
-  for(number = 0; number < num_points; number++){
-    r  = *inputVectorPtr++ * scalar;
-    if(r < min_val)
-      r = min_val;
-    else if(r > max_val)
-      r = max_val;
-    *outputVectorPtr++ = (int16_t)rintf(r);
-  }
+    int16_t* outputVectorPtr = outputVector;
+    const float* inputVectorPtr = inputVector;
+    unsigned int number = 0;
+    float min_val = SHRT_MIN;
+    float max_val = SHRT_MAX;
+    float r;
+
+    for (number = 0; number < num_points; number++) {
+        r = *inputVectorPtr++ * scalar;
+        if (r < min_val)
+            r = min_val;
+        else if (r > max_val)
+            r = max_val;
+        *outputVectorPtr++ = (int16_t)rintf(r);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
diff --git a/kernels/volk/volk_32f_s32f_convert_32i.h b/kernels/volk/volk_32f_s32f_convert_32i.h

index d2a65a09ae7651f650199c41dbd17434e2e66bff..d5f7cd463514882f110b3155799dfe3ccd7265e8 100644 (file)
--- a/kernels/volk/volk_32f_s32f_convert_32i.h
+++ b/kernels/volk/volk_32f_s32f_convert_32i.h
@@ -30,8 +30,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32f_s32f_convert_32i(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points)
- * \endcode
+ * void volk_32f_s32f_convert_32i(int32_t* outputVector, const float* inputVector, const
+ * float scalar, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li inputVector: the input vector of floats.
@@ -77,46 +77,49 @@
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32f_s32f_convert_32i_u_avx(int32_t* outputVector, const float* inputVector,
-                                const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_32i_u_avx(int32_t* outputVector,
+                                                   const float* inputVector,
+                                                   const float scalar,
+                                                   unsigned int num_points)
  {
-  unsigned int number = 0;
-
-  const unsigned int eighthPoints = num_points / 8;
-
-  const float* inputVectorPtr = (const float*)inputVector;
-  int32_t* outputVectorPtr = outputVector;
-
-  float min_val = INT_MIN;
-  float max_val = INT_MAX;
-  float r;
-
-  __m256 vScalar = _mm256_set1_ps(scalar);
-  __m256 inputVal1;
-  __m256i intInputVal1;
-  __m256 vmin_val = _mm256_set1_ps(min_val);
-  __m256 vmax_val = _mm256_set1_ps(max_val);
-
-  for(;number < eighthPoints; number++){
-    inputVal1 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
-
-    inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
-    intInputVal1 = _mm256_cvtps_epi32(inputVal1);
-
-    _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    r = inputVector[number] * scalar;
-    if(r > max_val)
-      r = max_val;
-    else if(r < min_val)
-      r = min_val;
-    outputVector[number] = (int32_t)rintf(r);
-  }
+    unsigned int number = 0;
+
+    const unsigned int eighthPoints = num_points / 8;
+
+    const float* inputVectorPtr = (const float*)inputVector;
+    int32_t* outputVectorPtr = outputVector;
+
+    float min_val = INT_MIN;
+    float max_val = INT_MAX;
+    float r;
+
+    __m256 vScalar = _mm256_set1_ps(scalar);
+    __m256 inputVal1;
+    __m256i intInputVal1;
+    __m256 vmin_val = _mm256_set1_ps(min_val);
+    __m256 vmax_val = _mm256_set1_ps(max_val);
+
+    for (; number < eighthPoints; number++) {
+        inputVal1 = _mm256_loadu_ps(inputVectorPtr);
+        inputVectorPtr += 8;
+
+        inputVal1 = _mm256_max_ps(
+            _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+        intInputVal1 = _mm256_cvtps_epi32(inputVal1);
+
+        _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
+        outputVectorPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        r = inputVector[number] * scalar;
+        if (r > max_val)
+            r = max_val;
+        else if (r < min_val)
+            r = min_val;
+        outputVector[number] = (int32_t)rintf(r);
+    }
  }
  
  #endif /* LV_HAVE_AVX */
@@ -124,46 +127,49 @@ volk_32f_s32f_convert_32i_u_avx(int32_t* outputVector, const float* inputVector,
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void
-volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const float* inputVector,
-                                 const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector,
+                                                    const float* inputVector,
+                                                    const float scalar,
+                                                    unsigned int num_points)
  {
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* inputVectorPtr = (const float*)inputVector;
-  int32_t* outputVectorPtr = outputVector;
-
-  float min_val = INT_MIN;
-  float max_val = INT_MAX;
-  float r;
-
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 inputVal1;
-  __m128i intInputVal1;
-  __m128 vmin_val = _mm_set_ps1(min_val);
-  __m128 vmax_val = _mm_set_ps1(max_val);
-
-  for(;number < quarterPoints; number++){
-    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-
-    inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
-    intInputVal1 = _mm_cvtps_epi32(inputVal1);
-
-    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    r = inputVector[number] * scalar;
-    if(r > max_val)
-      r = max_val;
-    else if(r < min_val)
-      r = min_val;
-    outputVector[number] = (int32_t)rintf(r);
-  }
+    unsigned int number = 0;
+
+    const unsigned int quarterPoints = num_points / 4;
+
+    const float* inputVectorPtr = (const float*)inputVector;
+    int32_t* outputVectorPtr = outputVector;
+
+    float min_val = INT_MIN;
+    float max_val = INT_MAX;
+    float r;
+
+    __m128 vScalar = _mm_set_ps1(scalar);
+    __m128 inputVal1;
+    __m128i intInputVal1;
+    __m128 vmin_val = _mm_set_ps1(min_val);
+    __m128 vmax_val = _mm_set_ps1(max_val);
+
+    for (; number < quarterPoints; number++) {
+        inputVal1 = _mm_loadu_ps(inputVectorPtr);
+        inputVectorPtr += 4;
+
+        inputVal1 =
+            _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+        intInputVal1 = _mm_cvtps_epi32(inputVal1);
+
+        _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+        outputVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        r = inputVector[number] * scalar;
+        if (r > max_val)
+            r = max_val;
+        else if (r < min_val)
+            r = min_val;
+        outputVector[number] = (int32_t)rintf(r);
+    }
  }
  
  #endif /* LV_HAVE_SSE2 */
@@ -172,50 +178,51 @@ volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const float* inputVector
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const float* inputVector,
-                                const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector,
+                                                   const float* inputVector,
+                                                   const float scalar,
+                                                   unsigned int num_points)
  {
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* inputVectorPtr = (const float*)inputVector;
-  int32_t* outputVectorPtr = outputVector;
-
-  float min_val = INT_MIN;
-  float max_val = INT_MAX;
-  float r;
-
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 ret;
-  __m128 vmin_val = _mm_set_ps1(min_val);
-  __m128 vmax_val = _mm_set_ps1(max_val);
-
-  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
-
-  for(;number < quarterPoints; number++){
-    ret = _mm_loadu_ps(inputVectorPtr);
-    inputVectorPtr += 4;
-
-    ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
-
-    _mm_store_ps(outputFloatBuffer, ret);
-    *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]);
-    *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]);
-    *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]);
-    *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    r = inputVector[number] * scalar;
-    if(r > max_val)
-      r = max_val;
-    else if(r < min_val)
-      r = min_val;
-    outputVector[number] = (int32_t)rintf(r);
-  }
+    unsigned int number = 0;
+
+    const unsigned int quarterPoints = num_points / 4;
+
+    const float* inputVectorPtr = (const float*)inputVector;
+    int32_t* outputVectorPtr = outputVector;
+
+    float min_val = INT_MIN;
+    float max_val = INT_MAX;
+    float r;
+
+    __m128 vScalar = _mm_set_ps1(scalar);
+    __m128 ret;
+    __m128 vmin_val = _mm_set_ps1(min_val);
+    __m128 vmax_val = _mm_set_ps1(max_val);
+
+    __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+    for (; number < quarterPoints; number++) {
+        ret = _mm_loadu_ps(inputVectorPtr);
+        inputVectorPtr += 4;
+
+        ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+
+        _mm_store_ps(outputFloatBuffer, ret);
+        *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]);
+        *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]);
+        *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]);
+        *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]);
+    }
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        r = inputVector[number] * scalar;
+        if (r > max_val)
+            r = max_val;
+        else if (r < min_val)
+            r = min_val;
+        outputVector[number] = (int32_t)rintf(r);
+    }
  }
  
  #endif /* LV_HAVE_SSE */
@@ -223,82 +230,85 @@ volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const float* inputVector,
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32f_s32f_convert_32i_generic(int32_t* outputVector, const float* inputVector,
-                                  const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_32i_generic(int32_t* outputVector,
+                                                     const float* inputVector,
+                                                     const float scalar,
+                                                     unsigned int num_points)
  {
-  int32_t* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-  float min_val = INT_MIN;
-  float max_val = INT_MAX;
-  float r;
-
-  for(number = 0; number < num_points; number++){
-    r = *inputVectorPtr++ * scalar;
-    if(r > max_val)
-      r = max_val;
-    else if(r < min_val)
-      r = min_val;
-    *outputVectorPtr++ = (int32_t)rintf(r);
-  }
+    int32_t* outputVectorPtr = outputVector;
+    const float* inputVectorPtr = inputVector;
+    unsigned int number = 0;
+    float min_val = INT_MIN;
+    float max_val = INT_MAX;
+    float r;
+
+    for (number = 0; number < num_points; number++) {
+        r = *inputVectorPtr++ * scalar;
+        if (r > max_val)
+            r = max_val;
+        else if (r < min_val)
+            r = min_val;
+        *outputVectorPtr++ = (int32_t)rintf(r);
+    }
  }
  
  #endif /* LV_HAVE_GENERIC */
  
  
-
  #endif /* INCLUDED_volk_32f_s32f_convert_32i_u_H */
  #ifndef INCLUDED_volk_32f_s32f_convert_32i_a_H
  #define INCLUDED_volk_32f_s32f_convert_32i_a_H
  
-#include <volk/volk_common.h>
  #include <inttypes.h>
  #include <stdio.h>
+#include <volk/volk_common.h>
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const float* inputVector,
-                                const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector,
+                                                   const float* inputVector,
+                                                   const float scalar,
+                                                   unsigned int num_points)
  {
-  unsigned int number = 0;
-
-  const unsigned int eighthPoints = num_points / 8;
-
-  const float* inputVectorPtr = (const float*)inputVector;
-  int32_t* outputVectorPtr = outputVector;
-
-  float min_val = INT_MIN;
-  float max_val = INT_MAX;
-  float r;
-
-  __m256 vScalar = _mm256_set1_ps(scalar);
-  __m256 inputVal1;
-  __m256i intInputVal1;
-  __m256 vmin_val = _mm256_set1_ps(min_val);
-  __m256 vmax_val = _mm256_set1_ps(max_val);
-
-  for(;number < eighthPoints; number++){
-    inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
-
-    inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
-    intInputVal1 = _mm256_cvtps_epi32(inputVal1);
-
-    _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    r = inputVector[number] * scalar;
-    if(r > max_val)
-      r = max_val;
-    else if(r < min_val)
-      r = min_val;
-    outputVector[number] = (int32_t)rintf(r);
-  }
+    unsigned int number = 0;
+
+    const unsigned int eighthPoints = num_points / 8;
+
+    const float* inputVectorPtr = (const float*)inputVector;
+    int32_t* outputVectorPtr = outputVector;
+
+    float min_val = INT_MIN;
+    float max_val = INT_MAX;
+    float r;
+
+    __m256 vScalar = _mm256_set1_ps(scalar);
+    __m256 inputVal1;
+    __m256i intInputVal1;
+    __m256 vmin_val = _mm256_set1_ps(min_val);
+    __m256 vmax_val = _mm256_set1_ps(max_val);
+
+    for (; number < eighthPoints; number++) {
+        inputVal1 = _mm256_load_ps(inputVectorPtr);
+        inputVectorPtr += 8;
+
+        inputVal1 = _mm256_max_ps(
+            _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+        intInputVal1 = _mm256_cvtps_epi32(inputVal1);
+
+        _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
+        outputVectorPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        r = inputVector[number] * scalar;
+        if (r > max_val)
+            r = max_val;
+        else if (r < min_val)
+            r = min_val;
+        outputVector[number] = (int32_t)rintf(r);
+    }
  }
  
  #endif /* LV_HAVE_AVX */
@@ -307,46 +317,49 @@ volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const float* inputVector,
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void
-volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const float* inputVector,
-                                 const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector,
+                                                    const float* inputVector,
+                                                    const float scalar,
+                                                    unsigned int num_points)
  {
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* inputVectorPtr = (const float*)inputVector;
-  int32_t* outputVectorPtr = outputVector;
-
-  float min_val = INT_MIN;
-  float max_val = INT_MAX;
-  float r;
-
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 inputVal1;
-  __m128i intInputVal1;
-  __m128 vmin_val = _mm_set_ps1(min_val);
-  __m128 vmax_val = _mm_set_ps1(max_val);
-
-  for(;number < quarterPoints; number++){
-    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-
-    inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
-    intInputVal1 = _mm_cvtps_epi32(inputVal1);
-
-    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    r = inputVector[number] * scalar;
-    if(r > max_val)
-      r = max_val;
-    else if(r < min_val)
-      r = min_val;
-    outputVector[number] = (int32_t)rintf(r);
-  }
+    unsigned int number = 0;
+
+    const unsigned int quarterPoints = num_points / 4;
+
+    const float* inputVectorPtr = (const float*)inputVector;
+    int32_t* outputVectorPtr = outputVector;
+
+    float min_val = INT_MIN;
+    float max_val = INT_MAX;
+    float r;
+
+    __m128 vScalar = _mm_set_ps1(scalar);
+    __m128 inputVal1;
+    __m128i intInputVal1;
+    __m128 vmin_val = _mm_set_ps1(min_val);
+    __m128 vmax_val = _mm_set_ps1(max_val);
+
+    for (; number < quarterPoints; number++) {
+        inputVal1 = _mm_load_ps(inputVectorPtr);
+        inputVectorPtr += 4;
+
+        inputVal1 =
+            _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+        intInputVal1 = _mm_cvtps_epi32(inputVal1);
+
+        _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+        outputVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        r = inputVector[number] * scalar;
+        if (r > max_val)
+            r = max_val;
+        else if (r < min_val)
+            r = min_val;
+        outputVector[number] = (int32_t)rintf(r);
+    }
  }
  
  #endif /* LV_HAVE_SSE2 */
@@ -355,50 +368,51 @@ volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const float* inputVector
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const float* inputVector,
-                                const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector,
+                                                   const float* inputVector,
+                                                   const float scalar,
+                                                   unsigned int num_points)
  {
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* inputVectorPtr = (const float*)inputVector;
-  int32_t* outputVectorPtr = outputVector;
-
-  float min_val = INT_MIN;
-  float max_val = INT_MAX;
-  float r;
-
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 ret;
-  __m128 vmin_val = _mm_set_ps1(min_val);
-  __m128 vmax_val = _mm_set_ps1(max_val);
-
-  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
-
-  for(;number < quarterPoints; number++){
-    ret = _mm_load_ps(inputVectorPtr);
-    inputVectorPtr += 4;
-
-    ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
-
-    _mm_store_ps(outputFloatBuffer, ret);
-    *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]);
-    *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]);
-    *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]);
-    *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    r = inputVector[number] * scalar;
-    if(r > max_val)
-      r = max_val;
-    else if(r < min_val)
-      r = min_val;
-    outputVector[number] = (int32_t)rintf(r);
-  }
+    unsigned int number = 0;
+
+    const unsigned int quarterPoints = num_points / 4;
+
+    const float* inputVectorPtr = (const float*)inputVector;
+    int32_t* outputVectorPtr = outputVector;
+
+    float min_val = INT_MIN;
+    float max_val = INT_MAX;
+    float r;
+
+    __m128 vScalar = _mm_set_ps1(scalar);
+    __m128 ret;
+    __m128 vmin_val = _mm_set_ps1(min_val);
+    __m128 vmax_val = _mm_set_ps1(max_val);
+
+    __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+    for (; number < quarterPoints; number++) {
+        ret = _mm_load_ps(inputVectorPtr);
+        inputVectorPtr += 4;
+
+        ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+
+        _mm_store_ps(outputFloatBuffer, ret);
+        *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]);
+        *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]);
+        *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]);
+        *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]);
+    }
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        r = inputVector[number] * scalar;
+        if (r > max_val)
+            r = max_val;
+        else if (r < min_val)
+            r = min_val;
+        outputVector[number] = (int32_t)rintf(r);
+    }
  }
  
  #endif /* LV_HAVE_SSE */
@@ -406,25 +420,26 @@ volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const float* inputVector,
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32f_s32f_convert_32i_a_generic(int32_t* outputVector, const float* inputVector,
-                                    const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_32i_a_generic(int32_t* outputVector,
+                                                       const float* inputVector,
+                                                       const float scalar,
+                                                       unsigned int num_points)
  {
-  int32_t* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-  float min_val = INT_MIN;
-  float max_val = INT_MAX;
-  float r;
-
-  for(number = 0; number < num_points; number++){
-    r = *inputVectorPtr++ * scalar;
-    if(r > max_val)
-      r = max_val;
-    else if(r < min_val)
-      r = min_val;
-    *outputVectorPtr++ = (int32_t)rintf(r);
-  }
+    int32_t* outputVectorPtr = outputVector;
+    const float* inputVectorPtr = inputVector;
+    unsigned int number = 0;
+    float min_val = INT_MIN;
+    float max_val = INT_MAX;
+    float r;
+
+    for (number = 0; number < num_points; number++) {
+        r = *inputVectorPtr++ * scalar;
+        if (r > max_val)
+            r = max_val;
+        else if (r < min_val)
+            r = min_val;
+        *outputVectorPtr++ = (int32_t)rintf(r);
+    }
  }
  
  #endif /* LV_HAVE_GENERIC */
diff --git a/kernels/volk/volk_32f_s32f_convert_8i.h b/kernels/volk/volk_32f_s32f_convert_8i.h

index 2a1669cd540e2096517a5f279d6aafd9311402b1..242c3bd358d29a7d54c65fa83d4d1db991ca4d78 100644 (file)
--- a/kernels/volk/volk_32f_s32f_convert_8i.h
+++ b/kernels/volk/volk_32f_s32f_convert_8i.h
@@ -30,7 +30,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32f_s32f_convert_8i(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points)
+ * void volk_32f_s32f_convert_8i(int8_t* outputVector, const float* inputVector, const
+ float scalar, unsigned int num_points)
   * \endcode
   *
   * \b Inputs
@@ -42,7 +43,8 @@
   * \li outputVector: The output vector.
   *
   * \b Example
- * Convert floats from [-1,1] to 16-bit integers with a scale of 5 to maintain smallest delta
+ * Convert floats from [-1,1] to 16-bit integers with a scale of 5 to maintain smallest
+ delta
   *  int N = 10;
   *   unsigned int alignment = volk_get_alignment();
   *   float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
@@ -74,77 +76,86 @@
  #include <inttypes.h>
  #include <stdio.h>
  
-static inline void
-volk_32f_s32f_convert_8i_single(int8_t* out, const float in){
-  float min_val = CHAR_MIN;
-  float max_val = CHAR_MAX;
-  if(in > max_val){
-    *out = (int8_t)(max_val);
-  }else if(in < min_val){
-    *out = (int8_t)(min_val);
-  }else{
-    *out = (int8_t)(rintf(in));
-  }
+static inline void volk_32f_s32f_convert_8i_single(int8_t* out, const float in)
+{
+    float min_val = CHAR_MIN;
+    float max_val = CHAR_MAX;
+    if (in > max_val) {
+        *out = (int8_t)(max_val);
+    } else if (in < min_val) {
+        *out = (int8_t)(min_val);
+    } else {
+        *out = (int8_t)(rintf(in));
+    }
  }
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector, const float* inputVector,
-                                const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector,
+                                                   const float* inputVector,
+                                                   const float scalar,
+                                                   unsigned int num_points)
  {
-  unsigned int number = 0;
-
-  const unsigned int thirtysecondPoints = num_points / 32;
-
-  const float* inputVectorPtr = (const float*)inputVector;
-  int8_t* outputVectorPtr = outputVector;
-
-  float min_val = CHAR_MIN;
-  float max_val = CHAR_MAX;
-  float r;
-
-  __m256 vScalar = _mm256_set1_ps(scalar);
-  __m256 inputVal1, inputVal2, inputVal3, inputVal4;
-  __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
-  __m256 vmin_val = _mm256_set1_ps(min_val);
-  __m256 vmax_val = _mm256_set1_ps(max_val);
-  __m256i intInputVal;
-
-  for(;number < thirtysecondPoints; number++){
-    inputVal1 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
-    inputVal2 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
-    inputVal3 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
-    inputVal4 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
-
-    inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
-    inputVal2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
-    inputVal3 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
-    inputVal4 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
-
-    intInputVal1 = _mm256_cvtps_epi32(inputVal1);
-    intInputVal2 = _mm256_cvtps_epi32(inputVal2);
-    intInputVal3 = _mm256_cvtps_epi32(inputVal3);
-    intInputVal4 = _mm256_cvtps_epi32(inputVal4);
-
-    intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
-    intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
-    intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
-    intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
-
-    intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
-    intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
-
-    _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal);
-    outputVectorPtr += 32;
-  }
-
-  number = thirtysecondPoints * 32;
-  for(; number < num_points; number++){
-    r = inputVector[number] * scalar;
-    volk_32f_s32f_convert_8i_single(&outputVector[number], r);
-  }
+    unsigned int number = 0;
+
+    const unsigned int thirtysecondPoints = num_points / 32;
+
+    const float* inputVectorPtr = (const float*)inputVector;
+    int8_t* outputVectorPtr = outputVector;
+
+    float min_val = CHAR_MIN;
+    float max_val = CHAR_MAX;
+    float r;
+
+    __m256 vScalar = _mm256_set1_ps(scalar);
+    __m256 inputVal1, inputVal2, inputVal3, inputVal4;
+    __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+    __m256 vmin_val = _mm256_set1_ps(min_val);
+    __m256 vmax_val = _mm256_set1_ps(max_val);
+    __m256i intInputVal;
+
+    for (; number < thirtysecondPoints; number++) {
+        inputVal1 = _mm256_loadu_ps(inputVectorPtr);
+        inputVectorPtr += 8;
+        inputVal2 = _mm256_loadu_ps(inputVectorPtr);
+        inputVectorPtr += 8;
+        inputVal3 = _mm256_loadu_ps(inputVectorPtr);
+        inputVectorPtr += 8;
+        inputVal4 = _mm256_loadu_ps(inputVectorPtr);
+        inputVectorPtr += 8;
+
+        inputVal1 = _mm256_max_ps(
+            _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+        inputVal2 = _mm256_max_ps(
+            _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+        inputVal3 = _mm256_max_ps(
+            _mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
+        inputVal4 = _mm256_max_ps(
+            _mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
+
+        intInputVal1 = _mm256_cvtps_epi32(inputVal1);
+        intInputVal2 = _mm256_cvtps_epi32(inputVal2);
+        intInputVal3 = _mm256_cvtps_epi32(inputVal3);
+        intInputVal4 = _mm256_cvtps_epi32(inputVal4);
+
+        intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
+        intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+        intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
+        intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
+
+        intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
+        intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+
+        _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal);
+        outputVectorPtr += 32;
+    }
+
+    number = thirtysecondPoints * 32;
+    for (; number < num_points; number++) {
+        r = inputVector[number] * scalar;
+        volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+    }
  }
  
  #endif /* LV_HAVE_AVX2 */
@@ -153,57 +164,66 @@ volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector, const float* inputVector,
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void
-volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const float* inputVector,
-                                const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector,
+                                                   const float* inputVector,
+                                                   const float scalar,
+                                                   unsigned int num_points)
  {
-  unsigned int number = 0;
-
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  const float* inputVectorPtr = (const float*)inputVector;
-  int8_t* outputVectorPtr = outputVector;
-
-  float min_val = CHAR_MIN;
-  float max_val = CHAR_MAX;
-  float r;
-
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 inputVal1, inputVal2, inputVal3, inputVal4;
-  __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
-  __m128 vmin_val = _mm_set_ps1(min_val);
-  __m128 vmax_val = _mm_set_ps1(max_val);
-
-  for(;number < sixteenthPoints; number++){
-    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-
-    inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
-    inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
-    inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
-    inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
-
-    intInputVal1 = _mm_cvtps_epi32(inputVal1);
-    intInputVal2 = _mm_cvtps_epi32(inputVal2);
-    intInputVal3 = _mm_cvtps_epi32(inputVal3);
-    intInputVal4 = _mm_cvtps_epi32(inputVal4);
-
-    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
-    intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
-
-    intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
-
-    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 16;
-  }
+    unsigned int number = 0;
+
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    const float* inputVectorPtr = (const float*)inputVector;
+    int8_t* outputVectorPtr = outputVector;
+
+    float min_val = CHAR_MIN;
+    float max_val = CHAR_MAX;
+    float r;
+
+    __m128 vScalar = _mm_set_ps1(scalar);
+    __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+    __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+    __m128 vmin_val = _mm_set_ps1(min_val);
+    __m128 vmax_val = _mm_set_ps1(max_val);
+
+    for (; number < sixteenthPoints; number++) {
+        inputVal1 = _mm_loadu_ps(inputVectorPtr);
+        inputVectorPtr += 4;
+        inputVal2 = _mm_loadu_ps(inputVectorPtr);
+        inputVectorPtr += 4;
+        inputVal3 = _mm_loadu_ps(inputVectorPtr);
+        inputVectorPtr += 4;
+        inputVal4 = _mm_loadu_ps(inputVectorPtr);
+        inputVectorPtr += 4;
+
+        inputVal1 =
+            _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+        inputVal2 =
+            _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+        inputVal3 =
+            _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
+        inputVal4 =
+            _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
+
+        intInputVal1 = _mm_cvtps_epi32(inputVal1);
+        intInputVal2 = _mm_cvtps_epi32(inputVal2);
+        intInputVal3 = _mm_cvtps_epi32(inputVal3);
+        intInputVal4 = _mm_cvtps_epi32(inputVal4);
+
+        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+        intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
+
+        intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
+
+        _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+        outputVectorPtr += 16;
+    }
  
-  number = sixteenthPoints * 16;
-  for(; number < num_points; number++){
-    r = inputVector[number] * scalar;
-    volk_32f_s32f_convert_8i_single(&outputVector[number], r);
-  }
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        r = inputVector[number] * scalar;
+        volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+    }
  }
  
  #endif /* LV_HAVE_SSE2 */
@@ -212,46 +232,47 @@ volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const float* inputVector,
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const float* inputVector,
-                               const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector,
+                                                  const float* inputVector,
+                                                  const float scalar,
+                                                  unsigned int num_points)
  {
-  unsigned int number = 0;
-  size_t inner_loop;
+    unsigned int number = 0;
+    size_t inner_loop;
  
-  const unsigned int quarterPoints = num_points / 4;
+    const unsigned int quarterPoints = num_points / 4;
  
-  const float* inputVectorPtr = (const float*)inputVector;
-  int8_t* outputVectorPtr = outputVector;
+    const float* inputVectorPtr = (const float*)inputVector;
+    int8_t* outputVectorPtr = outputVector;
  
-  float min_val = CHAR_MIN;
-  float max_val = CHAR_MAX;
-  float r;
+    float min_val = CHAR_MIN;
+    float max_val = CHAR_MAX;
+    float r;
  
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 ret;
-  __m128 vmin_val = _mm_set_ps1(min_val);
-  __m128 vmax_val = _mm_set_ps1(max_val);
+    __m128 vScalar = _mm_set_ps1(scalar);
+    __m128 ret;
+    __m128 vmin_val = _mm_set_ps1(min_val);
+    __m128 vmax_val = _mm_set_ps1(max_val);
  
-  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+    __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
  
-  for(;number < quarterPoints; number++){
-    ret = _mm_loadu_ps(inputVectorPtr);
-    inputVectorPtr += 4;
+    for (; number < quarterPoints; number++) {
+        ret = _mm_loadu_ps(inputVectorPtr);
+        inputVectorPtr += 4;
  
-    ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+        ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
  
-    _mm_store_ps(outputFloatBuffer, ret);
-    for (inner_loop = 0; inner_loop < 4; inner_loop++){
-      *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
+        _mm_store_ps(outputFloatBuffer, ret);
+        for (inner_loop = 0; inner_loop < 4; inner_loop++) {
+            *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
+        }
      }
-  }
  
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    r = inputVector[number] * scalar;
-    volk_32f_s32f_convert_8i_single(&outputVector[number], r);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        r = inputVector[number] * scalar;
+        volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+    }
  }
  
  #endif /* LV_HAVE_SSE */
@@ -259,18 +280,19 @@ volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const float* inputVector,
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32f_s32f_convert_8i_generic(int8_t* outputVector, const float* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_8i_generic(int8_t* outputVector,
+                                                    const float* inputVector,
+                                                    const float scalar,
+                                                    unsigned int num_points)
  {
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-  float r;
-
-  for(number = 0; number < num_points; number++){
-    r = *inputVectorPtr++ * scalar;
-    volk_32f_s32f_convert_8i_single(&outputVector[number], r);
-  }
+    const float* inputVectorPtr = inputVector;
+    unsigned int number = 0;
+    float r;
+
+    for (number = 0; number < num_points; number++) {
+        r = *inputVectorPtr++ * scalar;
+        volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+    }
  }
  
  #endif /* LV_HAVE_GENERIC */
@@ -280,68 +302,77 @@ volk_32f_s32f_convert_8i_generic(int8_t* outputVector, const float* inputVector,
  #ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H
  #define INCLUDED_volk_32f_s32f_convert_8i_a_H
  
-#include <volk/volk_common.h>
  #include <inttypes.h>
  #include <stdio.h>
+#include <volk/volk_common.h>
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector, const float* inputVector,
-                                const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector,
+                                                   const float* inputVector,
+                                                   const float scalar,
+                                                   unsigned int num_points)
  {
-  unsigned int number = 0;
-
-  const unsigned int thirtysecondPoints = num_points / 32;
-
-  const float* inputVectorPtr = (const float*)inputVector;
-  int8_t* outputVectorPtr = outputVector;
-
-  float min_val = CHAR_MIN;
-  float max_val = CHAR_MAX;
-  float r;
-
-  __m256 vScalar = _mm256_set1_ps(scalar);
-  __m256 inputVal1, inputVal2, inputVal3, inputVal4;
-  __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
-  __m256 vmin_val = _mm256_set1_ps(min_val);
-  __m256 vmax_val = _mm256_set1_ps(max_val);
-  __m256i intInputVal;
-
-  for(;number < thirtysecondPoints; number++){
-    inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
-    inputVal2 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
-    inputVal3 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
-    inputVal4 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
-
-    inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
-    inputVal2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
-    inputVal3 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
-    inputVal4 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
-
-    intInputVal1 = _mm256_cvtps_epi32(inputVal1);
-    intInputVal2 = _mm256_cvtps_epi32(inputVal2);
-    intInputVal3 = _mm256_cvtps_epi32(inputVal3);
-    intInputVal4 = _mm256_cvtps_epi32(inputVal4);
-
-    intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
-    intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
-    intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
-    intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
-
-    intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
-    intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
-
-    _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal);
-    outputVectorPtr += 32;
-  }
-
-  number = thirtysecondPoints * 32;
-  for(; number < num_points; number++){
-    r = inputVector[number] * scalar;
-    volk_32f_s32f_convert_8i_single(&outputVector[number], r);
-  }
+    unsigned int number = 0;
+
+    const unsigned int thirtysecondPoints = num_points / 32;
+
+    const float* inputVectorPtr = (const float*)inputVector;
+    int8_t* outputVectorPtr = outputVector;
+
+    float min_val = CHAR_MIN;
+    float max_val = CHAR_MAX;
+    float r;
+
+    __m256 vScalar = _mm256_set1_ps(scalar);
+    __m256 inputVal1, inputVal2, inputVal3, inputVal4;
+    __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+    __m256 vmin_val = _mm256_set1_ps(min_val);
+    __m256 vmax_val = _mm256_set1_ps(max_val);
+    __m256i intInputVal;
+
+    for (; number < thirtysecondPoints; number++) {
+        inputVal1 = _mm256_load_ps(inputVectorPtr);
+        inputVectorPtr += 8;
+        inputVal2 = _mm256_load_ps(inputVectorPtr);
+        inputVectorPtr += 8;
+        inputVal3 = _mm256_load_ps(inputVectorPtr);
+        inputVectorPtr += 8;
+        inputVal4 = _mm256_load_ps(inputVectorPtr);
+        inputVectorPtr += 8;
+
+        inputVal1 = _mm256_max_ps(
+            _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+        inputVal2 = _mm256_max_ps(
+            _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+        inputVal3 = _mm256_max_ps(
+            _mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
+        inputVal4 = _mm256_max_ps(
+            _mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
+
+        intInputVal1 = _mm256_cvtps_epi32(inputVal1);
+        intInputVal2 = _mm256_cvtps_epi32(inputVal2);
+        intInputVal3 = _mm256_cvtps_epi32(inputVal3);
+        intInputVal4 = _mm256_cvtps_epi32(inputVal4);
+
+        intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
+        intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+        intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
+        intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
+
+        intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
+        intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+
+        _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal);
+        outputVectorPtr += 32;
+    }
+
+    number = thirtysecondPoints * 32;
+    for (; number < num_points; number++) {
+        r = inputVector[number] * scalar;
+        volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+    }
  }
  
  #endif /* LV_HAVE_AVX2 */
@@ -350,57 +381,66 @@ volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector, const float* inputVector,
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void
-volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const float* inputVector,
-                                const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector,
+                                                   const float* inputVector,
+                                                   const float scalar,
+                                                   unsigned int num_points)
  {
-  unsigned int number = 0;
-
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  const float* inputVectorPtr = (const float*)inputVector;
-  int8_t* outputVectorPtr = outputVector;
-
-  float min_val = CHAR_MIN;
-  float max_val = CHAR_MAX;
-  float r;
-
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 inputVal1, inputVal2, inputVal3, inputVal4;
-  __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
-  __m128 vmin_val = _mm_set_ps1(min_val);
-  __m128 vmax_val = _mm_set_ps1(max_val);
-
-  for(;number < sixteenthPoints; number++){
-    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-
-    inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
-    inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
-    inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
-    inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
-
-    intInputVal1 = _mm_cvtps_epi32(inputVal1);
-    intInputVal2 = _mm_cvtps_epi32(inputVal2);
-    intInputVal3 = _mm_cvtps_epi32(inputVal3);
-    intInputVal4 = _mm_cvtps_epi32(inputVal4);
-
-    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
-    intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
-
-    intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
-
-    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 16;
-  }
+    unsigned int number = 0;
+
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    const float* inputVectorPtr = (const float*)inputVector;
+    int8_t* outputVectorPtr = outputVector;
+
+    float min_val = CHAR_MIN;
+    float max_val = CHAR_MAX;
+    float r;
+
+    __m128 vScalar = _mm_set_ps1(scalar);
+    __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+    __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+    __m128 vmin_val = _mm_set_ps1(min_val);
+    __m128 vmax_val = _mm_set_ps1(max_val);
+
+    for (; number < sixteenthPoints; number++) {
+        inputVal1 = _mm_load_ps(inputVectorPtr);
+        inputVectorPtr += 4;
+        inputVal2 = _mm_load_ps(inputVectorPtr);
+        inputVectorPtr += 4;
+        inputVal3 = _mm_load_ps(inputVectorPtr);
+        inputVectorPtr += 4;
+        inputVal4 = _mm_load_ps(inputVectorPtr);
+        inputVectorPtr += 4;
+
+        inputVal1 =
+            _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+        inputVal2 =
+            _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+        inputVal3 =
+            _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
+        inputVal4 =
+            _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
+
+        intInputVal1 = _mm_cvtps_epi32(inputVal1);
+        intInputVal2 = _mm_cvtps_epi32(inputVal2);
+        intInputVal3 = _mm_cvtps_epi32(inputVal3);
+        intInputVal4 = _mm_cvtps_epi32(inputVal4);
+
+        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+        intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
+
+        intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
+
+        _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+        outputVectorPtr += 16;
+    }
  
-  number = sixteenthPoints * 16;
-  for(; number < num_points; number++){
-    r = inputVector[number] * scalar;
-    volk_32f_s32f_convert_8i_single(&outputVector[number], r);
-  }
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        r = inputVector[number] * scalar;
+        volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+    }
  }
  #endif /* LV_HAVE_SSE2 */
  
@@ -408,46 +448,47 @@ volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const float* inputVector,
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const float* inputVector,
-                               const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector,
+                                                  const float* inputVector,
+                                                  const float scalar,
+                                                  unsigned int num_points)
  {
-  unsigned int number = 0;
-  size_t inner_loop;
+    unsigned int number = 0;
+    size_t inner_loop;
  
-  const unsigned int quarterPoints = num_points / 4;
+    const unsigned int quarterPoints = num_points / 4;
  
-  const float* inputVectorPtr = (const float*)inputVector;
+    const float* inputVectorPtr = (const float*)inputVector;
  
-  float min_val = CHAR_MIN;
-  float max_val = CHAR_MAX;
-  float r;
+    float min_val = CHAR_MIN;
+    float max_val = CHAR_MAX;
+    float r;
  
-  int8_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 ret;
-  __m128 vmin_val = _mm_set_ps1(min_val);
-  __m128 vmax_val = _mm_set_ps1(max_val);
+    int8_t* outputVectorPtr = outputVector;
+    __m128 vScalar = _mm_set_ps1(scalar);
+    __m128 ret;
+    __m128 vmin_val = _mm_set_ps1(min_val);
+    __m128 vmax_val = _mm_set_ps1(max_val);
  
-  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+    __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
  
-  for(;number < quarterPoints; number++){
-    ret = _mm_load_ps(inputVectorPtr);
-    inputVectorPtr += 4;
+    for (; number < quarterPoints; number++) {
+        ret = _mm_load_ps(inputVectorPtr);
+        inputVectorPtr += 4;
  
-    ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+        ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
  
-    _mm_store_ps(outputFloatBuffer, ret);
-    for (inner_loop = 0; inner_loop < 4; inner_loop++){
-      *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
+        _mm_store_ps(outputFloatBuffer, ret);
+        for (inner_loop = 0; inner_loop < 4; inner_loop++) {
+            *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
+        }
      }
-  }
  
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    r = inputVector[number] * scalar;
-    volk_32f_s32f_convert_8i_single(&outputVector[number], r);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        r = inputVector[number] * scalar;
+        volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+    }
  }
  
  #endif /* LV_HAVE_SSE */
@@ -455,18 +496,19 @@ volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const float* inputVector,
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector, const float* inputVector,
-                                   const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector,
+                                                      const float* inputVector,
+                                                      const float scalar,
+                                                      unsigned int num_points)
  {
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-  float r;
-
-  for(number = 0; number < num_points; number++){
-    r = *inputVectorPtr++ * scalar;
-    volk_32f_s32f_convert_8i_single(&outputVector[number], r);
-  }
+    const float* inputVectorPtr = inputVector;
+    unsigned int number = 0;
+    float r;
+
+    for (number = 0; number < num_points; number++) {
+        r = *inputVectorPtr++ * scalar;
+        volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+    }
  }
  
  #endif /* LV_HAVE_GENERIC */
diff --git a/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h b/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h

index 6ace77b7340c3686fb00b17c390a236487df0f95..28d7ab5afedddba088b5a6b2955faadaf057ea68 100644 (file)
--- a/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h
+++ b/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h
@@ -4,42 +4,77 @@
  #include <volk/volk_32f_s32f_s32f_mod_range_32f.h>
  
  #ifdef LV_HAVE_GENERIC
-static inline void volk_32f_s32f_mod_rangepuppet_32f_generic(float *output, const float *input, float bound, unsigned int num_points){
-  volk_32f_s32f_s32f_mod_range_32f_generic(output, input, bound-3.141f, bound, num_points);
+static inline void volk_32f_s32f_mod_rangepuppet_32f_generic(float* output,
+                                                             const float* input,
+                                                             float bound,
+                                                             unsigned int num_points)
+{
+    volk_32f_s32f_s32f_mod_range_32f_generic(
+        output, input, bound - 3.141f, bound, num_points);
  }
  #endif
  
  
  #ifdef LV_HAVE_SSE
-static inline void volk_32f_s32f_mod_rangepuppet_32f_u_sse(float *output, const float *input, float bound, unsigned int num_points){
-  volk_32f_s32f_s32f_mod_range_32f_u_sse(output, input, bound-3.141f, bound, num_points);
+static inline void volk_32f_s32f_mod_rangepuppet_32f_u_sse(float* output,
+                                                           const float* input,
+                                                           float bound,
+                                                           unsigned int num_points)
+{
+    volk_32f_s32f_s32f_mod_range_32f_u_sse(
+        output, input, bound - 3.141f, bound, num_points);
  }
  #endif
  #ifdef LV_HAVE_SSE
-static inline void volk_32f_s32f_mod_rangepuppet_32f_a_sse(float *output, const float *input, float bound, unsigned int num_points){
-  volk_32f_s32f_s32f_mod_range_32f_a_sse(output, input, bound-3.141f, bound, num_points);
+static inline void volk_32f_s32f_mod_rangepuppet_32f_a_sse(float* output,
+                                                           const float* input,
+                                                           float bound,
+                                                           unsigned int num_points)
+{
+    volk_32f_s32f_s32f_mod_range_32f_a_sse(
+        output, input, bound - 3.141f, bound, num_points);
  }
  #endif
  
  #ifdef LV_HAVE_SSE2
-static inline void volk_32f_s32f_mod_rangepuppet_32f_u_sse2(float *output, const float *input, float bound, unsigned int num_points){
-  volk_32f_s32f_s32f_mod_range_32f_u_sse2(output, input, bound-3.141f, bound, num_points);
+static inline void volk_32f_s32f_mod_rangepuppet_32f_u_sse2(float* output,
+                                                            const float* input,
+                                                            float bound,
+                                                            unsigned int num_points)
+{
+    volk_32f_s32f_s32f_mod_range_32f_u_sse2(
+        output, input, bound - 3.141f, bound, num_points);
  }
  #endif
  #ifdef LV_HAVE_SSE2
-static inline void volk_32f_s32f_mod_rangepuppet_32f_a_sse2(float *output, const float *input, float bound, unsigned int num_points){
-  volk_32f_s32f_s32f_mod_range_32f_a_sse2(output, input, bound-3.141f, bound, num_points);
+static inline void volk_32f_s32f_mod_rangepuppet_32f_a_sse2(float* output,
+                                                            const float* input,
+                                                            float bound,
+                                                            unsigned int num_points)
+{
+    volk_32f_s32f_s32f_mod_range_32f_a_sse2(
+        output, input, bound - 3.141f, bound, num_points);
  }
  #endif
  
  #ifdef LV_HAVE_AVX
-static inline void volk_32f_s32f_mod_rangepuppet_32f_u_avx(float *output, const float *input, float bound, unsigned int num_points){
-  volk_32f_s32f_s32f_mod_range_32f_u_avx(output, input, bound-3.141f, bound, num_points);
+static inline void volk_32f_s32f_mod_rangepuppet_32f_u_avx(float* output,
+                                                           const float* input,
+                                                           float bound,
+                                                           unsigned int num_points)
+{
+    volk_32f_s32f_s32f_mod_range_32f_u_avx(
+        output, input, bound - 3.141f, bound, num_points);
  }
  #endif
  #ifdef LV_HAVE_AVX
-static inline void volk_32f_s32f_mod_rangepuppet_32f_a_avx(float *output, const float *input, float bound, unsigned int num_points){
-  volk_32f_s32f_s32f_mod_range_32f_a_avx(output, input, bound-3.141f, bound, num_points);
+static inline void volk_32f_s32f_mod_rangepuppet_32f_a_avx(float* output,
+                                                           const float* input,
+                                                           float bound,
+                                                           unsigned int num_points)
+{
+    volk_32f_s32f_s32f_mod_range_32f_a_avx(
+        output, input, bound - 3.141f, bound, num_points);
  }
  #endif
  #endif
diff --git a/kernels/volk/volk_32f_s32f_multiply_32f.h b/kernels/volk/volk_32f_s32f_multiply_32f.h

index 97c7f696ce0f0a03235c11bea98e7870af5402ed..dcc9c6b709430fe058b946bee3c556bd11f369df 100644 (file)
--- a/kernels/volk/volk_32f_s32f_multiply_32f.h
+++ b/kernels/volk/volk_32f_s32f_multiply_32f.h
@@ -29,8 +29,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32f_s32f_multiply_32f(float* cVector, const float* aVector, const float scalar, unsigned int num_points)
- * \endcode
+ * void volk_32f_s32f_multiply_32f(float* cVector, const float* aVector, const float
+ * scalar, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li aVector: The input vector of floats.
@@ -75,84 +75,87 @@
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_32f_s32f_multiply_32f_u_sse(float* cVector, const float* aVector,
-                                 const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector,
+                                                    const float* aVector,
+                                                    const float scalar,
+                                                    unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
  
-  __m128 aVal, bVal, cVal;
-  bVal = _mm_set_ps1(scalar);
-  for(;number < quarterPoints; number++){
-    aVal = _mm_loadu_ps(aPtr);
+    __m128 aVal, bVal, cVal;
+    bVal = _mm_set_ps1(scalar);
+    for (; number < quarterPoints; number++) {
+        aVal = _mm_loadu_ps(aPtr);
  
-    cVal = _mm_mul_ps(aVal, bVal);
+        cVal = _mm_mul_ps(aVal, bVal);
  
-    _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container
+        _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 4;
-    cPtr += 4;
-  }
+        aPtr += 4;
+        cPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) * scalar;
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) * scalar;
+    }
  }
  #endif /* LV_HAVE_SSE */
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32f_s32f_multiply_32f_u_avx(float* cVector, const float* aVector,
-                                 const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector,
+                                                    const float* aVector,
+                                                    const float scalar,
+                                                    unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
  
-  __m256 aVal, bVal, cVal;
-  bVal = _mm256_set1_ps(scalar);
-  for(;number < eighthPoints; number++){
+    __m256 aVal, bVal, cVal;
+    bVal = _mm256_set1_ps(scalar);
+    for (; number < eighthPoints; number++) {
  
-    aVal = _mm256_loadu_ps(aPtr);
+        aVal = _mm256_loadu_ps(aPtr);
  
-    cVal = _mm256_mul_ps(aVal, bVal);
+        cVal = _mm256_mul_ps(aVal, bVal);
  
-    _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
+        _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 8;
-    cPtr += 8;
-  }
+        aPtr += 8;
+        cPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) * scalar;
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) * scalar;
+    }
  }
  #endif /* LV_HAVE_AVX */
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32f_s32f_multiply_32f_generic(float* cVector, const float* aVector,
-                                   const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_multiply_32f_generic(float* cVector,
+                                                      const float* aVector,
+                                                      const float scalar,
+                                                      unsigned int num_points)
  {
-  unsigned int number = 0;
-  const float* inputPtr = aVector;
-  float* outputPtr = cVector;
-  for(number = 0; number < num_points; number++){
-    *outputPtr = (*inputPtr) * scalar;
-    inputPtr++;
-    outputPtr++;
-  }
+    unsigned int number = 0;
+    const float* inputPtr = aVector;
+    float* outputPtr = cVector;
+    for (number = 0; number < num_points; number++) {
+        *outputPtr = (*inputPtr) * scalar;
+        inputPtr++;
+        outputPtr++;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -168,126 +171,132 @@ volk_32f_s32f_multiply_32f_generic(float* cVector, const float* aVector,
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_32f_s32f_multiply_32f_a_sse(float* cVector, const float* aVector,
-                                 const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_multiply_32f_a_sse(float* cVector,
+                                                    const float* aVector,
+                                                    const float scalar,
+                                                    unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
  
-  __m128 aVal, bVal, cVal;
-  bVal = _mm_set_ps1(scalar);
-  for(;number < quarterPoints; number++){
-    aVal = _mm_load_ps(aPtr);
+    __m128 aVal, bVal, cVal;
+    bVal = _mm_set_ps1(scalar);
+    for (; number < quarterPoints; number++) {
+        aVal = _mm_load_ps(aPtr);
  
-    cVal = _mm_mul_ps(aVal, bVal);
+        cVal = _mm_mul_ps(aVal, bVal);
  
-    _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+        _mm_store_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 4;
-    cPtr += 4;
-  }
+        aPtr += 4;
+        cPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) * scalar;
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) * scalar;
+    }
  }
  #endif /* LV_HAVE_SSE */
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32f_s32f_multiply_32f_a_avx(float* cVector, const float* aVector,
-                                 const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector,
+                                                    const float* aVector,
+                                                    const float scalar,
+                                                    unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
  
-  __m256 aVal, bVal, cVal;
-  bVal = _mm256_set1_ps(scalar);
-  for(;number < eighthPoints; number++){
-    aVal = _mm256_load_ps(aPtr);
+    __m256 aVal, bVal, cVal;
+    bVal = _mm256_set1_ps(scalar);
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_load_ps(aPtr);
  
-    cVal = _mm256_mul_ps(aVal, bVal);
+        cVal = _mm256_mul_ps(aVal, bVal);
  
-    _mm256_store_ps(cPtr,cVal); // Store the results back into the C container
+        _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 8;
-    cPtr += 8;
-  }
+        aPtr += 8;
+        cPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) * scalar;
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) * scalar;
+    }
  }
  #endif /* LV_HAVE_AVX */
  
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void
-volk_32f_s32f_multiply_32f_u_neon(float* cVector, const float* aVector,
-                                  const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_multiply_32f_u_neon(float* cVector,
+                                                     const float* aVector,
+                                                     const float scalar,
+                                                     unsigned int num_points)
  {
-  unsigned int number = 0;
-  const float* inputPtr = aVector;
-  float* outputPtr = cVector;
-  const unsigned int quarterPoints = num_points / 4;
-
-  float32x4_t aVal, cVal;
-
-  for(number = 0; number < quarterPoints; number++){
-    aVal = vld1q_f32(inputPtr); // Load into NEON regs
-    cVal = vmulq_n_f32 (aVal, scalar); // Do the multiply
-    vst1q_f32(outputPtr, cVal); // Store results back to output
-    inputPtr += 4;
-    outputPtr += 4;
-  }
-  for(number = quarterPoints * 4; number < num_points; number++){
-    *outputPtr++ = (*inputPtr++) * scalar;
-  }
+    unsigned int number = 0;
+    const float* inputPtr = aVector;
+    float* outputPtr = cVector;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float32x4_t aVal, cVal;
+
+    for (number = 0; number < quarterPoints; number++) {
+        aVal = vld1q_f32(inputPtr);       // Load into NEON regs
+        cVal = vmulq_n_f32(aVal, scalar); // Do the multiply
+        vst1q_f32(outputPtr, cVal);       // Store results back to output
+        inputPtr += 4;
+        outputPtr += 4;
+    }
+    for (number = quarterPoints * 4; number < num_points; number++) {
+        *outputPtr++ = (*inputPtr++) * scalar;
+    }
  }
  #endif /* LV_HAVE_NEON */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32f_s32f_multiply_32f_a_generic(float* cVector, const float* aVector,
-                                     const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_multiply_32f_a_generic(float* cVector,
+                                                        const float* aVector,
+                                                        const float scalar,
+                                                        unsigned int num_points)
  {
-  unsigned int number = 0;
-  const float* inputPtr = aVector;
-  float* outputPtr = cVector;
-  for(number = 0; number < num_points; number++){
-    *outputPtr = (*inputPtr) * scalar;
-    inputPtr++;
-    outputPtr++;
-  }
+    unsigned int number = 0;
+    const float* inputPtr = aVector;
+    float* outputPtr = cVector;
+    for (number = 0; number < num_points; number++) {
+        *outputPtr = (*inputPtr) * scalar;
+        inputPtr++;
+        outputPtr++;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  
  #ifdef LV_HAVE_ORC
  
-extern void
-volk_32f_s32f_multiply_32f_a_orc_impl(float* dst, const float* src,
-                                      const float scalar, unsigned int num_points);
+extern void volk_32f_s32f_multiply_32f_a_orc_impl(float* dst,
+                                                  const float* src,
+                                                  const float scalar,
+                                                  unsigned int num_points);
  
-static inline void
-volk_32f_s32f_multiply_32f_u_orc(float* cVector, const float* aVector,
-                                 const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_multiply_32f_u_orc(float* cVector,
+                                                    const float* aVector,
+                                                    const float scalar,
+                                                    unsigned int num_points)
  {
-  volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points);
+    volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points);
  }
  
  #endif /* LV_HAVE_GENERIC */
diff --git a/kernels/volk/volk_32f_s32f_normalize.h b/kernels/volk/volk_32f_s32f_normalize.h

index 404d534a509f87d962046db6e0a5c05037d7b9ce..0a0549230956deaf98735bb47a82ed72f2b9409d 100644 (file)
--- a/kernels/volk/volk_32f_s32f_normalize.h
+++ b/kernels/volk/volk_32f_s32f_normalize.h
@@ -30,8 +30,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32f_s32f_normalize(float* vecBuffer, const float scalar, unsigned int num_points)
- * \endcode
+ * void volk_32f_s32f_normalize(float* vecBuffer, const float scalar, unsigned int
+ * num_points) \endcode
   *
   * \b Inputs
   * \li vecBuffer: The buffer of values to be vectorized.
@@ -76,84 +76,99 @@
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void volk_32f_s32f_normalize_a_avx(float* vecBuffer, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  float* inputPtr = vecBuffer;
+static inline void volk_32f_s32f_normalize_a_avx(float* vecBuffer,
+                                                 const float scalar,
+                                                 unsigned int num_points)
+{
+    unsigned int number = 0;
+    float* inputPtr = vecBuffer;
  
-  const float invScalar = 1.0 / scalar;
-  __m256 vecScalar = _mm256_set1_ps(invScalar);
+    const float invScalar = 1.0 / scalar;
+    __m256 vecScalar = _mm256_set1_ps(invScalar);
  
-  __m256 input1;
+    __m256 input1;
  
-  const uint64_t eighthPoints = num_points / 8;
-  for(;number < eighthPoints; number++){
+    const uint64_t eighthPoints = num_points / 8;
+    for (; number < eighthPoints; number++) {
  
-    input1 = _mm256_load_ps(inputPtr);
+        input1 = _mm256_load_ps(inputPtr);
  
-    input1 = _mm256_mul_ps(input1, vecScalar);
+        input1 = _mm256_mul_ps(input1, vecScalar);
  
-    _mm256_store_ps(inputPtr, input1);
+        _mm256_store_ps(inputPtr, input1);
  
-    inputPtr += 8;
-  }
+        inputPtr += 8;
+    }
  
-  number = eighthPoints*8;
-  for(; number < num_points; number++){
-    *inputPtr *= invScalar;
-    inputPtr++;
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *inputPtr *= invScalar;
+        inputPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX */
  
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void volk_32f_s32f_normalize_a_sse(float* vecBuffer, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  float* inputPtr = vecBuffer;
+static inline void volk_32f_s32f_normalize_a_sse(float* vecBuffer,
+                                                 const float scalar,
+                                                 unsigned int num_points)
+{
+    unsigned int number = 0;
+    float* inputPtr = vecBuffer;
  
-  const float invScalar = 1.0 / scalar;
-  __m128 vecScalar = _mm_set_ps1(invScalar);
+    const float invScalar = 1.0 / scalar;
+    __m128 vecScalar = _mm_set_ps1(invScalar);
  
-  __m128 input1;
+    __m128 input1;
  
-  const uint64_t quarterPoints = num_points / 4;
-  for(;number < quarterPoints; number++){
+    const uint64_t quarterPoints = num_points / 4;
+    for (; number < quarterPoints; number++) {
  
-    input1 = _mm_load_ps(inputPtr);
+        input1 = _mm_load_ps(inputPtr);
  
-    input1 = _mm_mul_ps(input1, vecScalar);
+        input1 = _mm_mul_ps(input1, vecScalar);
  
-    _mm_store_ps(inputPtr, input1);
+        _mm_store_ps(inputPtr, input1);
  
-    inputPtr += 4;
-  }
+        inputPtr += 4;
+    }
  
-  number = quarterPoints*4;
-  for(; number < num_points; number++){
-    *inputPtr *= invScalar;
-    inputPtr++;
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *inputPtr *= invScalar;
+        inputPtr++;
+    }
  }
  #endif /* LV_HAVE_SSE */
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void volk_32f_s32f_normalize_generic(float* vecBuffer, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  float* inputPtr = vecBuffer;
-  const float invScalar = 1.0 / scalar;
-  for(number = 0; number < num_points; number++){
-    *inputPtr *= invScalar;
-    inputPtr++;
-  }
+static inline void volk_32f_s32f_normalize_generic(float* vecBuffer,
+                                                   const float scalar,
+                                                   unsigned int num_points)
+{
+    unsigned int number = 0;
+    float* inputPtr = vecBuffer;
+    const float invScalar = 1.0 / scalar;
+    for (number = 0; number < num_points; number++) {
+        *inputPtr *= invScalar;
+        inputPtr++;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  #ifdef LV_HAVE_ORC
  
-extern void volk_32f_s32f_normalize_a_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points);
-static inline void volk_32f_s32f_normalize_u_orc(float* vecBuffer, const float scalar, unsigned int num_points){
+extern void volk_32f_s32f_normalize_a_orc_impl(float* dst,
+                                               float* src,
+                                               const float scalar,
+                                               unsigned int num_points);
+static inline void volk_32f_s32f_normalize_u_orc(float* vecBuffer,
+                                                 const float scalar,
+                                                 unsigned int num_points)
+{
      float invscalar = 1.0 / scalar;
      volk_32f_s32f_normalize_a_orc_impl(vecBuffer, vecBuffer, invscalar, num_points);
  }
@@ -169,32 +184,35 @@ static inline void volk_32f_s32f_normalize_u_orc(float* vecBuffer, const float s
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void volk_32f_s32f_normalize_u_avx(float* vecBuffer, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  float* inputPtr = vecBuffer;
+static inline void volk_32f_s32f_normalize_u_avx(float* vecBuffer,
+                                                 const float scalar,
+                                                 unsigned int num_points)
+{
+    unsigned int number = 0;
+    float* inputPtr = vecBuffer;
  
-  const float invScalar = 1.0 / scalar;
-  __m256 vecScalar = _mm256_set1_ps(invScalar);
+    const float invScalar = 1.0 / scalar;
+    __m256 vecScalar = _mm256_set1_ps(invScalar);
  
-  __m256 input1;
+    __m256 input1;
  
-  const uint64_t eighthPoints = num_points / 8;
-  for(;number < eighthPoints; number++){
+    const uint64_t eighthPoints = num_points / 8;
+    for (; number < eighthPoints; number++) {
  
-    input1 = _mm256_loadu_ps(inputPtr);
+        input1 = _mm256_loadu_ps(inputPtr);
  
-    input1 = _mm256_mul_ps(input1, vecScalar);
+        input1 = _mm256_mul_ps(input1, vecScalar);
  
-    _mm256_storeu_ps(inputPtr, input1);
+        _mm256_storeu_ps(inputPtr, input1);
  
-    inputPtr += 8;
-  }
+        inputPtr += 8;
+    }
  
-  number = eighthPoints*8;
-  for(; number < num_points; number++){
-    *inputPtr *= invScalar;
-    inputPtr++;
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *inputPtr *= invScalar;
+        inputPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX */
  
diff --git a/kernels/volk/volk_32f_s32f_power_32f.h b/kernels/volk/volk_32f_s32f_power_32f.h

index 070efdcad3ab507dbb4e69ff582eada7135da45b..9b6fdf40fbfeb5e1f8ab34270c890db4b8cbd8e0 100644 (file)
--- a/kernels/volk/volk_32f_s32f_power_32f.h
+++ b/kernels/volk/volk_32f_s32f_power_32f.h
@@ -30,8 +30,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32f_s32f_power_32f(float* cVector, const float* aVector, const float power, unsigned int num_points)
- * \endcode
+ * void volk_32f_s32f_power_32f(float* cVector, const float* aVector, const float power,
+ * unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li aVector: The input vector of floats.
@@ -72,8 +72,8 @@
  #define INCLUDED_volk_32f_s32f_power_32f_a_H
  
  #include <inttypes.h>
-#include <stdio.h>
  #include <math.h>
+#include <stdio.h>
  
  #ifdef LV_HAVE_SSE4_1
  #include <tmmintrin.h>
@@ -82,49 +82,51 @@
  #include <simdmath.h>
  #endif /* LV_HAVE_LIB_SIMDMATH */
  
-static inline void
-volk_32f_s32f_power_32f_a_sse4_1(float* cVector, const float* aVector,
-                                 const float power, unsigned int num_points)
+static inline void volk_32f_s32f_power_32f_a_sse4_1(float* cVector,
+                                                    const float* aVector,
+                                                    const float power,
+                                                    unsigned int num_points)
  {
-  unsigned int number = 0;
+    unsigned int number = 0;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
  
  #ifdef LV_HAVE_LIB_SIMDMATH
-  const unsigned int quarterPoints = num_points / 4;
-  __m128 vPower = _mm_set_ps1(power);
-  __m128 zeroValue = _mm_setzero_ps();
-  __m128 signMask;
-  __m128 negatedValues;
-  __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
-  __m128 onesMask = _mm_set_ps1(1);
+    const unsigned int quarterPoints = num_points / 4;
+    __m128 vPower = _mm_set_ps1(power);
+    __m128 zeroValue = _mm_setzero_ps();
+    __m128 signMask;
+    __m128 negatedValues;
+    __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
+    __m128 onesMask = _mm_set_ps1(1);
  
-  __m128 aVal, cVal;
-  for(;number < quarterPoints; number++){
+    __m128 aVal, cVal;
+    for (; number < quarterPoints; number++) {
  
-    aVal = _mm_load_ps(aPtr);
-    signMask = _mm_cmplt_ps(aVal, zeroValue);
-    negatedValues = _mm_sub_ps(zeroValue, aVal);
-    aVal = _mm_blendv_ps(aVal, negatedValues, signMask);
+        aVal = _mm_load_ps(aPtr);
+        signMask = _mm_cmplt_ps(aVal, zeroValue);
+        negatedValues = _mm_sub_ps(zeroValue, aVal);
+        aVal = _mm_blendv_ps(aVal, negatedValues, signMask);
  
-    // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after
-    cVal = powf4(aVal, vPower); // Takes each input value to the specified power
+        // powf4 doesn't support negative values in the base, so we mask them off and then
+        // apply the negative after
+        cVal = powf4(aVal, vPower); // Takes each input value to the specified power
  
-    cVal = _mm_mul_ps( _mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal);
+        cVal = _mm_mul_ps(_mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal);
  
-    _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+        _mm_store_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 4;
-    cPtr += 4;
-  }
+        aPtr += 4;
+        cPtr += 4;
+    }
  
-  number = quarterPoints * 4;
+    number = quarterPoints * 4;
  #endif /* LV_HAVE_LIB_SIMDMATH */
  
-  for(;number < num_points; number++){
-    *cPtr++ = powf((*aPtr++), power);
-  }
+    for (; number < num_points; number++) {
+        *cPtr++ = powf((*aPtr++), power);
+    }
  }
  
  #endif /* LV_HAVE_SSE4_1 */
@@ -137,49 +139,54 @@ volk_32f_s32f_power_32f_a_sse4_1(float* cVector, const float* aVector,
  #include <simdmath.h>
  #endif /* LV_HAVE_LIB_SIMDMATH */
  
-static inline void
-volk_32f_s32f_power_32f_a_sse(float* cVector, const float* aVector,
-                              const float power, unsigned int num_points)
+static inline void volk_32f_s32f_power_32f_a_sse(float* cVector,
+                                                 const float* aVector,
+                                                 const float power,
+                                                 unsigned int num_points)
  {
-  unsigned int number = 0;
+    unsigned int number = 0;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
  
  #ifdef LV_HAVE_LIB_SIMDMATH
-  const unsigned int quarterPoints = num_points / 4;
-  __m128 vPower = _mm_set_ps1(power);
-  __m128 zeroValue = _mm_setzero_ps();
-  __m128 signMask;
-  __m128 negatedValues;
-  __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
-  __m128 onesMask = _mm_set_ps1(1);
-
-  __m128 aVal, cVal;
-  for(;number < quarterPoints; number++){
-
-    aVal = _mm_load_ps(aPtr);
-    signMask = _mm_cmplt_ps(aVal, zeroValue);
-    negatedValues = _mm_sub_ps(zeroValue, aVal);
-    aVal = _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues) );
-
-    // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after
-    cVal = powf4(aVal, vPower); // Takes each input value to the specified power
-
-    cVal = _mm_mul_ps( _mm_or_ps( _mm_andnot_ps(signMask, onesMask), _mm_and_ps(signMask, negativeOneToPower) ), cVal);
-
-    _mm_store_ps(cPtr,cVal); // Store the results back into the C container
-
-    aPtr += 4;
-    cPtr += 4;
-  }
-
-  number = quarterPoints * 4;
+    const unsigned int quarterPoints = num_points / 4;
+    __m128 vPower = _mm_set_ps1(power);
+    __m128 zeroValue = _mm_setzero_ps();
+    __m128 signMask;
+    __m128 negatedValues;
+    __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
+    __m128 onesMask = _mm_set_ps1(1);
+
+    __m128 aVal, cVal;
+    for (; number < quarterPoints; number++) {
+
+        aVal = _mm_load_ps(aPtr);
+        signMask = _mm_cmplt_ps(aVal, zeroValue);
+        negatedValues = _mm_sub_ps(zeroValue, aVal);
+        aVal =
+            _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues));
+
+        // powf4 doesn't support negative values in the base, so we mask them off and then
+        // apply the negative after
+        cVal = powf4(aVal, vPower); // Takes each input value to the specified power
+
+        cVal = _mm_mul_ps(_mm_or_ps(_mm_andnot_ps(signMask, onesMask),
+                                    _mm_and_ps(signMask, negativeOneToPower)),
+                          cVal);
+
+        _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+        aPtr += 4;
+        cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
  #endif /* LV_HAVE_LIB_SIMDMATH */
  
-  for(;number < num_points; number++){
-    *cPtr++ = powf((*aPtr++), power);
-  }
+    for (; number < num_points; number++) {
+        *cPtr++ = powf((*aPtr++), power);
+    }
  }
  
  #endif /* LV_HAVE_SSE */
@@ -187,17 +194,18 @@ volk_32f_s32f_power_32f_a_sse(float* cVector, const float* aVector,
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32f_s32f_power_32f_generic(float* cVector, const float* aVector,
-                                const float power, unsigned int num_points)
+static inline void volk_32f_s32f_power_32f_generic(float* cVector,
+                                                   const float* aVector,
+                                                   const float power,
+                                                   unsigned int num_points)
  {
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  unsigned int number = 0;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
  
-  for(number = 0; number < num_points; number++){
-    *cPtr++ = powf((*aPtr++), power);
-  }
+    for (number = 0; number < num_points; number++) {
+        *cPtr++ = powf((*aPtr++), power);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
diff --git a/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h b/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h

index 53b49379d67d1b653c82fd944834209316538dcc..d7f23fe5064e927ee2f8e8917b907a00ba774f77 100644 (file)
--- a/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h
+++ b/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h
@@ -25,8 +25,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32f_s32f_s32f_mod_range_32f(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
- * \endcode
+ * void volk_32f_s32f_s32f_mod_range_32f(float* outputVector, const float* inputVector,
+ * const float lower_bound, const float upper_bound, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li inputVector: The input vector
@@ -46,117 +46,129 @@
  #ifdef LV_HAVE_AVX
  #include <xmmintrin.h>
  
-static inline void volk_32f_s32f_s32f_mod_range_32f_u_avx(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){
-  __m256 lower = _mm256_set1_ps(lower_bound);
-  __m256 upper = _mm256_set1_ps(upper_bound);
-  __m256 distance = _mm256_sub_ps(upper,lower);
-  float dist = upper_bound - lower_bound;
-  __m256 input, output;
-  __m256 is_smaller, is_bigger;
-  __m256 excess, adj;
-
-  const float *inPtr = inputVector;
-  float *outPtr = outputVector;
-  size_t eight_points = num_points / 8;
-  size_t counter;
-  for(counter = 0; counter < eight_points; counter++) {
-    input = _mm256_loadu_ps(inPtr);
-    // calculate mask: input < lower, input > upper
-    is_smaller = _mm256_cmp_ps(input, lower, _CMP_LT_OQ); //0x11: Less than, ordered, non-signalling
-    is_bigger = _mm256_cmp_ps(input, upper, _CMP_GT_OQ); //0x1e: greater than, ordered, non-signalling
-    // find out how far we are out-of-bound – positive values!
-    excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
-    excess = _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
-    // how many do we have to add? (int(excess/distance+1)*distance)
-    excess = _mm256_div_ps(excess, distance);
-    // round down
-    excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
-    // plus 1
-    adj = _mm256_set1_ps(1.0f);
-    excess = _mm256_add_ps(excess, adj);
-    // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
-    adj = _mm256_and_ps(adj, is_smaller);
-    adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
-    // scale by distance, sign
-    excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
-    output = _mm256_add_ps(input, excess);
-    _mm256_storeu_ps(outPtr, output);
-    inPtr += 8;
-    outPtr += 8;
-  }
-
-  size_t cnt;
-  for(cnt = eight_points * 8; cnt < num_points; cnt++){
-    float val = inputVector[cnt];
-    if(val < lower_bound){
-      float excess = lower_bound - val;
-      signed int count = (int)(excess/dist);
-      outputVector[cnt] = val + (count+1)*dist;
+static inline void volk_32f_s32f_s32f_mod_range_32f_u_avx(float* outputVector,
+                                                          const float* inputVector,
+                                                          const float lower_bound,
+                                                          const float upper_bound,
+                                                          unsigned int num_points)
+{
+    __m256 lower = _mm256_set1_ps(lower_bound);
+    __m256 upper = _mm256_set1_ps(upper_bound);
+    __m256 distance = _mm256_sub_ps(upper, lower);
+    float dist = upper_bound - lower_bound;
+    __m256 input, output;
+    __m256 is_smaller, is_bigger;
+    __m256 excess, adj;
+
+    const float* inPtr = inputVector;
+    float* outPtr = outputVector;
+    size_t eight_points = num_points / 8;
+    size_t counter;
+    for (counter = 0; counter < eight_points; counter++) {
+        input = _mm256_loadu_ps(inPtr);
+        // calculate mask: input < lower, input > upper
+        is_smaller = _mm256_cmp_ps(
+            input, lower, _CMP_LT_OQ); // 0x11: Less than, ordered, non-signalling
+        is_bigger = _mm256_cmp_ps(
+            input, upper, _CMP_GT_OQ); // 0x1e: greater than, ordered, non-signalling
+        // find out how far we are out-of-bound – positive values!
+        excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
+        excess =
+            _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
+        // how many do we have to add? (int(excess/distance+1)*distance)
+        excess = _mm256_div_ps(excess, distance);
+        // round down
+        excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
+        // plus 1
+        adj = _mm256_set1_ps(1.0f);
+        excess = _mm256_add_ps(excess, adj);
+        // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
+        adj = _mm256_and_ps(adj, is_smaller);
+        adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
+        // scale by distance, sign
+        excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
+        output = _mm256_add_ps(input, excess);
+        _mm256_storeu_ps(outPtr, output);
+        inPtr += 8;
+        outPtr += 8;
      }
-    else if(val > upper_bound){
-      float excess = val - upper_bound;
-      signed int count = (int)(excess/dist);
-      outputVector[cnt] = val - (count+1)*dist;
+
+    size_t cnt;
+    for (cnt = eight_points * 8; cnt < num_points; cnt++) {
+        float val = inputVector[cnt];
+        if (val < lower_bound) {
+            float excess = lower_bound - val;
+            signed int count = (int)(excess / dist);
+            outputVector[cnt] = val + (count + 1) * dist;
+        } else if (val > upper_bound) {
+            float excess = val - upper_bound;
+            signed int count = (int)(excess / dist);
+            outputVector[cnt] = val - (count + 1) * dist;
+        } else
+            outputVector[cnt] = val;
      }
-    else
-      outputVector[cnt] = val;
-  }
  }
-static inline void volk_32f_s32f_s32f_mod_range_32f_a_avx(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){
-  __m256 lower = _mm256_set1_ps(lower_bound);
-  __m256 upper = _mm256_set1_ps(upper_bound);
-  __m256 distance = _mm256_sub_ps(upper,lower);
-  float dist = upper_bound - lower_bound;
-  __m256 input, output;
-  __m256 is_smaller, is_bigger;
-  __m256 excess, adj;
-
-  const float *inPtr = inputVector;
-  float *outPtr = outputVector;
-  size_t eight_points = num_points / 8;
-  size_t counter;
-  for(counter = 0; counter < eight_points; counter++) {
-    input = _mm256_load_ps(inPtr);
-    // calculate mask: input < lower, input > upper
-    is_smaller = _mm256_cmp_ps(input, lower, _CMP_LT_OQ); //0x11: Less than, ordered, non-signalling
-    is_bigger = _mm256_cmp_ps(input, upper, _CMP_GT_OQ); //0x1e: greater than, ordered, non-signalling
-    // find out how far we are out-of-bound – positive values!
-    excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
-    excess = _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
-    // how many do we have to add? (int(excess/distance+1)*distance)
-    excess = _mm256_div_ps(excess, distance);
-    // round down
-    excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
-    // plus 1
-    adj = _mm256_set1_ps(1.0f);
-    excess = _mm256_add_ps(excess, adj);
-    // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
-    adj = _mm256_and_ps(adj, is_smaller);
-    adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
-    // scale by distance, sign
-    excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
-    output = _mm256_add_ps(input, excess);
-    _mm256_store_ps(outPtr, output);
-    inPtr += 8;
-    outPtr += 8;
-  }
-
-  size_t cnt;
-  for(cnt = eight_points * 8; cnt < num_points; cnt++){
-    float val = inputVector[cnt];
-    if(val < lower_bound){
-      float excess = lower_bound - val;
-      signed int count = (int)(excess/dist);
-      outputVector[cnt] = val + (count+1)*dist;
+static inline void volk_32f_s32f_s32f_mod_range_32f_a_avx(float* outputVector,
+                                                          const float* inputVector,
+                                                          const float lower_bound,
+                                                          const float upper_bound,
+                                                          unsigned int num_points)
+{
+    __m256 lower = _mm256_set1_ps(lower_bound);
+    __m256 upper = _mm256_set1_ps(upper_bound);
+    __m256 distance = _mm256_sub_ps(upper, lower);
+    float dist = upper_bound - lower_bound;
+    __m256 input, output;
+    __m256 is_smaller, is_bigger;
+    __m256 excess, adj;
+
+    const float* inPtr = inputVector;
+    float* outPtr = outputVector;
+    size_t eight_points = num_points / 8;
+    size_t counter;
+    for (counter = 0; counter < eight_points; counter++) {
+        input = _mm256_load_ps(inPtr);
+        // calculate mask: input < lower, input > upper
+        is_smaller = _mm256_cmp_ps(
+            input, lower, _CMP_LT_OQ); // 0x11: Less than, ordered, non-signalling
+        is_bigger = _mm256_cmp_ps(
+            input, upper, _CMP_GT_OQ); // 0x1e: greater than, ordered, non-signalling
+        // find out how far we are out-of-bound – positive values!
+        excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
+        excess =
+            _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
+        // how many do we have to add? (int(excess/distance+1)*distance)
+        excess = _mm256_div_ps(excess, distance);
+        // round down
+        excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
+        // plus 1
+        adj = _mm256_set1_ps(1.0f);
+        excess = _mm256_add_ps(excess, adj);
+        // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
+        adj = _mm256_and_ps(adj, is_smaller);
+        adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
+        // scale by distance, sign
+        excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
+        output = _mm256_add_ps(input, excess);
+        _mm256_store_ps(outPtr, output);
+        inPtr += 8;
+        outPtr += 8;
      }
-    else if(val > upper_bound){
-      float excess = val - upper_bound;
-      signed int count = (int)(excess/dist);
-      outputVector[cnt] = val - (count+1)*dist;
+
+    size_t cnt;
+    for (cnt = eight_points * 8; cnt < num_points; cnt++) {
+        float val = inputVector[cnt];
+        if (val < lower_bound) {
+            float excess = lower_bound - val;
+            signed int count = (int)(excess / dist);
+            outputVector[cnt] = val + (count + 1) * dist;
+        } else if (val > upper_bound) {
+            float excess = val - upper_bound;
+            signed int count = (int)(excess / dist);
+            outputVector[cnt] = val - (count + 1) * dist;
+        } else
+            outputVector[cnt] = val;
      }
-    else
-      outputVector[cnt] = val;
-  }
  }
  #endif /* LV_HAVE_AVX */
  
@@ -164,268 +176,282 @@ static inline void volk_32f_s32f_s32f_mod_range_32f_a_avx(float* outputVector, c
  #ifdef LV_HAVE_SSE2
  #include <xmmintrin.h>
  
-static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){
-  __m128 lower = _mm_set_ps1(lower_bound);
-  __m128 upper = _mm_set_ps1(upper_bound);
-  __m128 distance = _mm_sub_ps(upper,lower);
-  float dist = upper_bound - lower_bound;
-  __m128 input, output;
-  __m128 is_smaller, is_bigger;
-  __m128 excess, adj;
-
-  const float *inPtr = inputVector;
-  float *outPtr = outputVector;
-  size_t quarter_points = num_points / 4;
-  size_t counter;
-  for(counter = 0; counter < quarter_points; counter++) {
-    input = _mm_load_ps(inPtr);
-    // calculate mask: input < lower, input > upper
-    is_smaller = _mm_cmplt_ps(input, lower);
-    is_bigger = _mm_cmpgt_ps(input, upper);
-    // find out how far we are out-of-bound – positive values!
-    excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
-    excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
-    // how many do we have to add? (int(excess/distance+1)*distance)
-    excess = _mm_div_ps(excess, distance);
-    // round down
-    excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
-    // plus 1
-    adj = _mm_set_ps1(1.0f);
-    excess = _mm_add_ps(excess, adj);
-    // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
-    adj = _mm_and_ps(adj, is_smaller);
-    adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
-    // scale by distance, sign
-    excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
-    output = _mm_add_ps(input, excess);
-    _mm_store_ps(outPtr, output);
-    inPtr += 4;
-    outPtr += 4;
-  }
-
-  size_t cnt;
-  for(cnt = quarter_points * 4; cnt < num_points; cnt++){
-    float val = inputVector[cnt];
-    if(val < lower_bound){
-      float excess = lower_bound - val;
-      signed int count = (int)(excess/dist);
-      outputVector[cnt] = val + (count+1)*dist;
+static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float* outputVector,
+                                                           const float* inputVector,
+                                                           const float lower_bound,
+                                                           const float upper_bound,
+                                                           unsigned int num_points)
+{
+    __m128 lower = _mm_set_ps1(lower_bound);
+    __m128 upper = _mm_set_ps1(upper_bound);
+    __m128 distance = _mm_sub_ps(upper, lower);
+    float dist = upper_bound - lower_bound;
+    __m128 input, output;
+    __m128 is_smaller, is_bigger;
+    __m128 excess, adj;
+
+    const float* inPtr = inputVector;
+    float* outPtr = outputVector;
+    size_t quarter_points = num_points / 4;
+    size_t counter;
+    for (counter = 0; counter < quarter_points; counter++) {
+        input = _mm_load_ps(inPtr);
+        // calculate mask: input < lower, input > upper
+        is_smaller = _mm_cmplt_ps(input, lower);
+        is_bigger = _mm_cmpgt_ps(input, upper);
+        // find out how far we are out-of-bound – positive values!
+        excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
+        excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
+        // how many do we have to add? (int(excess/distance+1)*distance)
+        excess = _mm_div_ps(excess, distance);
+        // round down
+        excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
+        // plus 1
+        adj = _mm_set_ps1(1.0f);
+        excess = _mm_add_ps(excess, adj);
+        // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
+        adj = _mm_and_ps(adj, is_smaller);
+        adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
+        // scale by distance, sign
+        excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
+        output = _mm_add_ps(input, excess);
+        _mm_store_ps(outPtr, output);
+        inPtr += 4;
+        outPtr += 4;
      }
-    else if(val > upper_bound){
-      float excess = val - upper_bound;
-      signed int count = (int)(excess/dist);
-      outputVector[cnt] = val - (count+1)*dist;
+
+    size_t cnt;
+    for (cnt = quarter_points * 4; cnt < num_points; cnt++) {
+        float val = inputVector[cnt];
+        if (val < lower_bound) {
+            float excess = lower_bound - val;
+            signed int count = (int)(excess / dist);
+            outputVector[cnt] = val + (count + 1) * dist;
+        } else if (val > upper_bound) {
+            float excess = val - upper_bound;
+            signed int count = (int)(excess / dist);
+            outputVector[cnt] = val - (count + 1) * dist;
+        } else
+            outputVector[cnt] = val;
      }
-    else
-      outputVector[cnt] = val;
-  }
  }
-static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){
-  __m128 lower = _mm_set_ps1(lower_bound);
-  __m128 upper = _mm_set_ps1(upper_bound);
-  __m128 distance = _mm_sub_ps(upper,lower);
-  __m128 input, output;
-  __m128 is_smaller, is_bigger;
-  __m128 excess, adj;
-
-  const float *inPtr = inputVector;
-  float *outPtr = outputVector;
-  size_t quarter_points = num_points / 4;
-  size_t counter;
-  for(counter = 0; counter < quarter_points; counter++) {
-    input = _mm_load_ps(inPtr);
-    // calculate mask: input < lower, input > upper
-    is_smaller = _mm_cmplt_ps(input, lower);
-    is_bigger = _mm_cmpgt_ps(input, upper);
-    // find out how far we are out-of-bound – positive values!
-    excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
-    excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
-    // how many do we have to add? (int(excess/distance+1)*distance)
-    excess = _mm_div_ps(excess, distance);
-    // round down – for some reason, SSE doesn't come with a 4x float -> 4x int32 conversion.
-    excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
-    // plus 1
-    adj = _mm_set_ps1(1.0f);
-    excess = _mm_add_ps(excess, adj);
-    // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
-    adj = _mm_and_ps(adj, is_smaller);
-    adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
-    // scale by distance, sign
-    excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
-    output = _mm_add_ps(input, excess);
-    _mm_store_ps(outPtr, output);
-    inPtr += 4;
-    outPtr += 4;
-  }
-
-  float dist = upper_bound - lower_bound;
-  size_t cnt;
-  for(cnt = quarter_points * 4; cnt < num_points; cnt++){
-    float val = inputVector[cnt];
-    if(val < lower_bound){
-      float excess = lower_bound - val;
-      signed int count = (int)(excess/dist);
-      outputVector[cnt] = val + (count+1)*dist;
+static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float* outputVector,
+                                                           const float* inputVector,
+                                                           const float lower_bound,
+                                                           const float upper_bound,
+                                                           unsigned int num_points)
+{
+    __m128 lower = _mm_set_ps1(lower_bound);
+    __m128 upper = _mm_set_ps1(upper_bound);
+    __m128 distance = _mm_sub_ps(upper, lower);
+    __m128 input, output;
+    __m128 is_smaller, is_bigger;
+    __m128 excess, adj;
+
+    const float* inPtr = inputVector;
+    float* outPtr = outputVector;
+    size_t quarter_points = num_points / 4;
+    size_t counter;
+    for (counter = 0; counter < quarter_points; counter++) {
+        input = _mm_load_ps(inPtr);
+        // calculate mask: input < lower, input > upper
+        is_smaller = _mm_cmplt_ps(input, lower);
+        is_bigger = _mm_cmpgt_ps(input, upper);
+        // find out how far we are out-of-bound – positive values!
+        excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
+        excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
+        // how many do we have to add? (int(excess/distance+1)*distance)
+        excess = _mm_div_ps(excess, distance);
+        // round down – for some reason, SSE doesn't come with a 4x float -> 4x int32
+        // conversion.
+        excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
+        // plus 1
+        adj = _mm_set_ps1(1.0f);
+        excess = _mm_add_ps(excess, adj);
+        // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
+        adj = _mm_and_ps(adj, is_smaller);
+        adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
+        // scale by distance, sign
+        excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
+        output = _mm_add_ps(input, excess);
+        _mm_store_ps(outPtr, output);
+        inPtr += 4;
+        outPtr += 4;
      }
-    else if(val > upper_bound){
-      float excess = val - upper_bound;
-      signed int count = (int)(excess/dist);
-      outputVector[cnt] = val - (count+1)*dist;
+
+    float dist = upper_bound - lower_bound;
+    size_t cnt;
+    for (cnt = quarter_points * 4; cnt < num_points; cnt++) {
+        float val = inputVector[cnt];
+        if (val < lower_bound) {
+            float excess = lower_bound - val;
+            signed int count = (int)(excess / dist);
+            outputVector[cnt] = val + (count + 1) * dist;
+        } else if (val > upper_bound) {
+            float excess = val - upper_bound;
+            signed int count = (int)(excess / dist);
+            outputVector[cnt] = val - (count + 1) * dist;
+        } else
+            outputVector[cnt] = val;
      }
-    else
-      outputVector[cnt] = val;
-  }
  }
  #endif /* LV_HAVE_SSE2 */
  
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){
-  __m128 lower = _mm_set_ps1(lower_bound);
-  __m128 upper = _mm_set_ps1(upper_bound);
-  __m128 distance = _mm_sub_ps(upper,lower);
-  float dist = upper_bound - lower_bound;
-  __m128 input, output;
-  __m128 is_smaller, is_bigger;
-  __m128 excess, adj;
-  __m128i rounddown;
-
-  const float *inPtr = inputVector;
-  float *outPtr = outputVector;
-  size_t quarter_points = num_points / 4;
-  size_t counter;
-  for(counter = 0; counter < quarter_points; counter++) {
-    input = _mm_load_ps(inPtr);
-    // calculate mask: input < lower, input > upper
-    is_smaller = _mm_cmplt_ps(input, lower);
-    is_bigger = _mm_cmpgt_ps(input, upper);
-    // find out how far we are out-of-bound – positive values!
-    excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
-    excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
-    // how many do we have to add? (int(excess/distance+1)*distance)
-    excess = _mm_div_ps(excess, distance);
-    // round down – for some reason
-    rounddown = _mm_cvttps_epi32(excess);
-    excess = _mm_cvtepi32_ps(rounddown);
-    // plus 1
-    adj = _mm_set_ps1(1.0f);
-    excess = _mm_add_ps(excess, adj);
-    // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
-    adj = _mm_and_ps(adj, is_smaller);
-    adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
-    // scale by distance, sign
-    excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
-    output = _mm_add_ps(input, excess);
-    _mm_store_ps(outPtr, output);
-    inPtr += 4;
-    outPtr += 4;
-  }
-
-  size_t cnt;
-  for(cnt = quarter_points * 4; cnt < num_points; cnt++){
-    float val = inputVector[cnt];
-    if(val < lower_bound){
-      float excess = lower_bound - val;
-      signed int count = (int)(excess/dist);
-      outputVector[cnt] = val + (count+1)*dist;
+static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse(float* outputVector,
+                                                          const float* inputVector,
+                                                          const float lower_bound,
+                                                          const float upper_bound,
+                                                          unsigned int num_points)
+{
+    __m128 lower = _mm_set_ps1(lower_bound);
+    __m128 upper = _mm_set_ps1(upper_bound);
+    __m128 distance = _mm_sub_ps(upper, lower);
+    float dist = upper_bound - lower_bound;
+    __m128 input, output;
+    __m128 is_smaller, is_bigger;
+    __m128 excess, adj;
+    __m128i rounddown;
+
+    const float* inPtr = inputVector;
+    float* outPtr = outputVector;
+    size_t quarter_points = num_points / 4;
+    size_t counter;
+    for (counter = 0; counter < quarter_points; counter++) {
+        input = _mm_load_ps(inPtr);
+        // calculate mask: input < lower, input > upper
+        is_smaller = _mm_cmplt_ps(input, lower);
+        is_bigger = _mm_cmpgt_ps(input, upper);
+        // find out how far we are out-of-bound – positive values!
+        excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
+        excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
+        // how many do we have to add? (int(excess/distance+1)*distance)
+        excess = _mm_div_ps(excess, distance);
+        // round down – for some reason
+        rounddown = _mm_cvttps_epi32(excess);
+        excess = _mm_cvtepi32_ps(rounddown);
+        // plus 1
+        adj = _mm_set_ps1(1.0f);
+        excess = _mm_add_ps(excess, adj);
+        // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
+        adj = _mm_and_ps(adj, is_smaller);
+        adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
+        // scale by distance, sign
+        excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
+        output = _mm_add_ps(input, excess);
+        _mm_store_ps(outPtr, output);
+        inPtr += 4;
+        outPtr += 4;
      }
-    else if(val > upper_bound){
-      float excess = val - upper_bound;
-      signed int count = (int)(excess/dist);
-      outputVector[cnt] = val - (count+1)*dist;
+
+    size_t cnt;
+    for (cnt = quarter_points * 4; cnt < num_points; cnt++) {
+        float val = inputVector[cnt];
+        if (val < lower_bound) {
+            float excess = lower_bound - val;
+            signed int count = (int)(excess / dist);
+            outputVector[cnt] = val + (count + 1) * dist;
+        } else if (val > upper_bound) {
+            float excess = val - upper_bound;
+            signed int count = (int)(excess / dist);
+            outputVector[cnt] = val - (count + 1) * dist;
+        } else
+            outputVector[cnt] = val;
      }
-    else
-      outputVector[cnt] = val;
-  }
  }
-static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){
-  __m128 lower = _mm_set_ps1(lower_bound);
-  __m128 upper = _mm_set_ps1(upper_bound);
-  __m128 distance = _mm_sub_ps(upper,lower);
-  __m128 input, output;
-  __m128 is_smaller, is_bigger;
-  __m128 excess, adj;
-  __m128i rounddown;
-
-  const float *inPtr = inputVector;
-  float *outPtr = outputVector;
-  size_t quarter_points = num_points / 4;
-  size_t counter;
-  for(counter = 0; counter < quarter_points; counter++) {
-    input = _mm_load_ps(inPtr);
-    // calculate mask: input < lower, input > upper
-    is_smaller = _mm_cmplt_ps(input, lower);
-    is_bigger = _mm_cmpgt_ps(input, upper);
-    // find out how far we are out-of-bound – positive values!
-    excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
-    excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
-    // how many do we have to add? (int(excess/distance+1)*distance)
-    excess = _mm_div_ps(excess, distance);
-    // round down
-    rounddown = _mm_cvttps_epi32(excess);
-    excess = _mm_cvtepi32_ps(rounddown);
-    // plus 1
-    adj = _mm_set_ps1(1.0f);
-    excess = _mm_add_ps(excess, adj);
-    // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
-    adj = _mm_and_ps(adj, is_smaller);
-    adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
-    // scale by distance, sign
-    excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
-    output = _mm_add_ps(input, excess);
-    _mm_store_ps(outPtr, output);
-    inPtr += 4;
-    outPtr += 4;
-  }
-
-  float dist = upper_bound - lower_bound;
-  size_t cnt;
-  for(cnt = quarter_points * 4; cnt < num_points; cnt++){
-    float val = inputVector[cnt];
-    if(val < lower_bound){
-      float excess = lower_bound - val;
-      signed int count = (int)(excess/dist);
-      outputVector[cnt] = val + (count+1)*dist;
+static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse(float* outputVector,
+                                                          const float* inputVector,
+                                                          const float lower_bound,
+                                                          const float upper_bound,
+                                                          unsigned int num_points)
+{
+    __m128 lower = _mm_set_ps1(lower_bound);
+    __m128 upper = _mm_set_ps1(upper_bound);
+    __m128 distance = _mm_sub_ps(upper, lower);
+    __m128 input, output;
+    __m128 is_smaller, is_bigger;
+    __m128 excess, adj;
+    __m128i rounddown;
+
+    const float* inPtr = inputVector;
+    float* outPtr = outputVector;
+    size_t quarter_points = num_points / 4;
+    size_t counter;
+    for (counter = 0; counter < quarter_points; counter++) {
+        input = _mm_load_ps(inPtr);
+        // calculate mask: input < lower, input > upper
+        is_smaller = _mm_cmplt_ps(input, lower);
+        is_bigger = _mm_cmpgt_ps(input, upper);
+        // find out how far we are out-of-bound – positive values!
+        excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
+        excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
+        // how many do we have to add? (int(excess/distance+1)*distance)
+        excess = _mm_div_ps(excess, distance);
+        // round down
+        rounddown = _mm_cvttps_epi32(excess);
+        excess = _mm_cvtepi32_ps(rounddown);
+        // plus 1
+        adj = _mm_set_ps1(1.0f);
+        excess = _mm_add_ps(excess, adj);
+        // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
+        adj = _mm_and_ps(adj, is_smaller);
+        adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
+        // scale by distance, sign
+        excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
+        output = _mm_add_ps(input, excess);
+        _mm_store_ps(outPtr, output);
+        inPtr += 4;
+        outPtr += 4;
      }
-    else if(val > upper_bound){
-      float excess = val - upper_bound;
-      signed int count = (int)(excess/dist);
-      outputVector[cnt] = val - (count+1)*dist;
+
+    float dist = upper_bound - lower_bound;
+    size_t cnt;
+    for (cnt = quarter_points * 4; cnt < num_points; cnt++) {
+        float val = inputVector[cnt];
+        if (val < lower_bound) {
+            float excess = lower_bound - val;
+            signed int count = (int)(excess / dist);
+            outputVector[cnt] = val + (count + 1) * dist;
+        } else if (val > upper_bound) {
+            float excess = val - upper_bound;
+            signed int count = (int)(excess / dist);
+            outputVector[cnt] = val - (count + 1) * dist;
+        } else
+            outputVector[cnt] = val;
      }
-    else
-      outputVector[cnt] = val;
-  }
  }
  #endif /* LV_HAVE_SSE */
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void volk_32f_s32f_s32f_mod_range_32f_generic(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){
-  float* outPtr = outputVector;
-  const float *inPtr;
-  float distance = upper_bound - lower_bound;
-
-  for(inPtr = inputVector; inPtr < inputVector + num_points; inPtr++){
-    float val = *inPtr;
-    if(val < lower_bound){
-      float excess = lower_bound - val;
-      signed int count = (int)(excess/distance);
-      *outPtr = val + (count+1)*distance;
-    }
-    else if(val > upper_bound){
-      float excess = val - upper_bound;
-      signed int count = (int)(excess/distance);
-      *outPtr = val - (count+1)*distance;
+static inline void volk_32f_s32f_s32f_mod_range_32f_generic(float* outputVector,
+                                                            const float* inputVector,
+                                                            const float lower_bound,
+                                                            const float upper_bound,
+                                                            unsigned int num_points)
+{
+    float* outPtr = outputVector;
+    const float* inPtr;
+    float distance = upper_bound - lower_bound;
+
+    for (inPtr = inputVector; inPtr < inputVector + num_points; inPtr++) {
+        float val = *inPtr;
+        if (val < lower_bound) {
+            float excess = lower_bound - val;
+            signed int count = (int)(excess / distance);
+            *outPtr = val + (count + 1) * distance;
+        } else if (val > upper_bound) {
+            float excess = val - upper_bound;
+            signed int count = (int)(excess / distance);
+            *outPtr = val - (count + 1) * distance;
+        } else
+            *outPtr = val;
+        outPtr++;
      }
-    else
-      *outPtr = val;
-    outPtr++;
-  }
  }
  #endif /* LV_HAVE_GENERIC */
  
  
-
-
  #endif /* INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H */
diff --git a/kernels/volk/volk_32f_s32f_stddev_32f.h b/kernels/volk/volk_32f_s32f_stddev_32f.h

index 4f3dc1c4a77ea30cbabec41f855b98ffb9b55471..0a1c32bf306f9a1fd69b4868e9898cee96c9d336 100644 (file)
--- a/kernels/volk/volk_32f_s32f_stddev_32f.h
+++ b/kernels/volk/volk_32f_s32f_stddev_32f.h
@@ -29,8 +29,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32f_s32f_stddev_32f(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points)
- * \endcode
+ * void volk_32f_s32f_stddev_32f(float* stddev, const float* inputBuffer, const float
+ * mean, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li inputBuffer: The input vector of floats.
@@ -68,65 +68,72 @@
  #ifndef INCLUDED_volk_32f_s32f_stddev_32f_a_H
  #define INCLUDED_volk_32f_s32f_stddev_32f_a_H
  
-#include <volk/volk_common.h>
  #include <inttypes.h>
-#include <stdio.h>
  #include <math.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
  
  #ifdef LV_HAVE_SSE4_1
  #include <smmintrin.h>
  
-static inline void
-volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev, const float* inputBuffer,
-                                  const float mean, unsigned int num_points)
+static inline void volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev,
+                                                     const float* inputBuffer,
+                                                     const float mean,
+                                                     unsigned int num_points)
  {
-  float returnValue = 0;
-  if(num_points > 0){
-    unsigned int number = 0;
-    const unsigned int sixteenthPoints = num_points / 16;
-
-    const float* aPtr = inputBuffer;
-
-    __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
-
-    __m128 squareAccumulator = _mm_setzero_ps();
-    __m128 aVal1, aVal2, aVal3, aVal4;
-    __m128 cVal1, cVal2, cVal3, cVal4;
-    for(;number < sixteenthPoints; number++) {
-      aVal1 = _mm_load_ps(aPtr); aPtr += 4;
-      cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
-
-      aVal2 = _mm_load_ps(aPtr); aPtr += 4;
-      cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
-
-      aVal3 = _mm_load_ps(aPtr); aPtr += 4;
-      cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
-
-      aVal4 = _mm_load_ps(aPtr); aPtr += 4;
-      cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
-
-      cVal1 = _mm_or_ps(cVal1, cVal2);
-      cVal3 = _mm_or_ps(cVal3, cVal4);
-      cVal1 = _mm_or_ps(cVal1, cVal3);
-
-      squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+    float returnValue = 0;
+    if (num_points > 0) {
+        unsigned int number = 0;
+        const unsigned int sixteenthPoints = num_points / 16;
+
+        const float* aPtr = inputBuffer;
+
+        __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
+
+        __m128 squareAccumulator = _mm_setzero_ps();
+        __m128 aVal1, aVal2, aVal3, aVal4;
+        __m128 cVal1, cVal2, cVal3, cVal4;
+        for (; number < sixteenthPoints; number++) {
+            aVal1 = _mm_load_ps(aPtr);
+            aPtr += 4;
+            cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
+
+            aVal2 = _mm_load_ps(aPtr);
+            aPtr += 4;
+            cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
+
+            aVal3 = _mm_load_ps(aPtr);
+            aPtr += 4;
+            cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
+
+            aVal4 = _mm_load_ps(aPtr);
+            aPtr += 4;
+            cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
+
+            cVal1 = _mm_or_ps(cVal1, cVal2);
+            cVal3 = _mm_or_ps(cVal3, cVal4);
+            cVal1 = _mm_or_ps(cVal1, cVal3);
+
+            squareAccumulator =
+                _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+        }
+        _mm_store_ps(squareBuffer,
+                     squareAccumulator); // Store the results back into the C container
+        returnValue = squareBuffer[0];
+        returnValue += squareBuffer[1];
+        returnValue += squareBuffer[2];
+        returnValue += squareBuffer[3];
+
+        number = sixteenthPoints * 16;
+        for (; number < num_points; number++) {
+            returnValue += (*aPtr) * (*aPtr);
+            aPtr++;
+        }
+        returnValue /= num_points;
+        returnValue -= (mean * mean);
+        returnValue = sqrtf(returnValue);
      }
-    _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
-    returnValue = squareBuffer[0];
-    returnValue += squareBuffer[1];
-    returnValue += squareBuffer[2];
-    returnValue += squareBuffer[3];
-
-    number = sixteenthPoints * 16;
-    for(;number < num_points; number++){
-      returnValue += (*aPtr) * (*aPtr);
-      aPtr++;
-    }
-    returnValue /= num_points;
-    returnValue -= (mean * mean);
-    returnValue = sqrtf(returnValue);
-  }
-  *stddev = returnValue;
+    *stddev = returnValue;
  }
  
  #endif /* LV_HAVE_SSE4_1 */
@@ -134,43 +141,45 @@ volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev, const float* inputBuffer,
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_32f_s32f_stddev_32f_a_sse(float* stddev, const float* inputBuffer,
-                               const float mean, unsigned int num_points)
+static inline void volk_32f_s32f_stddev_32f_a_sse(float* stddev,
+                                                  const float* inputBuffer,
+                                                  const float mean,
+                                                  unsigned int num_points)
  {
-  float returnValue = 0;
-  if(num_points > 0){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    const float* aPtr = inputBuffer;
-
-    __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
-
-    __m128 squareAccumulator = _mm_setzero_ps();
-    __m128 aVal = _mm_setzero_ps();
-    for(;number < quarterPoints; number++) {
-      aVal = _mm_load_ps(aPtr);                     // aVal = x
-      aVal = _mm_mul_ps(aVal, aVal);                // squareAccumulator += x^2
-      squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
-      aPtr += 4;
-    }
-    _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
-    returnValue = squareBuffer[0];
-    returnValue += squareBuffer[1];
-    returnValue += squareBuffer[2];
-    returnValue += squareBuffer[3];
-
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      returnValue += (*aPtr) * (*aPtr);
-      aPtr++;
+    float returnValue = 0;
+    if (num_points > 0) {
+        unsigned int number = 0;
+        const unsigned int quarterPoints = num_points / 4;
+
+        const float* aPtr = inputBuffer;
+
+        __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
+
+        __m128 squareAccumulator = _mm_setzero_ps();
+        __m128 aVal = _mm_setzero_ps();
+        for (; number < quarterPoints; number++) {
+            aVal = _mm_load_ps(aPtr);      // aVal = x
+            aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2
+            squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
+            aPtr += 4;
+        }
+        _mm_store_ps(squareBuffer,
+                     squareAccumulator); // Store the results back into the C container
+        returnValue = squareBuffer[0];
+        returnValue += squareBuffer[1];
+        returnValue += squareBuffer[2];
+        returnValue += squareBuffer[3];
+
+        number = quarterPoints * 4;
+        for (; number < num_points; number++) {
+            returnValue += (*aPtr) * (*aPtr);
+            aPtr++;
+        }
+        returnValue /= num_points;
+        returnValue -= (mean * mean);
+        returnValue = sqrtf(returnValue);
      }
-    returnValue /= num_points;
-    returnValue -= (mean * mean);
-    returnValue = sqrtf(returnValue);
-  }
-  *stddev = returnValue;
+    *stddev = returnValue;
  }
  #endif /* LV_HAVE_SSE */
  
@@ -178,86 +187,93 @@ volk_32f_s32f_stddev_32f_a_sse(float* stddev, const float* inputBuffer,
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32f_s32f_stddev_32f_a_avx(float* stddev, const float* inputBuffer,
-                               const float mean, unsigned int num_points)
+static inline void volk_32f_s32f_stddev_32f_a_avx(float* stddev,
+                                                  const float* inputBuffer,
+                                                  const float mean,
+                                                  unsigned int num_points)
  {
-  float stdDev = 0;
-  if(num_points > 0){
-    unsigned int number = 0;
-    const unsigned int thirtySecondthPoints = num_points / 32;
-
-    const float* aPtr = inputBuffer;
-    __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
-
-    __m256 squareAccumulator = _mm256_setzero_ps();
-    __m256 aVal1, aVal2, aVal3, aVal4;
-    __m256 cVal1, cVal2, cVal3, cVal4;
-    for(;number < thirtySecondthPoints; number++) {
-      aVal1 = _mm256_load_ps(aPtr); aPtr += 8;
-      cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
-
-      aVal2 = _mm256_load_ps(aPtr); aPtr += 8;
-      cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
-
-      aVal3 = _mm256_load_ps(aPtr); aPtr += 8;
-      cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
-
-      aVal4 = _mm256_load_ps(aPtr); aPtr += 8;
-      cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
-
-      cVal1 = _mm256_or_ps(cVal1, cVal2);
-      cVal3 = _mm256_or_ps(cVal3, cVal4);
-      cVal1 = _mm256_or_ps(cVal1, cVal3);
-
-      squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+    float stdDev = 0;
+    if (num_points > 0) {
+        unsigned int number = 0;
+        const unsigned int thirtySecondthPoints = num_points / 32;
+
+        const float* aPtr = inputBuffer;
+        __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
+
+        __m256 squareAccumulator = _mm256_setzero_ps();
+        __m256 aVal1, aVal2, aVal3, aVal4;
+        __m256 cVal1, cVal2, cVal3, cVal4;
+        for (; number < thirtySecondthPoints; number++) {
+            aVal1 = _mm256_load_ps(aPtr);
+            aPtr += 8;
+            cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
+
+            aVal2 = _mm256_load_ps(aPtr);
+            aPtr += 8;
+            cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
+
+            aVal3 = _mm256_load_ps(aPtr);
+            aPtr += 8;
+            cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
+
+            aVal4 = _mm256_load_ps(aPtr);
+            aPtr += 8;
+            cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
+
+            cVal1 = _mm256_or_ps(cVal1, cVal2);
+            cVal3 = _mm256_or_ps(cVal3, cVal4);
+            cVal1 = _mm256_or_ps(cVal1, cVal3);
+
+            squareAccumulator =
+                _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+        }
+        _mm256_store_ps(squareBuffer,
+                        squareAccumulator); // Store the results back into the C container
+        stdDev = squareBuffer[0];
+        stdDev += squareBuffer[1];
+        stdDev += squareBuffer[2];
+        stdDev += squareBuffer[3];
+        stdDev += squareBuffer[4];
+        stdDev += squareBuffer[5];
+        stdDev += squareBuffer[6];
+        stdDev += squareBuffer[7];
+
+        number = thirtySecondthPoints * 32;
+        for (; number < num_points; number++) {
+            stdDev += (*aPtr) * (*aPtr);
+            aPtr++;
+        }
+        stdDev /= num_points;
+        stdDev -= (mean * mean);
+        stdDev = sqrtf(stdDev);
      }
-    _mm256_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
-    stdDev = squareBuffer[0];
-    stdDev += squareBuffer[1];
-    stdDev += squareBuffer[2];
-    stdDev += squareBuffer[3];
-    stdDev += squareBuffer[4];
-    stdDev += squareBuffer[5];
-    stdDev += squareBuffer[6];
-    stdDev += squareBuffer[7];
-
-    number = thirtySecondthPoints * 32;
-    for(;number < num_points; number++){
-      stdDev += (*aPtr) * (*aPtr);
-      aPtr++;
-    }
-    stdDev /= num_points;
-    stdDev -= (mean * mean);
-    stdDev = sqrtf(stdDev);
-  }
-  *stddev = stdDev;
-
+    *stddev = stdDev;
  }
  #endif /* LV_HAVE_AVX */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32f_s32f_stddev_32f_generic(float* stddev, const float* inputBuffer,
-                                 const float mean, unsigned int num_points)
+static inline void volk_32f_s32f_stddev_32f_generic(float* stddev,
+                                                    const float* inputBuffer,
+                                                    const float mean,
+                                                    unsigned int num_points)
  {
-  float returnValue = 0;
-  if(num_points > 0){
-    const float* aPtr = inputBuffer;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      returnValue += (*aPtr) * (*aPtr);
-      aPtr++;
+    float returnValue = 0;
+    if (num_points > 0) {
+        const float* aPtr = inputBuffer;
+        unsigned int number = 0;
+
+        for (number = 0; number < num_points; number++) {
+            returnValue += (*aPtr) * (*aPtr);
+            aPtr++;
+        }
+
+        returnValue /= num_points;
+        returnValue -= (mean * mean);
+        returnValue = sqrtf(returnValue);
      }
-
-    returnValue /= num_points;
-    returnValue -= (mean * mean);
-    returnValue = sqrtf(returnValue);
-  }
-  *stddev = returnValue;
+    *stddev = returnValue;
  }
  
  #endif /* LV_HAVE_GENERIC */
@@ -268,69 +284,76 @@ volk_32f_s32f_stddev_32f_generic(float* stddev, const float* inputBuffer,
  #ifndef INCLUDED_volk_32f_s32f_stddev_32f_u_H
  #define INCLUDED_volk_32f_s32f_stddev_32f_u_H
  
-#include <volk/volk_common.h>
  #include <inttypes.h>
-#include <stdio.h>
  #include <math.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32f_s32f_stddev_32f_u_avx(float* stddev, const float* inputBuffer,
-                               const float mean, unsigned int num_points)
+static inline void volk_32f_s32f_stddev_32f_u_avx(float* stddev,
+                                                  const float* inputBuffer,
+                                                  const float mean,
+                                                  unsigned int num_points)
  {
-  float stdDev = 0;
-  if(num_points > 0){
-    unsigned int number = 0;
-    const unsigned int thirtySecondthPoints = num_points / 32;
-
-    const float* aPtr = inputBuffer;
-    __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
-
-    __m256 squareAccumulator = _mm256_setzero_ps();
-    __m256 aVal1, aVal2, aVal3, aVal4;
-    __m256 cVal1, cVal2, cVal3, cVal4;
-    for(;number < thirtySecondthPoints; number++) {
-      aVal1 = _mm256_loadu_ps(aPtr); aPtr += 8;
-      cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
-
-      aVal2 = _mm256_loadu_ps(aPtr); aPtr += 8;
-      cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
-
-      aVal3 = _mm256_loadu_ps(aPtr); aPtr += 8;
-      cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
-
-      aVal4 = _mm256_loadu_ps(aPtr); aPtr += 8;
-      cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
-
-      cVal1 = _mm256_or_ps(cVal1, cVal2);
-      cVal3 = _mm256_or_ps(cVal3, cVal4);
-      cVal1 = _mm256_or_ps(cVal1, cVal3);
-
-      squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+    float stdDev = 0;
+    if (num_points > 0) {
+        unsigned int number = 0;
+        const unsigned int thirtySecondthPoints = num_points / 32;
+
+        const float* aPtr = inputBuffer;
+        __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
+
+        __m256 squareAccumulator = _mm256_setzero_ps();
+        __m256 aVal1, aVal2, aVal3, aVal4;
+        __m256 cVal1, cVal2, cVal3, cVal4;
+        for (; number < thirtySecondthPoints; number++) {
+            aVal1 = _mm256_loadu_ps(aPtr);
+            aPtr += 8;
+            cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
+
+            aVal2 = _mm256_loadu_ps(aPtr);
+            aPtr += 8;
+            cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
+
+            aVal3 = _mm256_loadu_ps(aPtr);
+            aPtr += 8;
+            cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
+
+            aVal4 = _mm256_loadu_ps(aPtr);
+            aPtr += 8;
+            cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
+
+            cVal1 = _mm256_or_ps(cVal1, cVal2);
+            cVal3 = _mm256_or_ps(cVal3, cVal4);
+            cVal1 = _mm256_or_ps(cVal1, cVal3);
+
+            squareAccumulator =
+                _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+        }
+        _mm256_storeu_ps(
+            squareBuffer,
+            squareAccumulator); // Store the results back into the C container
+        stdDev = squareBuffer[0];
+        stdDev += squareBuffer[1];
+        stdDev += squareBuffer[2];
+        stdDev += squareBuffer[3];
+        stdDev += squareBuffer[4];
+        stdDev += squareBuffer[5];
+        stdDev += squareBuffer[6];
+        stdDev += squareBuffer[7];
+
+        number = thirtySecondthPoints * 32;
+        for (; number < num_points; number++) {
+            stdDev += (*aPtr) * (*aPtr);
+            aPtr++;
+        }
+        stdDev /= num_points;
+        stdDev -= (mean * mean);
+        stdDev = sqrtf(stdDev);
      }
-    _mm256_storeu_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
-    stdDev = squareBuffer[0];
-    stdDev += squareBuffer[1];
-    stdDev += squareBuffer[2];
-    stdDev += squareBuffer[3];
-    stdDev += squareBuffer[4];
-    stdDev += squareBuffer[5];
-    stdDev += squareBuffer[6];
-    stdDev += squareBuffer[7];
-
-    number = thirtySecondthPoints * 32;
-    for(;number < num_points; number++){
-      stdDev += (*aPtr) * (*aPtr);
-      aPtr++;
-    }
-    stdDev /= num_points;
-    stdDev -= (mean * mean);
-    stdDev = sqrtf(stdDev);
-  }
-  *stddev = stdDev;
-
+    *stddev = stdDev;
  }
  #endif /* LV_HAVE_AVX */
  
diff --git a/kernels/volk/volk_32f_sin_32f.h b/kernels/volk/volk_32f_sin_32f.h

index 37800868a8711e8064b4c3b00589a57d8042ef09..e65f25a739d5b88bdfc6403be08f38369f0d905e 100644 (file)
--- a/kernels/volk/volk_32f_sin_32f.h
+++ b/kernels/volk/volk_32f_sin_32f.h
@@ -69,9 +69,9 @@
   * \endcode
   */
  
-#include <stdio.h>
-#include <math.h>
  #include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
  
  #ifndef INCLUDED_volk_32f_sin_32f_a_H
  #define INCLUDED_volk_32f_sin_32f_a_H
@@ -83,72 +83,93 @@
  static inline void
  volk_32f_sin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int eighthPoints = num_points / 8;
-  unsigned int i = 0;
-
-  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
-  __m256 sine, cosine, condition1, condition2;
-  __m256i q, r, ones, twos, fours;
-
-  m4pi = _mm256_set1_ps(1.273239545);
-  pio4A = _mm256_set1_ps(0.78515625);
-  pio4B = _mm256_set1_ps(0.241876e-3);
-  ffours = _mm256_set1_ps(4.0);
-  ftwos = _mm256_set1_ps(2.0);
-  fones = _mm256_set1_ps(1.0);
-  fzeroes = _mm256_setzero_ps();
-  ones = _mm256_set1_epi32(1);
-  twos = _mm256_set1_epi32(2);
-  fours = _mm256_set1_epi32(4);
-
-  cp1 = _mm256_set1_ps(1.0);
-  cp2 = _mm256_set1_ps(0.83333333e-1);
-  cp3 = _mm256_set1_ps(0.2777778e-2);
-  cp4 = _mm256_set1_ps(0.49603e-4);
-  cp5 = _mm256_set1_ps(0.551e-6);
-
-  for(;number < eighthPoints; number++) {
-    aVal = _mm256_load_ps(aPtr);
-    s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
-    q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
-    r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
-
-    s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
-    s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
-
-    s = _mm256_div_ps(s, _mm256_set1_ps(8.0));    // The constant is 2^N, for 3 times argument reduction
-    s = _mm256_mul_ps(s, s);
-    // Evaluate Taylor series
-    s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s);
-
-    for(i = 0; i < 3; i++) {
-      s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int eighthPoints = num_points / 8;
+    unsigned int i = 0;
+
+    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+        fzeroes;
+    __m256 sine, cosine, condition1, condition2;
+    __m256i q, r, ones, twos, fours;
+
+    m4pi = _mm256_set1_ps(1.273239545);
+    pio4A = _mm256_set1_ps(0.78515625);
+    pio4B = _mm256_set1_ps(0.241876e-3);
+    ffours = _mm256_set1_ps(4.0);
+    ftwos = _mm256_set1_ps(2.0);
+    fones = _mm256_set1_ps(1.0);
+    fzeroes = _mm256_setzero_ps();
+    ones = _mm256_set1_epi32(1);
+    twos = _mm256_set1_epi32(2);
+    fours = _mm256_set1_epi32(4);
+
+    cp1 = _mm256_set1_ps(1.0);
+    cp2 = _mm256_set1_ps(0.83333333e-1);
+    cp3 = _mm256_set1_ps(0.2777778e-2);
+    cp4 = _mm256_set1_ps(0.49603e-4);
+    cp5 = _mm256_set1_ps(0.551e-6);
+
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_load_ps(aPtr);
+        s = _mm256_sub_ps(aVal,
+                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+
+        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
+        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
+
+        s = _mm256_div_ps(
+            s,
+            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+        s = _mm256_mul_ps(s, s);
+        // Evaluate Taylor series
+        s = _mm256_mul_ps(
+            _mm256_fmadd_ps(
+                _mm256_fmsub_ps(
+                    _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
+                s,
+                cp1),
+            s);
+
+        for (i = 0; i < 3; i++) {
+            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+        }
+        s = _mm256_div_ps(s, ftwos);
+
+        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+        cosine = _mm256_sub_ps(fones, s);
+
+        condition1 = _mm256_cmp_ps(
+            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
+            fzeroes,
+            _CMP_NEQ_UQ);
+        condition2 = _mm256_cmp_ps(
+            _mm256_cmp_ps(
+                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
+            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
+            _CMP_NEQ_UQ);
+        // Need this condition only for cos
+        // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
+        // twos), fours)), fzeroes);
+
+        sine =
+            _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
+        sine = _mm256_sub_ps(
+            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+        _mm256_store_ps(bPtr, sine);
+        aPtr += 8;
+        bPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *bPtr++ = sin(*aPtr++);
      }
-    s = _mm256_div_ps(s, ftwos);
-
-    sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
-    cosine = _mm256_sub_ps(fones, s);
-
-    condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ);
-    condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ);
-    // Need this condition only for cos
-    //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
-
-    sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
-    sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
-    _mm256_store_ps(bPtr, sine);
-    aPtr += 8;
-    bPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++) {
-    *bPtr++ = sin(*aPtr++);
-  }
  }
  
  #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
@@ -159,72 +180,100 @@ volk_32f_sin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int n
  static inline void
  volk_32f_sin_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int eighthPoints = num_points / 8;
-  unsigned int i = 0;
-
-  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
-  __m256 sine, cosine, condition1, condition2;
-  __m256i q, r, ones, twos, fours;
-
-  m4pi = _mm256_set1_ps(1.273239545);
-  pio4A = _mm256_set1_ps(0.78515625);
-  pio4B = _mm256_set1_ps(0.241876e-3);
-  ffours = _mm256_set1_ps(4.0);
-  ftwos = _mm256_set1_ps(2.0);
-  fones = _mm256_set1_ps(1.0);
-  fzeroes = _mm256_setzero_ps();
-  ones = _mm256_set1_epi32(1);
-  twos = _mm256_set1_epi32(2);
-  fours = _mm256_set1_epi32(4);
-
-  cp1 = _mm256_set1_ps(1.0);
-  cp2 = _mm256_set1_ps(0.83333333e-1);
-  cp3 = _mm256_set1_ps(0.2777778e-2);
-  cp4 = _mm256_set1_ps(0.49603e-4);
-  cp5 = _mm256_set1_ps(0.551e-6);
-
-  for(;number < eighthPoints; number++) {
-    aVal = _mm256_load_ps(aPtr);
-    s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
-    q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
-    r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
-
-    s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
-    s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
-
-    s = _mm256_div_ps(s, _mm256_set1_ps(8.0));    // The constant is 2^N, for 3 times argument reduction
-    s = _mm256_mul_ps(s, s);
-    // Evaluate Taylor series
-    s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-
-    for(i = 0; i < 3; i++) {
-      s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int eighthPoints = num_points / 8;
+    unsigned int i = 0;
+
+    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+        fzeroes;
+    __m256 sine, cosine, condition1, condition2;
+    __m256i q, r, ones, twos, fours;
+
+    m4pi = _mm256_set1_ps(1.273239545);
+    pio4A = _mm256_set1_ps(0.78515625);
+    pio4B = _mm256_set1_ps(0.241876e-3);
+    ffours = _mm256_set1_ps(4.0);
+    ftwos = _mm256_set1_ps(2.0);
+    fones = _mm256_set1_ps(1.0);
+    fzeroes = _mm256_setzero_ps();
+    ones = _mm256_set1_epi32(1);
+    twos = _mm256_set1_epi32(2);
+    fours = _mm256_set1_epi32(4);
+
+    cp1 = _mm256_set1_ps(1.0);
+    cp2 = _mm256_set1_ps(0.83333333e-1);
+    cp3 = _mm256_set1_ps(0.2777778e-2);
+    cp4 = _mm256_set1_ps(0.49603e-4);
+    cp5 = _mm256_set1_ps(0.551e-6);
+
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_load_ps(aPtr);
+        s = _mm256_sub_ps(aVal,
+                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+
+        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
+        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
+
+        s = _mm256_div_ps(
+            s,
+            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+        s = _mm256_mul_ps(s, s);
+        // Evaluate Taylor series
+        s = _mm256_mul_ps(
+            _mm256_add_ps(
+                _mm256_mul_ps(
+                    _mm256_sub_ps(
+                        _mm256_mul_ps(
+                            _mm256_add_ps(
+                                _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
+                                              s),
+                                cp3),
+                            s),
+                        cp2),
+                    s),
+                cp1),
+            s);
+
+        for (i = 0; i < 3; i++) {
+            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+        }
+        s = _mm256_div_ps(s, ftwos);
+
+        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+        cosine = _mm256_sub_ps(fones, s);
+
+        condition1 = _mm256_cmp_ps(
+            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
+            fzeroes,
+            _CMP_NEQ_UQ);
+        condition2 = _mm256_cmp_ps(
+            _mm256_cmp_ps(
+                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
+            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
+            _CMP_NEQ_UQ);
+        // Need this condition only for cos
+        // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
+        // twos), fours)), fzeroes);
+
+        sine =
+            _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
+        sine = _mm256_sub_ps(
+            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+        _mm256_store_ps(bPtr, sine);
+        aPtr += 8;
+        bPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *bPtr++ = sin(*aPtr++);
      }
-    s = _mm256_div_ps(s, ftwos);
-
-    sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
-    cosine = _mm256_sub_ps(fones, s);
-
-    condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ);
-    condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ);
-    // Need this condition only for cos
-    //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
-
-    sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
-    sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
-    _mm256_store_ps(bPtr, sine);
-    aPtr += 8;
-    bPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++) {
-    *bPtr++ = sin(*aPtr++);
-  }
  }
  
  #endif /* LV_HAVE_AVX2 for aligned */
@@ -235,72 +284,91 @@ volk_32f_sin_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_p
  static inline void
  volk_32f_sin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int quarterPoints = num_points / 4;
-  unsigned int i = 0;
-
-  __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
-  __m128 sine, cosine, condition1, condition2;
-  __m128i q, r, ones, twos, fours;
-
-  m4pi = _mm_set1_ps(1.273239545);
-  pio4A = _mm_set1_ps(0.78515625);
-  pio4B = _mm_set1_ps(0.241876e-3);
-  ffours = _mm_set1_ps(4.0);
-  ftwos = _mm_set1_ps(2.0);
-  fones = _mm_set1_ps(1.0);
-  fzeroes = _mm_setzero_ps();
-  ones = _mm_set1_epi32(1);
-  twos = _mm_set1_epi32(2);
-  fours = _mm_set1_epi32(4);
-
-  cp1 = _mm_set1_ps(1.0);
-  cp2 = _mm_set1_ps(0.83333333e-1);
-  cp3 = _mm_set1_ps(0.2777778e-2);
-  cp4 = _mm_set1_ps(0.49603e-4);
-  cp5 = _mm_set1_ps(0.551e-6);
-
-  for(;number < quarterPoints; number++) {
-    aVal = _mm_load_ps(aPtr);
-    s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
-    q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
-    r = _mm_add_epi32(q, _mm_and_si128(q, ones));
-
-    s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
-    s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
-
-    s = _mm_div_ps(s, _mm_set1_ps(8.0));    // The constant is 2^N, for 3 times argument reduction
-    s = _mm_mul_ps(s, s);
-    // Evaluate Taylor series
-    s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-
-    for(i = 0; i < 3; i++) {
-      s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int quarterPoints = num_points / 4;
+    unsigned int i = 0;
+
+    __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+        fzeroes;
+    __m128 sine, cosine, condition1, condition2;
+    __m128i q, r, ones, twos, fours;
+
+    m4pi = _mm_set1_ps(1.273239545);
+    pio4A = _mm_set1_ps(0.78515625);
+    pio4B = _mm_set1_ps(0.241876e-3);
+    ffours = _mm_set1_ps(4.0);
+    ftwos = _mm_set1_ps(2.0);
+    fones = _mm_set1_ps(1.0);
+    fzeroes = _mm_setzero_ps();
+    ones = _mm_set1_epi32(1);
+    twos = _mm_set1_epi32(2);
+    fours = _mm_set1_epi32(4);
+
+    cp1 = _mm_set1_ps(1.0);
+    cp2 = _mm_set1_ps(0.83333333e-1);
+    cp3 = _mm_set1_ps(0.2777778e-2);
+    cp4 = _mm_set1_ps(0.49603e-4);
+    cp5 = _mm_set1_ps(0.551e-6);
+
+    for (; number < quarterPoints; number++) {
+        aVal = _mm_load_ps(aPtr);
+        s = _mm_sub_ps(aVal,
+                       _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
+        q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
+        r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+        s = _mm_div_ps(
+            s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+        s = _mm_mul_ps(s, s);
+        // Evaluate Taylor series
+        s = _mm_mul_ps(
+            _mm_add_ps(
+                _mm_mul_ps(
+                    _mm_sub_ps(
+                        _mm_mul_ps(
+                            _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
+                                       cp3),
+                            s),
+                        cp2),
+                    s),
+                cp1),
+            s);
+
+        for (i = 0; i < 3; i++) {
+            s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+        }
+        s = _mm_div_ps(s, ftwos);
+
+        sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+        cosine = _mm_sub_ps(fones, s);
+
+        condition1 = _mm_cmpneq_ps(
+            _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
+        condition2 = _mm_cmpneq_ps(
+            _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
+            _mm_cmplt_ps(aVal, fzeroes));
+        // Need this condition only for cos
+        // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
+        // twos), fours)), fzeroes);
+
+        sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
+        sine =
+            _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
+        _mm_store_ps(bPtr, sine);
+        aPtr += 4;
+        bPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *bPtr++ = sinf(*aPtr++);
      }
-    s = _mm_div_ps(s, ftwos);
-
-    sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
-    cosine = _mm_sub_ps(fones, s);
-
-    condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
-    condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
-    // Need this condition only for cos
-    //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
-
-    sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
-    sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
-    _mm_store_ps(bPtr, sine);
-    aPtr += 4;
-    bPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(;number < num_points; number++) {
-    *bPtr++ = sinf(*aPtr++);
-  }
  }
  
  #endif /* LV_HAVE_SSE4_1 for aligned */
@@ -317,72 +385,93 @@ volk_32f_sin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num
  static inline void
  volk_32f_sin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int eighthPoints = num_points / 8;
-  unsigned int i = 0;
-
-  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
-  __m256 sine, cosine, condition1, condition2;
-  __m256i q, r, ones, twos, fours;
-
-  m4pi = _mm256_set1_ps(1.273239545);
-  pio4A = _mm256_set1_ps(0.78515625);
-  pio4B = _mm256_set1_ps(0.241876e-3);
-  ffours = _mm256_set1_ps(4.0);
-  ftwos = _mm256_set1_ps(2.0);
-  fones = _mm256_set1_ps(1.0);
-  fzeroes = _mm256_setzero_ps();
-  ones = _mm256_set1_epi32(1);
-  twos = _mm256_set1_epi32(2);
-  fours = _mm256_set1_epi32(4);
-
-  cp1 = _mm256_set1_ps(1.0);
-  cp2 = _mm256_set1_ps(0.83333333e-1);
-  cp3 = _mm256_set1_ps(0.2777778e-2);
-  cp4 = _mm256_set1_ps(0.49603e-4);
-  cp5 = _mm256_set1_ps(0.551e-6);
-
-  for(;number < eighthPoints; number++) {
-    aVal = _mm256_loadu_ps(aPtr);
-    s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
-    q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
-    r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
-
-    s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
-    s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
-
-    s = _mm256_div_ps(s, _mm256_set1_ps(8.0));    // The constant is 2^N, for 3 times argument reduction
-    s = _mm256_mul_ps(s, s);
-    // Evaluate Taylor series
-    s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s);
-
-    for(i = 0; i < 3; i++) {
-      s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int eighthPoints = num_points / 8;
+    unsigned int i = 0;
+
+    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+        fzeroes;
+    __m256 sine, cosine, condition1, condition2;
+    __m256i q, r, ones, twos, fours;
+
+    m4pi = _mm256_set1_ps(1.273239545);
+    pio4A = _mm256_set1_ps(0.78515625);
+    pio4B = _mm256_set1_ps(0.241876e-3);
+    ffours = _mm256_set1_ps(4.0);
+    ftwos = _mm256_set1_ps(2.0);
+    fones = _mm256_set1_ps(1.0);
+    fzeroes = _mm256_setzero_ps();
+    ones = _mm256_set1_epi32(1);
+    twos = _mm256_set1_epi32(2);
+    fours = _mm256_set1_epi32(4);
+
+    cp1 = _mm256_set1_ps(1.0);
+    cp2 = _mm256_set1_ps(0.83333333e-1);
+    cp3 = _mm256_set1_ps(0.2777778e-2);
+    cp4 = _mm256_set1_ps(0.49603e-4);
+    cp5 = _mm256_set1_ps(0.551e-6);
+
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_loadu_ps(aPtr);
+        s = _mm256_sub_ps(aVal,
+                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+
+        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
+        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
+
+        s = _mm256_div_ps(
+            s,
+            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+        s = _mm256_mul_ps(s, s);
+        // Evaluate Taylor series
+        s = _mm256_mul_ps(
+            _mm256_fmadd_ps(
+                _mm256_fmsub_ps(
+                    _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
+                s,
+                cp1),
+            s);
+
+        for (i = 0; i < 3; i++) {
+            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+        }
+        s = _mm256_div_ps(s, ftwos);
+
+        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+        cosine = _mm256_sub_ps(fones, s);
+
+        condition1 = _mm256_cmp_ps(
+            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
+            fzeroes,
+            _CMP_NEQ_UQ);
+        condition2 = _mm256_cmp_ps(
+            _mm256_cmp_ps(
+                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
+            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
+            _CMP_NEQ_UQ);
+        // Need this condition only for cos
+        // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
+        // twos), fours)), fzeroes);
+
+        sine =
+            _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
+        sine = _mm256_sub_ps(
+            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+        _mm256_storeu_ps(bPtr, sine);
+        aPtr += 8;
+        bPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *bPtr++ = sin(*aPtr++);
      }
-    s = _mm256_div_ps(s, ftwos);
-
-    sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
-    cosine = _mm256_sub_ps(fones, s);
-
-    condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ);
-    condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ);
-    // Need this condition only for cos
-    //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
-
-    sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
-    sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
-    _mm256_storeu_ps(bPtr, sine);
-    aPtr += 8;
-    bPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++) {
-    *bPtr++ = sin(*aPtr++);
-  }
  }
  
  #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
@@ -393,72 +482,100 @@ volk_32f_sin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int n
  static inline void
  volk_32f_sin_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int eighthPoints = num_points / 8;
-  unsigned int i = 0;
-
-  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
-  __m256 sine, cosine, condition1, condition2;
-  __m256i q, r, ones, twos, fours;
-
-  m4pi = _mm256_set1_ps(1.273239545);
-  pio4A = _mm256_set1_ps(0.78515625);
-  pio4B = _mm256_set1_ps(0.241876e-3);
-  ffours = _mm256_set1_ps(4.0);
-  ftwos = _mm256_set1_ps(2.0);
-  fones = _mm256_set1_ps(1.0);
-  fzeroes = _mm256_setzero_ps();
-  ones = _mm256_set1_epi32(1);
-  twos = _mm256_set1_epi32(2);
-  fours = _mm256_set1_epi32(4);
-
-  cp1 = _mm256_set1_ps(1.0);
-  cp2 = _mm256_set1_ps(0.83333333e-1);
-  cp3 = _mm256_set1_ps(0.2777778e-2);
-  cp4 = _mm256_set1_ps(0.49603e-4);
-  cp5 = _mm256_set1_ps(0.551e-6);
-
-  for(;number < eighthPoints; number++) {
-    aVal = _mm256_loadu_ps(aPtr);
-    s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
-    q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
-    r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
-
-    s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
-    s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
-
-    s = _mm256_div_ps(s, _mm256_set1_ps(8.0));    // The constant is 2^N, for 3 times argument reduction
-    s = _mm256_mul_ps(s, s);
-    // Evaluate Taylor series
-    s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-
-    for(i = 0; i < 3; i++) {
-      s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int eighthPoints = num_points / 8;
+    unsigned int i = 0;
+
+    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+        fzeroes;
+    __m256 sine, cosine, condition1, condition2;
+    __m256i q, r, ones, twos, fours;
+
+    m4pi = _mm256_set1_ps(1.273239545);
+    pio4A = _mm256_set1_ps(0.78515625);
+    pio4B = _mm256_set1_ps(0.241876e-3);
+    ffours = _mm256_set1_ps(4.0);
+    ftwos = _mm256_set1_ps(2.0);
+    fones = _mm256_set1_ps(1.0);
+    fzeroes = _mm256_setzero_ps();
+    ones = _mm256_set1_epi32(1);
+    twos = _mm256_set1_epi32(2);
+    fours = _mm256_set1_epi32(4);
+
+    cp1 = _mm256_set1_ps(1.0);
+    cp2 = _mm256_set1_ps(0.83333333e-1);
+    cp3 = _mm256_set1_ps(0.2777778e-2);
+    cp4 = _mm256_set1_ps(0.49603e-4);
+    cp5 = _mm256_set1_ps(0.551e-6);
+
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_loadu_ps(aPtr);
+        s = _mm256_sub_ps(aVal,
+                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+
+        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
+        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
+
+        s = _mm256_div_ps(
+            s,
+            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+        s = _mm256_mul_ps(s, s);
+        // Evaluate Taylor series
+        s = _mm256_mul_ps(
+            _mm256_add_ps(
+                _mm256_mul_ps(
+                    _mm256_sub_ps(
+                        _mm256_mul_ps(
+                            _mm256_add_ps(
+                                _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
+                                              s),
+                                cp3),
+                            s),
+                        cp2),
+                    s),
+                cp1),
+            s);
+
+        for (i = 0; i < 3; i++) {
+            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+        }
+        s = _mm256_div_ps(s, ftwos);
+
+        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+        cosine = _mm256_sub_ps(fones, s);
+
+        condition1 = _mm256_cmp_ps(
+            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
+            fzeroes,
+            _CMP_NEQ_UQ);
+        condition2 = _mm256_cmp_ps(
+            _mm256_cmp_ps(
+                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
+            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
+            _CMP_NEQ_UQ);
+        // Need this condition only for cos
+        // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
+        // twos), fours)), fzeroes);
+
+        sine =
+            _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
+        sine = _mm256_sub_ps(
+            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+        _mm256_storeu_ps(bPtr, sine);
+        aPtr += 8;
+        bPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *bPtr++ = sin(*aPtr++);
      }
-    s = _mm256_div_ps(s, ftwos);
-
-    sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
-    cosine = _mm256_sub_ps(fones, s);
-
-    condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ);
-    condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ);
-    // Need this condition only for cos
-    //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
-
-    sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
-    sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
-    _mm256_storeu_ps(bPtr, sine);
-    aPtr += 8;
-    bPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++) {
-    *bPtr++ = sin(*aPtr++);
-  }
  }
  
  #endif /* LV_HAVE_AVX2 for unaligned */
@@ -470,70 +587,88 @@ volk_32f_sin_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_p
  static inline void
  volk_32f_sin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int quarterPoints = num_points / 4;
-  unsigned int i = 0;
-
-  __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
-  __m128 sine, cosine, condition1, condition2;
-  __m128i q, r, ones, twos, fours;
-
-  m4pi = _mm_set1_ps(1.273239545);
-  pio4A = _mm_set1_ps(0.78515625);
-  pio4B = _mm_set1_ps(0.241876e-3);
-  ffours = _mm_set1_ps(4.0);
-  ftwos = _mm_set1_ps(2.0);
-  fones = _mm_set1_ps(1.0);
-  fzeroes = _mm_setzero_ps();
-  ones = _mm_set1_epi32(1);
-  twos = _mm_set1_epi32(2);
-  fours = _mm_set1_epi32(4);
-
-  cp1 = _mm_set1_ps(1.0);
-  cp2 = _mm_set1_ps(0.83333333e-1);
-  cp3 = _mm_set1_ps(0.2777778e-2);
-  cp4 = _mm_set1_ps(0.49603e-4);
-  cp5 = _mm_set1_ps(0.551e-6);
-
-  for(;number < quarterPoints; number++) {
-    aVal = _mm_loadu_ps(aPtr);
-    s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
-    q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
-    r = _mm_add_epi32(q, _mm_and_si128(q, ones));
-
-    s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
-    s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
-
-    s = _mm_div_ps(s, _mm_set1_ps(8.0));    // The constant is 2^N, for 3 times argument reduction
-    s = _mm_mul_ps(s, s);
-    // Evaluate Taylor series
-    s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-
-    for(i = 0; i < 3; i++) {
-      s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
-    }
-    s = _mm_div_ps(s, ftwos);
-
-    sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
-    cosine = _mm_sub_ps(fones, s);
-
-    condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
-    condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
  
-    sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
-    sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
-    _mm_storeu_ps(bPtr, sine);
-    aPtr += 4;
-    bPtr += 4;
-  }
+    unsigned int number = 0;
+    unsigned int quarterPoints = num_points / 4;
+    unsigned int i = 0;
+
+    __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+        fzeroes;
+    __m128 sine, cosine, condition1, condition2;
+    __m128i q, r, ones, twos, fours;
+
+    m4pi = _mm_set1_ps(1.273239545);
+    pio4A = _mm_set1_ps(0.78515625);
+    pio4B = _mm_set1_ps(0.241876e-3);
+    ffours = _mm_set1_ps(4.0);
+    ftwos = _mm_set1_ps(2.0);
+    fones = _mm_set1_ps(1.0);
+    fzeroes = _mm_setzero_ps();
+    ones = _mm_set1_epi32(1);
+    twos = _mm_set1_epi32(2);
+    fours = _mm_set1_epi32(4);
+
+    cp1 = _mm_set1_ps(1.0);
+    cp2 = _mm_set1_ps(0.83333333e-1);
+    cp3 = _mm_set1_ps(0.2777778e-2);
+    cp4 = _mm_set1_ps(0.49603e-4);
+    cp5 = _mm_set1_ps(0.551e-6);
+
+    for (; number < quarterPoints; number++) {
+        aVal = _mm_loadu_ps(aPtr);
+        s = _mm_sub_ps(aVal,
+                       _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
+        q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
+        r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+        s = _mm_div_ps(
+            s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+        s = _mm_mul_ps(s, s);
+        // Evaluate Taylor series
+        s = _mm_mul_ps(
+            _mm_add_ps(
+                _mm_mul_ps(
+                    _mm_sub_ps(
+                        _mm_mul_ps(
+                            _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
+                                       cp3),
+                            s),
+                        cp2),
+                    s),
+                cp1),
+            s);
+
+        for (i = 0; i < 3; i++) {
+            s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+        }
+        s = _mm_div_ps(s, ftwos);
+
+        sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+        cosine = _mm_sub_ps(fones, s);
+
+        condition1 = _mm_cmpneq_ps(
+            _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
+        condition2 = _mm_cmpneq_ps(
+            _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
+            _mm_cmplt_ps(aVal, fzeroes));
+
+        sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
+        sine =
+            _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
+        _mm_storeu_ps(bPtr, sine);
+        aPtr += 4;
+        bPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    *bPtr++ = sinf(*aPtr++);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *bPtr++ = sinf(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_SSE4_1 for unaligned */
@@ -544,14 +679,13 @@ volk_32f_sin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num
  static inline void
  volk_32f_sin_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++) {
-    *bPtr++ = sinf(*aPtr++);
-  }
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
  
+    for (number = 0; number < num_points; number++) {
+        *bPtr++ = sinf(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_GENERIC */
@@ -562,30 +696,29 @@ volk_32f_sin_32f_generic(float* bVector, const float* aVector, unsigned int num_
  #include <volk/volk_neon_intrinsics.h>
  
  static inline void
-volk_32f_sin_32f_neon(float* bVector, const float* aVector,
-                      unsigned int num_points)
+volk_32f_sin_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
  {
      unsigned int number = 0;
      unsigned int quarter_points = num_points / 4;
      float* bVectorPtr = bVector;
      const float* aVectorPtr = aVector;
-    
+
      float32x4_t b_vec;
      float32x4_t a_vec;
-    
-    for(number = 0; number < quarter_points; number++) {
+
+    for (number = 0; number < quarter_points; number++) {
          a_vec = vld1q_f32(aVectorPtr);
          // Prefetch next one, speeds things up
-        __VOLK_PREFETCH(aVectorPtr+4);
+        __VOLK_PREFETCH(aVectorPtr + 4);
          b_vec = _vsinq_f32(a_vec);
          vst1q_f32(bVectorPtr, b_vec);
          // move pointers ahead
-        bVectorPtr+=4;
-        aVectorPtr+=4;
+        bVectorPtr += 4;
+        aVectorPtr += 4;
      }
-    
+
      // Deal with the rest
-    for(number = quarter_points * 4; number < num_points; number++) {
+    for (number = quarter_points * 4; number < num_points; number++) {
          *bVectorPtr++ = sinf(*aVectorPtr++);
      }
  }
diff --git a/kernels/volk/volk_32f_sqrt_32f.h b/kernels/volk/volk_32f_sqrt_32f.h

index 84160afa8f07d11fe11ee882f3c9f2436f0d651f..667d35668e99615b4f95d559d95b3b1a56cafa43 100644 (file)
--- a/kernels/volk/volk_32f_sqrt_32f.h
+++ b/kernels/volk/volk_32f_sqrt_32f.h
@@ -66,8 +66,8 @@
  #define INCLUDED_volk_32f_sqrt_32f_a_H
  
  #include <inttypes.h>
-#include <stdio.h>
  #include <math.h>
+#include <stdio.h>
  
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
@@ -75,28 +75,28 @@
  static inline void
  volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
  
-  __m128 aVal, cVal;
-  for(;number < quarterPoints; number++) {
-    aVal = _mm_load_ps(aPtr);
+    __m128 aVal, cVal;
+    for (; number < quarterPoints; number++) {
+        aVal = _mm_load_ps(aPtr);
  
-    cVal = _mm_sqrt_ps(aVal);
+        cVal = _mm_sqrt_ps(aVal);
  
-    _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+        _mm_store_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 4;
-    cPtr += 4;
-  }
+        aPtr += 4;
+        cPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(;number < num_points; number++) {
-    *cPtr++ = sqrtf(*aPtr++);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *cPtr++ = sqrtf(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_SSE */
@@ -107,28 +107,28 @@ volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_p
  static inline void
  volk_32f_sqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
  
-  __m256 aVal, cVal;
-  for(;number < eighthPoints; number++) {
-    aVal = _mm256_load_ps(aPtr);
+    __m256 aVal, cVal;
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_load_ps(aPtr);
  
-    cVal = _mm256_sqrt_ps(aVal);
+        cVal = _mm256_sqrt_ps(aVal);
  
-    _mm256_store_ps(cPtr,cVal); // Store the results back into the C container
+        _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 8;
-    cPtr += 8;
-  }
+        aPtr += 8;
+        cPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  for(;number < num_points; number++) {
-    *cPtr++ = sqrtf(*aPtr++);
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *cPtr++ = sqrtf(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX */
@@ -140,24 +140,24 @@ volk_32f_sqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_p
  static inline void
  volk_32f_sqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points)
  {
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  unsigned int number = 0;
-  unsigned int quarter_points = num_points / 4;
-  float32x4_t in_vec, out_vec;
-
-  for(number = 0; number < quarter_points; number++) {
-    in_vec = vld1q_f32(aPtr);
-    // note that armv8 has vsqrt_f32 which will be much better
-    out_vec = vrecpeq_f32(vrsqrteq_f32(in_vec) );
-    vst1q_f32(cPtr, out_vec);
-    aPtr += 4;
-    cPtr += 4;
-  }
-
-  for(number = quarter_points * 4; number < num_points; number++) {
-    *cPtr++ = sqrtf(*aPtr++);
-  }
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
+    unsigned int quarter_points = num_points / 4;
+    float32x4_t in_vec, out_vec;
+
+    for (number = 0; number < quarter_points; number++) {
+        in_vec = vld1q_f32(aPtr);
+        // note that armv8 has vsqrt_f32 which will be much better
+        out_vec = vrecpeq_f32(vrsqrteq_f32(in_vec));
+        vst1q_f32(cPtr, out_vec);
+        aPtr += 4;
+        cPtr += 4;
+    }
+
+    for (number = quarter_points * 4; number < num_points; number++) {
+        *cPtr++ = sqrtf(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_NEON */
@@ -168,13 +168,13 @@ volk_32f_sqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_po
  static inline void
  volk_32f_sqrt_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
  {
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  unsigned int number = 0;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
  
-  for(number = 0; number < num_points; number++) {
-    *cPtr++ = sqrtf(*aPtr++);
-  }
+    for (number = 0; number < num_points; number++) {
+        *cPtr++ = sqrtf(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_GENERIC */
@@ -182,13 +182,12 @@ volk_32f_sqrt_32f_generic(float* cVector, const float* aVector, unsigned int num
  
  #ifdef LV_HAVE_ORC
  
-extern void
-volk_32f_sqrt_32f_a_orc_impl(float *, const float*, unsigned int);
+extern void volk_32f_sqrt_32f_a_orc_impl(float*, const float*, unsigned int);
  
  static inline void
  volk_32f_sqrt_32f_u_orc(float* cVector, const float* aVector, unsigned int num_points)
  {
-  volk_32f_sqrt_32f_a_orc_impl(cVector, aVector, num_points);
+    volk_32f_sqrt_32f_a_orc_impl(cVector, aVector, num_points);
  }
  
  #endif /* LV_HAVE_ORC */
@@ -199,36 +198,36 @@ volk_32f_sqrt_32f_u_orc(float* cVector, const float* aVector, unsigned int num_p
  #define INCLUDED_volk_32f_sqrt_32f_u_H
  
  #include <inttypes.h>
-#include <stdio.h>
  #include <math.h>
+#include <stdio.h>
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
  static inline void
  volk_32f_sqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
  
-  __m256 aVal, cVal;
-  for(;number < eighthPoints; number++) {
-    aVal = _mm256_loadu_ps(aPtr);
+    __m256 aVal, cVal;
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_loadu_ps(aPtr);
  
-    cVal = _mm256_sqrt_ps(aVal);
+        cVal = _mm256_sqrt_ps(aVal);
  
-    _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
+        _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 8;
-    cPtr += 8;
-  }
+        aPtr += 8;
+        cPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  for(;number < num_points; number++) {
-    *cPtr++ = sqrtf(*aPtr++);
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *cPtr++ = sqrtf(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX */
diff --git a/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h b/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h

index 8e996e29e9f4f6591e899df1f7ad52ee954d0316..6ad0f177bf829f75549530a3f375edc4bbac7555 100644 (file)
--- a/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h
+++ b/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h
@@ -29,8 +29,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32f_stddev_and_mean_32f_x2(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points)
- * \endcode
+ * void volk_32f_stddev_and_mean_32f_x2(float* stddev, float* mean, const float*
+ * inputBuffer, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li inputBuffer: The buffer of points.
@@ -41,10 +41,8 @@
   * \li mean: The mean of the input buffer.
   *
   * \b Example
- * Generate random numbers with c++11's normal distribution and estimate the mean and standard deviation
- * \code
- *   int N = 1000;
- *   unsigned int alignment = volk_get_alignment();
+ * Generate random numbers with c++11's normal distribution and estimate the mean and
+ * standard deviation \code int N = 1000; unsigned int alignment = volk_get_alignment();
   *   float* rand_numbers = (float*)volk_malloc(sizeof(float)*N, alignment);
   *   float* mean = (float*)volk_malloc(sizeof(float), alignment);
   *   float* stddev = (float*)volk_malloc(sizeof(float), alignment);
@@ -71,88 +69,94 @@
  #ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
  #define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
  
-#include <volk/volk_common.h>
  #include <inttypes.h>
-#include <stdio.h>
  #include <math.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev, float* mean,
-                                         const float* inputBuffer,
-                                         unsigned int num_points)
+static inline void volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev,
+                                                         float* mean,
+                                                         const float* inputBuffer,
+                                                         unsigned int num_points)
  {
-  float stdDev = 0;
-  float newMean = 0;
-  if(num_points > 0){
-    unsigned int number = 0;
-    const unsigned int thirtySecondthPoints = num_points / 32;
-
-    const float* aPtr = inputBuffer;
-    __VOLK_ATTR_ALIGNED(32) float meanBuffer[8];
-    __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
-
-    __m256 accumulator = _mm256_setzero_ps();
-    __m256 squareAccumulator = _mm256_setzero_ps();
-    __m256 aVal1, aVal2, aVal3, aVal4;
-    __m256 cVal1, cVal2, cVal3, cVal4;
-    for(;number < thirtySecondthPoints; number++) {
-      aVal1 = _mm256_load_ps(aPtr); aPtr += 8;
-      cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
-      accumulator = _mm256_add_ps(accumulator, aVal1);  // accumulator += x
-
-      aVal2 = _mm256_load_ps(aPtr); aPtr += 8;
-      cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
-      accumulator = _mm256_add_ps(accumulator, aVal2);  // accumulator += x
-
-      aVal3 = _mm256_load_ps(aPtr); aPtr += 8;
-      cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
-      accumulator = _mm256_add_ps(accumulator, aVal3);  // accumulator += x
-
-      aVal4 = _mm256_load_ps(aPtr); aPtr += 8;
-      cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
-      accumulator = _mm256_add_ps(accumulator, aVal4);  // accumulator += x
-
-      cVal1 = _mm256_or_ps(cVal1, cVal2);
-      cVal3 = _mm256_or_ps(cVal3, cVal4);
-      cVal1 = _mm256_or_ps(cVal1, cVal3);
-
-      squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
-    }
-    _mm256_store_ps(meanBuffer,accumulator); // Store the results back into the C container
-    _mm256_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
-    newMean = meanBuffer[0];
-    newMean += meanBuffer[1];
-    newMean += meanBuffer[2];
-    newMean += meanBuffer[3];
-    newMean += meanBuffer[4];
-    newMean += meanBuffer[5];
-    newMean += meanBuffer[6];
-    newMean += meanBuffer[7];
-    stdDev = squareBuffer[0];
-    stdDev += squareBuffer[1];
-    stdDev += squareBuffer[2];
-    stdDev += squareBuffer[3];
-    stdDev += squareBuffer[4];
-    stdDev += squareBuffer[5];
-    stdDev += squareBuffer[6];
-    stdDev += squareBuffer[7];
-
-    number = thirtySecondthPoints * 32;
-    for(;number < num_points; number++){
-      stdDev += (*aPtr) * (*aPtr);
-      newMean += *aPtr++;
+    float stdDev = 0;
+    float newMean = 0;
+    if (num_points > 0) {
+        unsigned int number = 0;
+        const unsigned int thirtySecondthPoints = num_points / 32;
+
+        const float* aPtr = inputBuffer;
+        __VOLK_ATTR_ALIGNED(32) float meanBuffer[8];
+        __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
+
+        __m256 accumulator = _mm256_setzero_ps();
+        __m256 squareAccumulator = _mm256_setzero_ps();
+        __m256 aVal1, aVal2, aVal3, aVal4;
+        __m256 cVal1, cVal2, cVal3, cVal4;
+        for (; number < thirtySecondthPoints; number++) {
+            aVal1 = _mm256_load_ps(aPtr);
+            aPtr += 8;
+            cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
+            accumulator = _mm256_add_ps(accumulator, aVal1); // accumulator += x
+
+            aVal2 = _mm256_load_ps(aPtr);
+            aPtr += 8;
+            cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
+            accumulator = _mm256_add_ps(accumulator, aVal2); // accumulator += x
+
+            aVal3 = _mm256_load_ps(aPtr);
+            aPtr += 8;
+            cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
+            accumulator = _mm256_add_ps(accumulator, aVal3); // accumulator += x
+
+            aVal4 = _mm256_load_ps(aPtr);
+            aPtr += 8;
+            cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
+            accumulator = _mm256_add_ps(accumulator, aVal4); // accumulator += x
+
+            cVal1 = _mm256_or_ps(cVal1, cVal2);
+            cVal3 = _mm256_or_ps(cVal3, cVal4);
+            cVal1 = _mm256_or_ps(cVal1, cVal3);
+
+            squareAccumulator =
+                _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+        }
+        _mm256_store_ps(meanBuffer,
+                        accumulator); // Store the results back into the C container
+        _mm256_store_ps(squareBuffer,
+                        squareAccumulator); // Store the results back into the C container
+        newMean = meanBuffer[0];
+        newMean += meanBuffer[1];
+        newMean += meanBuffer[2];
+        newMean += meanBuffer[3];
+        newMean += meanBuffer[4];
+        newMean += meanBuffer[5];
+        newMean += meanBuffer[6];
+        newMean += meanBuffer[7];
+        stdDev = squareBuffer[0];
+        stdDev += squareBuffer[1];
+        stdDev += squareBuffer[2];
+        stdDev += squareBuffer[3];
+        stdDev += squareBuffer[4];
+        stdDev += squareBuffer[5];
+        stdDev += squareBuffer[6];
+        stdDev += squareBuffer[7];
+
+        number = thirtySecondthPoints * 32;
+        for (; number < num_points; number++) {
+            stdDev += (*aPtr) * (*aPtr);
+            newMean += *aPtr++;
+        }
+        newMean /= num_points;
+        stdDev /= num_points;
+        stdDev -= (newMean * newMean);
+        stdDev = sqrtf(stdDev);
      }
-    newMean /= num_points;
-    stdDev /= num_points;
-    stdDev -= (newMean * newMean);
-    stdDev = sqrtf(stdDev);
-  }
-  *stddev = stdDev;
-  *mean = newMean;
-
+    *stddev = stdDev;
+    *mean = newMean;
  }
  #endif /* LV_HAVE_AVX */
  
@@ -160,151 +164,164 @@ volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev, float* mean,
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32f_stddev_and_mean_32f_x2_u_avx(float* stddev, float* mean,
-                                         const float* inputBuffer,
-                                         unsigned int num_points)
+static inline void volk_32f_stddev_and_mean_32f_x2_u_avx(float* stddev,
+                                                         float* mean,
+                                                         const float* inputBuffer,
+                                                         unsigned int num_points)
  {
-  float stdDev = 0;
-  float newMean = 0;
-  if(num_points > 0){
-    unsigned int number = 0;
-    const unsigned int thirtySecondthPoints = num_points / 32;
-
-    const float* aPtr = inputBuffer;
-    __VOLK_ATTR_ALIGNED(32) float meanBuffer[8];
-    __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
-
-    __m256 accumulator = _mm256_setzero_ps();
-    __m256 squareAccumulator = _mm256_setzero_ps();
-    __m256 aVal1, aVal2, aVal3, aVal4;
-    __m256 cVal1, cVal2, cVal3, cVal4;
-    for(;number < thirtySecondthPoints; number++) {
-      aVal1 = _mm256_loadu_ps(aPtr); aPtr += 8;
-      cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
-      accumulator = _mm256_add_ps(accumulator, aVal1);  // accumulator += x
-
-      aVal2 = _mm256_loadu_ps(aPtr); aPtr += 8;
-      cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
-      accumulator = _mm256_add_ps(accumulator, aVal2);  // accumulator += x
-
-      aVal3 = _mm256_loadu_ps(aPtr); aPtr += 8;
-      cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
-      accumulator = _mm256_add_ps(accumulator, aVal3);  // accumulator += x
-
-      aVal4 = _mm256_loadu_ps(aPtr); aPtr += 8;
-      cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
-      accumulator = _mm256_add_ps(accumulator, aVal4);  // accumulator += x
-
-      cVal1 = _mm256_or_ps(cVal1, cVal2);
-      cVal3 = _mm256_or_ps(cVal3, cVal4);
-      cVal1 = _mm256_or_ps(cVal1, cVal3);
-
-      squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
-    }
-    _mm256_store_ps(meanBuffer,accumulator); // Store the results back into the C container
-    _mm256_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
-    newMean = meanBuffer[0];
-    newMean += meanBuffer[1];
-    newMean += meanBuffer[2];
-    newMean += meanBuffer[3];
-    newMean += meanBuffer[4];
-    newMean += meanBuffer[5];
-    newMean += meanBuffer[6];
-    newMean += meanBuffer[7];
-    stdDev = squareBuffer[0];
-    stdDev += squareBuffer[1];
-    stdDev += squareBuffer[2];
-    stdDev += squareBuffer[3];
-    stdDev += squareBuffer[4];
-    stdDev += squareBuffer[5];
-    stdDev += squareBuffer[6];
-    stdDev += squareBuffer[7];
-
-    number = thirtySecondthPoints * 32;
-    for(;number < num_points; number++){
-      stdDev += (*aPtr) * (*aPtr);
-      newMean += *aPtr++;
+    float stdDev = 0;
+    float newMean = 0;
+    if (num_points > 0) {
+        unsigned int number = 0;
+        const unsigned int thirtySecondthPoints = num_points / 32;
+
+        const float* aPtr = inputBuffer;
+        __VOLK_ATTR_ALIGNED(32) float meanBuffer[8];
+        __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
+
+        __m256 accumulator = _mm256_setzero_ps();
+        __m256 squareAccumulator = _mm256_setzero_ps();
+        __m256 aVal1, aVal2, aVal3, aVal4;
+        __m256 cVal1, cVal2, cVal3, cVal4;
+        for (; number < thirtySecondthPoints; number++) {
+            aVal1 = _mm256_loadu_ps(aPtr);
+            aPtr += 8;
+            cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
+            accumulator = _mm256_add_ps(accumulator, aVal1); // accumulator += x
+
+            aVal2 = _mm256_loadu_ps(aPtr);
+            aPtr += 8;
+            cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
+            accumulator = _mm256_add_ps(accumulator, aVal2); // accumulator += x
+
+            aVal3 = _mm256_loadu_ps(aPtr);
+            aPtr += 8;
+            cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
+            accumulator = _mm256_add_ps(accumulator, aVal3); // accumulator += x
+
+            aVal4 = _mm256_loadu_ps(aPtr);
+            aPtr += 8;
+            cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
+            accumulator = _mm256_add_ps(accumulator, aVal4); // accumulator += x
+
+            cVal1 = _mm256_or_ps(cVal1, cVal2);
+            cVal3 = _mm256_or_ps(cVal3, cVal4);
+            cVal1 = _mm256_or_ps(cVal1, cVal3);
+
+            squareAccumulator =
+                _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+        }
+        _mm256_store_ps(meanBuffer,
+                        accumulator); // Store the results back into the C container
+        _mm256_store_ps(squareBuffer,
+                        squareAccumulator); // Store the results back into the C container
+        newMean = meanBuffer[0];
+        newMean += meanBuffer[1];
+        newMean += meanBuffer[2];
+        newMean += meanBuffer[3];
+        newMean += meanBuffer[4];
+        newMean += meanBuffer[5];
+        newMean += meanBuffer[6];
+        newMean += meanBuffer[7];
+        stdDev = squareBuffer[0];
+        stdDev += squareBuffer[1];
+        stdDev += squareBuffer[2];
+        stdDev += squareBuffer[3];
+        stdDev += squareBuffer[4];
+        stdDev += squareBuffer[5];
+        stdDev += squareBuffer[6];
+        stdDev += squareBuffer[7];
+
+        number = thirtySecondthPoints * 32;
+        for (; number < num_points; number++) {
+            stdDev += (*aPtr) * (*aPtr);
+            newMean += *aPtr++;
+        }
+        newMean /= num_points;
+        stdDev /= num_points;
+        stdDev -= (newMean * newMean);
+        stdDev = sqrtf(stdDev);
      }
-    newMean /= num_points;
-    stdDev /= num_points;
-    stdDev -= (newMean * newMean);
-    stdDev = sqrtf(stdDev);
-  }
-  *stddev = stdDev;
-  *mean = newMean;
-
+    *stddev = stdDev;
+    *mean = newMean;
  }
  #endif /* LV_HAVE_AVX */
  
  
  #ifdef LV_HAVE_SSE4_1
  #include <smmintrin.h>
-static inline void
-volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev, float* mean,
-                                         const float* inputBuffer,
-                                         unsigned int num_points)
+static inline void volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev,
+                                                            float* mean,
+                                                            const float* inputBuffer,
+                                                            unsigned int num_points)
  {
-  float returnValue = 0;
-  float newMean = 0;
-  if(num_points > 0){
-    unsigned int number = 0;
-    const unsigned int sixteenthPoints = num_points / 16;
-
-    const float* aPtr = inputBuffer;
-    __VOLK_ATTR_ALIGNED(16) float meanBuffer[4];
-    __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
-
-    __m128 accumulator = _mm_setzero_ps();
-    __m128 squareAccumulator = _mm_setzero_ps();
-    __m128 aVal1, aVal2, aVal3, aVal4;
-    __m128 cVal1, cVal2, cVal3, cVal4;
-    for(;number < sixteenthPoints; number++) {
-      aVal1 = _mm_load_ps(aPtr); aPtr += 4;
-      cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
-      accumulator = _mm_add_ps(accumulator, aVal1);  // accumulator += x
-
-      aVal2 = _mm_load_ps(aPtr); aPtr += 4;
-      cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
-      accumulator = _mm_add_ps(accumulator, aVal2);  // accumulator += x
-
-      aVal3 = _mm_load_ps(aPtr); aPtr += 4;
-      cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
-      accumulator = _mm_add_ps(accumulator, aVal3);  // accumulator += x
-
-      aVal4 = _mm_load_ps(aPtr); aPtr += 4;
-      cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
-      accumulator = _mm_add_ps(accumulator, aVal4);  // accumulator += x
-
-      cVal1 = _mm_or_ps(cVal1, cVal2);
-      cVal3 = _mm_or_ps(cVal3, cVal4);
-      cVal1 = _mm_or_ps(cVal1, cVal3);
-
-      squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
-    }
-    _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
-    _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
-    newMean = meanBuffer[0];
-    newMean += meanBuffer[1];
-    newMean += meanBuffer[2];
-    newMean += meanBuffer[3];
-    returnValue = squareBuffer[0];
-    returnValue += squareBuffer[1];
-    returnValue += squareBuffer[2];
-    returnValue += squareBuffer[3];
-
-    number = sixteenthPoints * 16;
-    for(;number < num_points; number++){
-      returnValue += (*aPtr) * (*aPtr);
-      newMean += *aPtr++;
+    float returnValue = 0;
+    float newMean = 0;
+    if (num_points > 0) {
+        unsigned int number = 0;
+        const unsigned int sixteenthPoints = num_points / 16;
+
+        const float* aPtr = inputBuffer;
+        __VOLK_ATTR_ALIGNED(16) float meanBuffer[4];
+        __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
+
+        __m128 accumulator = _mm_setzero_ps();
+        __m128 squareAccumulator = _mm_setzero_ps();
+        __m128 aVal1, aVal2, aVal3, aVal4;
+        __m128 cVal1, cVal2, cVal3, cVal4;
+        for (; number < sixteenthPoints; number++) {
+            aVal1 = _mm_load_ps(aPtr);
+            aPtr += 4;
+            cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
+            accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x
+
+            aVal2 = _mm_load_ps(aPtr);
+            aPtr += 4;
+            cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
+            accumulator = _mm_add_ps(accumulator, aVal2); // accumulator += x
+
+            aVal3 = _mm_load_ps(aPtr);
+            aPtr += 4;
+            cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
+            accumulator = _mm_add_ps(accumulator, aVal3); // accumulator += x
+
+            aVal4 = _mm_load_ps(aPtr);
+            aPtr += 4;
+            cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
+            accumulator = _mm_add_ps(accumulator, aVal4); // accumulator += x
+
+            cVal1 = _mm_or_ps(cVal1, cVal2);
+            cVal3 = _mm_or_ps(cVal3, cVal4);
+            cVal1 = _mm_or_ps(cVal1, cVal3);
+
+            squareAccumulator =
+                _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+        }
+        _mm_store_ps(meanBuffer,
+                     accumulator); // Store the results back into the C container
+        _mm_store_ps(squareBuffer,
+                     squareAccumulator); // Store the results back into the C container
+        newMean = meanBuffer[0];
+        newMean += meanBuffer[1];
+        newMean += meanBuffer[2];
+        newMean += meanBuffer[3];
+        returnValue = squareBuffer[0];
+        returnValue += squareBuffer[1];
+        returnValue += squareBuffer[2];
+        returnValue += squareBuffer[3];
+
+        number = sixteenthPoints * 16;
+        for (; number < num_points; number++) {
+            returnValue += (*aPtr) * (*aPtr);
+            newMean += *aPtr++;
+        }
+        newMean /= num_points;
+        returnValue /= num_points;
+        returnValue -= (newMean * newMean);
+        returnValue = sqrtf(returnValue);
      }
-    newMean /= num_points;
-    returnValue /= num_points;
-    returnValue -= (newMean * newMean);
-    returnValue = sqrtf(returnValue);
-  }
-  *stddev = returnValue;
-  *mean = newMean;
+    *stddev = returnValue;
+    *mean = newMean;
  }
  #endif /* LV_HAVE_SSE4_1 */
  
@@ -312,86 +329,86 @@ volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev, float* mean,
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev, float* mean,
-                                      const float* inputBuffer,
-                                      unsigned int num_points)
+static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev,
+                                                         float* mean,
+                                                         const float* inputBuffer,
+                                                         unsigned int num_points)
  {
-  float returnValue = 0;
-  float newMean = 0;
-  if(num_points > 0){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    const float* aPtr = inputBuffer;
-    __VOLK_ATTR_ALIGNED(16) float meanBuffer[4];
-    __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
-
-    __m128 accumulator = _mm_setzero_ps();
-    __m128 squareAccumulator = _mm_setzero_ps();
-    __m128 aVal = _mm_setzero_ps();
-    for(;number < quarterPoints; number++) {
-      aVal = _mm_load_ps(aPtr);                     // aVal = x
-      accumulator = _mm_add_ps(accumulator, aVal);  // accumulator += x
-      aVal = _mm_mul_ps(aVal, aVal);                // squareAccumulator += x^2
-      squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
-      aPtr += 4;
+    float returnValue = 0;
+    float newMean = 0;
+    if (num_points > 0) {
+        unsigned int number = 0;
+        const unsigned int quarterPoints = num_points / 4;
+
+        const float* aPtr = inputBuffer;
+        __VOLK_ATTR_ALIGNED(16) float meanBuffer[4];
+        __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
+
+        __m128 accumulator = _mm_setzero_ps();
+        __m128 squareAccumulator = _mm_setzero_ps();
+        __m128 aVal = _mm_setzero_ps();
+        for (; number < quarterPoints; number++) {
+            aVal = _mm_load_ps(aPtr);                    // aVal = x
+            accumulator = _mm_add_ps(accumulator, aVal); // accumulator += x
+            aVal = _mm_mul_ps(aVal, aVal);               // squareAccumulator += x^2
+            squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
+            aPtr += 4;
+        }
+        _mm_store_ps(meanBuffer,
+                     accumulator); // Store the results back into the C container
+        _mm_store_ps(squareBuffer,
+                     squareAccumulator); // Store the results back into the C container
+        newMean = meanBuffer[0];
+        newMean += meanBuffer[1];
+        newMean += meanBuffer[2];
+        newMean += meanBuffer[3];
+        returnValue = squareBuffer[0];
+        returnValue += squareBuffer[1];
+        returnValue += squareBuffer[2];
+        returnValue += squareBuffer[3];
+
+        number = quarterPoints * 4;
+        for (; number < num_points; number++) {
+            returnValue += (*aPtr) * (*aPtr);
+            newMean += *aPtr++;
+        }
+        newMean /= num_points;
+        returnValue /= num_points;
+        returnValue -= (newMean * newMean);
+        returnValue = sqrtf(returnValue);
      }
-    _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
-    _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
-    newMean = meanBuffer[0];
-    newMean += meanBuffer[1];
-    newMean += meanBuffer[2];
-    newMean += meanBuffer[3];
-    returnValue = squareBuffer[0];
-    returnValue += squareBuffer[1];
-    returnValue += squareBuffer[2];
-    returnValue += squareBuffer[3];
-
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      returnValue += (*aPtr) * (*aPtr);
-      newMean += *aPtr++;
-    }
-    newMean /= num_points;
-    returnValue /= num_points;
-    returnValue -= (newMean * newMean);
-    returnValue = sqrtf(returnValue);
-  }
-  *stddev = returnValue;
-  *mean = newMean;
+    *stddev = returnValue;
+    *mean = newMean;
  }
  #endif /* LV_HAVE_SSE */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32f_stddev_and_mean_32f_x2_generic(float* stddev, float* mean,
-                                        const float* inputBuffer,
-                                        unsigned int num_points)
+static inline void volk_32f_stddev_and_mean_32f_x2_generic(float* stddev,
+                                                           float* mean,
+                                                           const float* inputBuffer,
+                                                           unsigned int num_points)
  {
-  float returnValue = 0;
-  float newMean = 0;
-  if(num_points > 0){
-    const float* aPtr = inputBuffer;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      returnValue += (*aPtr) * (*aPtr);
-      newMean += *aPtr++;
+    float returnValue = 0;
+    float newMean = 0;
+    if (num_points > 0) {
+        const float* aPtr = inputBuffer;
+        unsigned int number = 0;
+
+        for (number = 0; number < num_points; number++) {
+            returnValue += (*aPtr) * (*aPtr);
+            newMean += *aPtr++;
+        }
+        newMean /= num_points;
+        returnValue /= num_points;
+        returnValue -= (newMean * newMean);
+        returnValue = sqrtf(returnValue);
      }
-    newMean /= num_points;
-    returnValue /= num_points;
-    returnValue -= (newMean * newMean);
-    returnValue = sqrtf(returnValue);
-  }
-  *stddev = returnValue;
-  *mean = newMean;
+    *stddev = returnValue;
+    *mean = newMean;
  }
  #endif /* LV_HAVE_GENERIC */
  
  
-
-
  #endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H */
diff --git a/kernels/volk/volk_32f_tan_32f.h b/kernels/volk/volk_32f_tan_32f.h

index 239b7459b2bbfd252d2a4d4e5e3667146216fc46..a623a6693eab97e2cac5ead29ea5cbec8fe2ce30 100644 (file)
--- a/kernels/volk/volk_32f_tan_32f.h
+++ b/kernels/volk/volk_32f_tan_32f.h
@@ -71,9 +71,9 @@
   * \endcode
   */
  
-#include <stdio.h>
-#include <math.h>
  #include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
  
  #ifndef INCLUDED_volk_32f_tan_32f_a_H
  #define INCLUDED_volk_32f_tan_32f_a_H
@@ -82,78 +82,102 @@
  #include <immintrin.h>
  
  static inline void
-volk_32f_tan_32f_a_avx2_fma(float* bVector, const float* aVector,
-                          unsigned int num_points)
+volk_32f_tan_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int eighthPoints = num_points / 8;
-  unsigned int i = 0;
-
-  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
-  __m256 sine, cosine, tangent, condition1, condition2, condition3;
-  __m256i q, r, ones, twos, fours;
-
-  m4pi = _mm256_set1_ps(1.273239545);
-  pio4A = _mm256_set1_ps(0.78515625);
-  pio4B = _mm256_set1_ps(0.241876e-3);
-  ffours = _mm256_set1_ps(4.0);
-  ftwos = _mm256_set1_ps(2.0);
-  fones = _mm256_set1_ps(1.0);
-  fzeroes = _mm256_setzero_ps();
-  ones = _mm256_set1_epi32(1);
-  twos = _mm256_set1_epi32(2);
-  fours = _mm256_set1_epi32(4);
-
-  cp1 = _mm256_set1_ps(1.0);
-  cp2 = _mm256_set1_ps(0.83333333e-1);
-  cp3 = _mm256_set1_ps(0.2777778e-2);
-  cp4 = _mm256_set1_ps(0.49603e-4);
-  cp5 = _mm256_set1_ps(0.551e-6);
-
-  for(;number < eighthPoints; number++){
-    aVal = _mm256_load_ps(aPtr);
-    s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
-    q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
-    r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
-
-    s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
-    s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
-
-    s = _mm256_div_ps(s, _mm256_set1_ps(8.0));    // The constant is 2^N, for 3 times argument reduction
-    s = _mm256_mul_ps(s, s);
-    // Evaluate Taylor series
-    s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s);
-
-    for(i = 0; i < 3; i++){
-      s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int eighthPoints = num_points / 8;
+    unsigned int i = 0;
+
+    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+        fzeroes;
+    __m256 sine, cosine, tangent, condition1, condition2, condition3;
+    __m256i q, r, ones, twos, fours;
+
+    m4pi = _mm256_set1_ps(1.273239545);
+    pio4A = _mm256_set1_ps(0.78515625);
+    pio4B = _mm256_set1_ps(0.241876e-3);
+    ffours = _mm256_set1_ps(4.0);
+    ftwos = _mm256_set1_ps(2.0);
+    fones = _mm256_set1_ps(1.0);
+    fzeroes = _mm256_setzero_ps();
+    ones = _mm256_set1_epi32(1);
+    twos = _mm256_set1_epi32(2);
+    fours = _mm256_set1_epi32(4);
+
+    cp1 = _mm256_set1_ps(1.0);
+    cp2 = _mm256_set1_ps(0.83333333e-1);
+    cp3 = _mm256_set1_ps(0.2777778e-2);
+    cp4 = _mm256_set1_ps(0.49603e-4);
+    cp5 = _mm256_set1_ps(0.551e-6);
+
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_load_ps(aPtr);
+        s = _mm256_sub_ps(aVal,
+                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+
+        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
+        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
+
+        s = _mm256_div_ps(
+            s,
+            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+        s = _mm256_mul_ps(s, s);
+        // Evaluate Taylor series
+        s = _mm256_mul_ps(
+            _mm256_fmadd_ps(
+                _mm256_fmsub_ps(
+                    _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
+                s,
+                cp1),
+            s);
+
+        for (i = 0; i < 3; i++) {
+            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+        }
+        s = _mm256_div_ps(s, ftwos);
+
+        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+        cosine = _mm256_sub_ps(fones, s);
+
+        condition1 = _mm256_cmp_ps(
+            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
+            fzeroes,
+            _CMP_NEQ_UQ);
+        condition2 = _mm256_cmp_ps(
+            _mm256_cmp_ps(
+                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
+            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
+            _CMP_NEQ_UQ);
+        condition3 = _mm256_cmp_ps(
+            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
+            fzeroes,
+            _CMP_NEQ_UQ);
+
+        __m256 temp = cosine;
+        cosine =
+            _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
+        sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
+        sine = _mm256_sub_ps(
+            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+        cosine = _mm256_sub_ps(
+            cosine,
+            _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
+        tangent = _mm256_div_ps(sine, cosine);
+        _mm256_store_ps(bPtr, tangent);
+        aPtr += 8;
+        bPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *bPtr++ = tan(*aPtr++);
      }
-    s = _mm256_div_ps(s, ftwos);
-
-    sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
-    cosine = _mm256_sub_ps(fones, s);
-
-    condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ);
-    condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ);
-    condition3 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), fzeroes, _CMP_NEQ_UQ);
-
-    __m256 temp = cosine;
-    cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
-    sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
-    sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
-    cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
-    tangent = _mm256_div_ps(sine, cosine);
-    _mm256_store_ps(bPtr, tangent);
-    aPtr += 8;
-    bPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *bPtr++ = tan(*aPtr++);
-  }
  }
  
  #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
@@ -162,78 +186,109 @@ volk_32f_tan_32f_a_avx2_fma(float* bVector, const float* aVector,
  #include <immintrin.h>
  
  static inline void
-volk_32f_tan_32f_a_avx2(float* bVector, const float* aVector,
-                          unsigned int num_points)
+volk_32f_tan_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int eighthPoints = num_points / 8;
-  unsigned int i = 0;
-
-  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
-  __m256 sine, cosine, tangent, condition1, condition2, condition3;
-  __m256i q, r, ones, twos, fours;
-
-  m4pi = _mm256_set1_ps(1.273239545);
-  pio4A = _mm256_set1_ps(0.78515625);
-  pio4B = _mm256_set1_ps(0.241876e-3);
-  ffours = _mm256_set1_ps(4.0);
-  ftwos = _mm256_set1_ps(2.0);
-  fones = _mm256_set1_ps(1.0);
-  fzeroes = _mm256_setzero_ps();
-  ones = _mm256_set1_epi32(1);
-  twos = _mm256_set1_epi32(2);
-  fours = _mm256_set1_epi32(4);
-
-  cp1 = _mm256_set1_ps(1.0);
-  cp2 = _mm256_set1_ps(0.83333333e-1);
-  cp3 = _mm256_set1_ps(0.2777778e-2);
-  cp4 = _mm256_set1_ps(0.49603e-4);
-  cp5 = _mm256_set1_ps(0.551e-6);
-
-  for(;number < eighthPoints; number++){
-    aVal = _mm256_load_ps(aPtr);
-    s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
-    q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
-    r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
-
-    s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
-    s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
-
-    s = _mm256_div_ps(s, _mm256_set1_ps(8.0));    // The constant is 2^N, for 3 times argument reduction
-    s = _mm256_mul_ps(s, s);
-    // Evaluate Taylor series
-    s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-
-    for(i = 0; i < 3; i++){
-      s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int eighthPoints = num_points / 8;
+    unsigned int i = 0;
+
+    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+        fzeroes;
+    __m256 sine, cosine, tangent, condition1, condition2, condition3;
+    __m256i q, r, ones, twos, fours;
+
+    m4pi = _mm256_set1_ps(1.273239545);
+    pio4A = _mm256_set1_ps(0.78515625);
+    pio4B = _mm256_set1_ps(0.241876e-3);
+    ffours = _mm256_set1_ps(4.0);
+    ftwos = _mm256_set1_ps(2.0);
+    fones = _mm256_set1_ps(1.0);
+    fzeroes = _mm256_setzero_ps();
+    ones = _mm256_set1_epi32(1);
+    twos = _mm256_set1_epi32(2);
+    fours = _mm256_set1_epi32(4);
+
+    cp1 = _mm256_set1_ps(1.0);
+    cp2 = _mm256_set1_ps(0.83333333e-1);
+    cp3 = _mm256_set1_ps(0.2777778e-2);
+    cp4 = _mm256_set1_ps(0.49603e-4);
+    cp5 = _mm256_set1_ps(0.551e-6);
+
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_load_ps(aPtr);
+        s = _mm256_sub_ps(aVal,
+                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+
+        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
+        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
+
+        s = _mm256_div_ps(
+            s,
+            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+        s = _mm256_mul_ps(s, s);
+        // Evaluate Taylor series
+        s = _mm256_mul_ps(
+            _mm256_add_ps(
+                _mm256_mul_ps(
+                    _mm256_sub_ps(
+                        _mm256_mul_ps(
+                            _mm256_add_ps(
+                                _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
+                                              s),
+                                cp3),
+                            s),
+                        cp2),
+                    s),
+                cp1),
+            s);
+
+        for (i = 0; i < 3; i++) {
+            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+        }
+        s = _mm256_div_ps(s, ftwos);
+
+        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+        cosine = _mm256_sub_ps(fones, s);
+
+        condition1 = _mm256_cmp_ps(
+            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
+            fzeroes,
+            _CMP_NEQ_UQ);
+        condition2 = _mm256_cmp_ps(
+            _mm256_cmp_ps(
+                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
+            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
+            _CMP_NEQ_UQ);
+        condition3 = _mm256_cmp_ps(
+            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
+            fzeroes,
+            _CMP_NEQ_UQ);
+
+        __m256 temp = cosine;
+        cosine =
+            _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
+        sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
+        sine = _mm256_sub_ps(
+            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+        cosine = _mm256_sub_ps(
+            cosine,
+            _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
+        tangent = _mm256_div_ps(sine, cosine);
+        _mm256_store_ps(bPtr, tangent);
+        aPtr += 8;
+        bPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *bPtr++ = tan(*aPtr++);
      }
-    s = _mm256_div_ps(s, ftwos);
-
-    sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
-    cosine = _mm256_sub_ps(fones, s);
-
-    condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ);
-    condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ);
-    condition3 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), fzeroes, _CMP_NEQ_UQ);
-
-    __m256 temp = cosine;
-    cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
-    sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
-    sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
-    cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
-    tangent = _mm256_div_ps(sine, cosine);
-    _mm256_store_ps(bPtr, tangent);
-    aPtr += 8;
-    bPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *bPtr++ = tan(*aPtr++);
-  }
  }
  
  #endif /* LV_HAVE_AVX2 for aligned */
@@ -242,78 +297,97 @@ volk_32f_tan_32f_a_avx2(float* bVector, const float* aVector,
  #include <smmintrin.h>
  
  static inline void
-volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector,
-                          unsigned int num_points)
+volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int quarterPoints = num_points / 4;
-  unsigned int i = 0;
-
-  __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
-  __m128 sine, cosine, tangent, condition1, condition2, condition3;
-  __m128i q, r, ones, twos, fours;
-
-  m4pi = _mm_set1_ps(1.273239545);
-  pio4A = _mm_set1_ps(0.78515625);
-  pio4B = _mm_set1_ps(0.241876e-3);
-  ffours = _mm_set1_ps(4.0);
-  ftwos = _mm_set1_ps(2.0);
-  fones = _mm_set1_ps(1.0);
-  fzeroes = _mm_setzero_ps();
-  ones = _mm_set1_epi32(1);
-  twos = _mm_set1_epi32(2);
-  fours = _mm_set1_epi32(4);
-
-  cp1 = _mm_set1_ps(1.0);
-  cp2 = _mm_set1_ps(0.83333333e-1);
-  cp3 = _mm_set1_ps(0.2777778e-2);
-  cp4 = _mm_set1_ps(0.49603e-4);
-  cp5 = _mm_set1_ps(0.551e-6);
-
-  for(;number < quarterPoints; number++){
-    aVal = _mm_load_ps(aPtr);
-    s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
-    q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
-    r = _mm_add_epi32(q, _mm_and_si128(q, ones));
-
-    s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
-    s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
-
-    s = _mm_div_ps(s, _mm_set1_ps(8.0));    // The constant is 2^N, for 3 times argument reduction
-    s = _mm_mul_ps(s, s);
-    // Evaluate Taylor series
-    s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-
-    for(i = 0; i < 3; i++){
-      s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int quarterPoints = num_points / 4;
+    unsigned int i = 0;
+
+    __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+        fzeroes;
+    __m128 sine, cosine, tangent, condition1, condition2, condition3;
+    __m128i q, r, ones, twos, fours;
+
+    m4pi = _mm_set1_ps(1.273239545);
+    pio4A = _mm_set1_ps(0.78515625);
+    pio4B = _mm_set1_ps(0.241876e-3);
+    ffours = _mm_set1_ps(4.0);
+    ftwos = _mm_set1_ps(2.0);
+    fones = _mm_set1_ps(1.0);
+    fzeroes = _mm_setzero_ps();
+    ones = _mm_set1_epi32(1);
+    twos = _mm_set1_epi32(2);
+    fours = _mm_set1_epi32(4);
+
+    cp1 = _mm_set1_ps(1.0);
+    cp2 = _mm_set1_ps(0.83333333e-1);
+    cp3 = _mm_set1_ps(0.2777778e-2);
+    cp4 = _mm_set1_ps(0.49603e-4);
+    cp5 = _mm_set1_ps(0.551e-6);
+
+    for (; number < quarterPoints; number++) {
+        aVal = _mm_load_ps(aPtr);
+        s = _mm_sub_ps(aVal,
+                       _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
+        q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
+        r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+        s = _mm_div_ps(
+            s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+        s = _mm_mul_ps(s, s);
+        // Evaluate Taylor series
+        s = _mm_mul_ps(
+            _mm_add_ps(
+                _mm_mul_ps(
+                    _mm_sub_ps(
+                        _mm_mul_ps(
+                            _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
+                                       cp3),
+                            s),
+                        cp2),
+                    s),
+                cp1),
+            s);
+
+        for (i = 0; i < 3; i++) {
+            s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+        }
+        s = _mm_div_ps(s, ftwos);
+
+        sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+        cosine = _mm_sub_ps(fones, s);
+
+        condition1 = _mm_cmpneq_ps(
+            _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
+        condition2 = _mm_cmpneq_ps(
+            _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
+            _mm_cmplt_ps(aVal, fzeroes));
+        condition3 = _mm_cmpneq_ps(
+            _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
+
+        __m128 temp = cosine;
+        cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
+        sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
+        sine =
+            _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
+        cosine = _mm_sub_ps(
+            cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
+        tangent = _mm_div_ps(sine, cosine);
+        _mm_store_ps(bPtr, tangent);
+        aPtr += 4;
+        bPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *bPtr++ = tanf(*aPtr++);
      }
-    s = _mm_div_ps(s, ftwos);
-
-    sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
-    cosine = _mm_sub_ps(fones, s);
-
-    condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
-    condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
-    condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
-
-    __m128 temp = cosine;
-    cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
-    sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
-    sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
-    cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
-    tangent = _mm_div_ps(sine, cosine);
-    _mm_store_ps(bPtr, tangent);
-    aPtr += 4;
-    bPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    *bPtr++ = tanf(*aPtr++);
-  }
  }
  
  #endif /* LV_HAVE_SSE4_1 for aligned */
@@ -328,78 +402,102 @@ volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector,
  #include <immintrin.h>
  
  static inline void
-volk_32f_tan_32f_u_avx2_fma(float* bVector, const float* aVector,
-                          unsigned int num_points)
+volk_32f_tan_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int eighthPoints = num_points / 8;
-  unsigned int i = 0;
-
-  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
-  __m256 sine, cosine, tangent, condition1, condition2, condition3;
-  __m256i q, r, ones, twos, fours;
-
-  m4pi = _mm256_set1_ps(1.273239545);
-  pio4A = _mm256_set1_ps(0.78515625);
-  pio4B = _mm256_set1_ps(0.241876e-3);
-  ffours = _mm256_set1_ps(4.0);
-  ftwos = _mm256_set1_ps(2.0);
-  fones = _mm256_set1_ps(1.0);
-  fzeroes = _mm256_setzero_ps();
-  ones = _mm256_set1_epi32(1);
-  twos = _mm256_set1_epi32(2);
-  fours = _mm256_set1_epi32(4);
-
-  cp1 = _mm256_set1_ps(1.0);
-  cp2 = _mm256_set1_ps(0.83333333e-1);
-  cp3 = _mm256_set1_ps(0.2777778e-2);
-  cp4 = _mm256_set1_ps(0.49603e-4);
-  cp5 = _mm256_set1_ps(0.551e-6);
-
-  for(;number < eighthPoints; number++){
-    aVal = _mm256_loadu_ps(aPtr);
-    s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
-    q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
-    r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
-
-    s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
-    s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
-
-    s = _mm256_div_ps(s, _mm256_set1_ps(8.0));    // The constant is 2^N, for 3 times argument reduction
-    s = _mm256_mul_ps(s, s);
-    // Evaluate Taylor series
-    s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s);
-
-    for(i = 0; i < 3; i++){
-      s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int eighthPoints = num_points / 8;
+    unsigned int i = 0;
+
+    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+        fzeroes;
+    __m256 sine, cosine, tangent, condition1, condition2, condition3;
+    __m256i q, r, ones, twos, fours;
+
+    m4pi = _mm256_set1_ps(1.273239545);
+    pio4A = _mm256_set1_ps(0.78515625);
+    pio4B = _mm256_set1_ps(0.241876e-3);
+    ffours = _mm256_set1_ps(4.0);
+    ftwos = _mm256_set1_ps(2.0);
+    fones = _mm256_set1_ps(1.0);
+    fzeroes = _mm256_setzero_ps();
+    ones = _mm256_set1_epi32(1);
+    twos = _mm256_set1_epi32(2);
+    fours = _mm256_set1_epi32(4);
+
+    cp1 = _mm256_set1_ps(1.0);
+    cp2 = _mm256_set1_ps(0.83333333e-1);
+    cp3 = _mm256_set1_ps(0.2777778e-2);
+    cp4 = _mm256_set1_ps(0.49603e-4);
+    cp5 = _mm256_set1_ps(0.551e-6);
+
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_loadu_ps(aPtr);
+        s = _mm256_sub_ps(aVal,
+                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+
+        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
+        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
+
+        s = _mm256_div_ps(
+            s,
+            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+        s = _mm256_mul_ps(s, s);
+        // Evaluate Taylor series
+        s = _mm256_mul_ps(
+            _mm256_fmadd_ps(
+                _mm256_fmsub_ps(
+                    _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
+                s,
+                cp1),
+            s);
+
+        for (i = 0; i < 3; i++) {
+            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+        }
+        s = _mm256_div_ps(s, ftwos);
+
+        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+        cosine = _mm256_sub_ps(fones, s);
+
+        condition1 = _mm256_cmp_ps(
+            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
+            fzeroes,
+            _CMP_NEQ_UQ);
+        condition2 = _mm256_cmp_ps(
+            _mm256_cmp_ps(
+                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
+            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
+            _CMP_NEQ_UQ);
+        condition3 = _mm256_cmp_ps(
+            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
+            fzeroes,
+            _CMP_NEQ_UQ);
+
+        __m256 temp = cosine;
+        cosine =
+            _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
+        sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
+        sine = _mm256_sub_ps(
+            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+        cosine = _mm256_sub_ps(
+            cosine,
+            _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
+        tangent = _mm256_div_ps(sine, cosine);
+        _mm256_storeu_ps(bPtr, tangent);
+        aPtr += 8;
+        bPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *bPtr++ = tan(*aPtr++);
      }
-    s = _mm256_div_ps(s, ftwos);
-
-    sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
-    cosine = _mm256_sub_ps(fones, s);
-
-    condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ);
-    condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ);
-    condition3 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), fzeroes, _CMP_NEQ_UQ);
-
-    __m256 temp = cosine;
-    cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
-    sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
-    sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
-    cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
-    tangent = _mm256_div_ps(sine, cosine);
-    _mm256_storeu_ps(bPtr, tangent);
-    aPtr += 8;
-    bPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *bPtr++ = tan(*aPtr++);
-  }
  }
  
  #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
@@ -408,78 +506,109 @@ volk_32f_tan_32f_u_avx2_fma(float* bVector, const float* aVector,
  #include <immintrin.h>
  
  static inline void
-volk_32f_tan_32f_u_avx2(float* bVector, const float* aVector,
-                          unsigned int num_points)
+volk_32f_tan_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int eighthPoints = num_points / 8;
-  unsigned int i = 0;
-
-  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
-  __m256 sine, cosine, tangent, condition1, condition2, condition3;
-  __m256i q, r, ones, twos, fours;
-
-  m4pi = _mm256_set1_ps(1.273239545);
-  pio4A = _mm256_set1_ps(0.78515625);
-  pio4B = _mm256_set1_ps(0.241876e-3);
-  ffours = _mm256_set1_ps(4.0);
-  ftwos = _mm256_set1_ps(2.0);
-  fones = _mm256_set1_ps(1.0);
-  fzeroes = _mm256_setzero_ps();
-  ones = _mm256_set1_epi32(1);
-  twos = _mm256_set1_epi32(2);
-  fours = _mm256_set1_epi32(4);
-
-  cp1 = _mm256_set1_ps(1.0);
-  cp2 = _mm256_set1_ps(0.83333333e-1);
-  cp3 = _mm256_set1_ps(0.2777778e-2);
-  cp4 = _mm256_set1_ps(0.49603e-4);
-  cp5 = _mm256_set1_ps(0.551e-6);
-
-  for(;number < eighthPoints; number++){
-    aVal = _mm256_loadu_ps(aPtr);
-    s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
-    q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
-    r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
-
-    s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
-    s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
-
-    s = _mm256_div_ps(s, _mm256_set1_ps(8.0));    // The constant is 2^N, for 3 times argument reduction
-    s = _mm256_mul_ps(s, s);
-    // Evaluate Taylor series
-    s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-
-    for(i = 0; i < 3; i++){
-      s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int eighthPoints = num_points / 8;
+    unsigned int i = 0;
+
+    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+        fzeroes;
+    __m256 sine, cosine, tangent, condition1, condition2, condition3;
+    __m256i q, r, ones, twos, fours;
+
+    m4pi = _mm256_set1_ps(1.273239545);
+    pio4A = _mm256_set1_ps(0.78515625);
+    pio4B = _mm256_set1_ps(0.241876e-3);
+    ffours = _mm256_set1_ps(4.0);
+    ftwos = _mm256_set1_ps(2.0);
+    fones = _mm256_set1_ps(1.0);
+    fzeroes = _mm256_setzero_ps();
+    ones = _mm256_set1_epi32(1);
+    twos = _mm256_set1_epi32(2);
+    fours = _mm256_set1_epi32(4);
+
+    cp1 = _mm256_set1_ps(1.0);
+    cp2 = _mm256_set1_ps(0.83333333e-1);
+    cp3 = _mm256_set1_ps(0.2777778e-2);
+    cp4 = _mm256_set1_ps(0.49603e-4);
+    cp5 = _mm256_set1_ps(0.551e-6);
+
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_loadu_ps(aPtr);
+        s = _mm256_sub_ps(aVal,
+                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+
+        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
+        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
+
+        s = _mm256_div_ps(
+            s,
+            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+        s = _mm256_mul_ps(s, s);
+        // Evaluate Taylor series
+        s = _mm256_mul_ps(
+            _mm256_add_ps(
+                _mm256_mul_ps(
+                    _mm256_sub_ps(
+                        _mm256_mul_ps(
+                            _mm256_add_ps(
+                                _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
+                                              s),
+                                cp3),
+                            s),
+                        cp2),
+                    s),
+                cp1),
+            s);
+
+        for (i = 0; i < 3; i++) {
+            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+        }
+        s = _mm256_div_ps(s, ftwos);
+
+        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+        cosine = _mm256_sub_ps(fones, s);
+
+        condition1 = _mm256_cmp_ps(
+            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
+            fzeroes,
+            _CMP_NEQ_UQ);
+        condition2 = _mm256_cmp_ps(
+            _mm256_cmp_ps(
+                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
+            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
+            _CMP_NEQ_UQ);
+        condition3 = _mm256_cmp_ps(
+            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
+            fzeroes,
+            _CMP_NEQ_UQ);
+
+        __m256 temp = cosine;
+        cosine =
+            _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
+        sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
+        sine = _mm256_sub_ps(
+            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+        cosine = _mm256_sub_ps(
+            cosine,
+            _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
+        tangent = _mm256_div_ps(sine, cosine);
+        _mm256_storeu_ps(bPtr, tangent);
+        aPtr += 8;
+        bPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *bPtr++ = tan(*aPtr++);
      }
-    s = _mm256_div_ps(s, ftwos);
-
-    sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
-    cosine = _mm256_sub_ps(fones, s);
-
-    condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ);
-    condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ);
-    condition3 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), fzeroes, _CMP_NEQ_UQ);
-
-    __m256 temp = cosine;
-    cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
-    sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
-    sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
-    cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
-    tangent = _mm256_div_ps(sine, cosine);
-    _mm256_storeu_ps(bPtr, tangent);
-    aPtr += 8;
-    bPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *bPtr++ = tan(*aPtr++);
-  }
  }
  
  #endif /* LV_HAVE_AVX2 for unaligned */
@@ -491,75 +620,95 @@ volk_32f_tan_32f_u_avx2(float* bVector, const float* aVector,
  static inline void
  volk_32f_tan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  unsigned int quarterPoints = num_points / 4;
-  unsigned int i = 0;
-
-  __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
-  __m128 sine, cosine, tangent, condition1, condition2, condition3;
-  __m128i q, r, ones, twos, fours;
-
-  m4pi = _mm_set1_ps(1.273239545);
-  pio4A = _mm_set1_ps(0.78515625);
-  pio4B = _mm_set1_ps(0.241876e-3);
-  ffours = _mm_set1_ps(4.0);
-  ftwos = _mm_set1_ps(2.0);
-  fones = _mm_set1_ps(1.0);
-  fzeroes = _mm_setzero_ps();
-  ones = _mm_set1_epi32(1);
-  twos = _mm_set1_epi32(2);
-  fours = _mm_set1_epi32(4);
-
-  cp1 = _mm_set1_ps(1.0);
-  cp2 = _mm_set1_ps(0.83333333e-1);
-  cp3 = _mm_set1_ps(0.2777778e-2);
-  cp4 = _mm_set1_ps(0.49603e-4);
-  cp5 = _mm_set1_ps(0.551e-6);
-
-  for(;number < quarterPoints; number++){
-    aVal = _mm_loadu_ps(aPtr);
-    s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
-    q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
-    r = _mm_add_epi32(q, _mm_and_si128(q, ones));
-
-    s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
-    s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
-
-    s = _mm_div_ps(s, _mm_set1_ps(8.0));    // The constant is 2^N, for 3 times argument reduction
-    s = _mm_mul_ps(s, s);
-    // Evaluate Taylor series
-    s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-
-    for(i = 0; i < 3; i++){
-      s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int quarterPoints = num_points / 4;
+    unsigned int i = 0;
+
+    __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+        fzeroes;
+    __m128 sine, cosine, tangent, condition1, condition2, condition3;
+    __m128i q, r, ones, twos, fours;
+
+    m4pi = _mm_set1_ps(1.273239545);
+    pio4A = _mm_set1_ps(0.78515625);
+    pio4B = _mm_set1_ps(0.241876e-3);
+    ffours = _mm_set1_ps(4.0);
+    ftwos = _mm_set1_ps(2.0);
+    fones = _mm_set1_ps(1.0);
+    fzeroes = _mm_setzero_ps();
+    ones = _mm_set1_epi32(1);
+    twos = _mm_set1_epi32(2);
+    fours = _mm_set1_epi32(4);
+
+    cp1 = _mm_set1_ps(1.0);
+    cp2 = _mm_set1_ps(0.83333333e-1);
+    cp3 = _mm_set1_ps(0.2777778e-2);
+    cp4 = _mm_set1_ps(0.49603e-4);
+    cp5 = _mm_set1_ps(0.551e-6);
+
+    for (; number < quarterPoints; number++) {
+        aVal = _mm_loadu_ps(aPtr);
+        s = _mm_sub_ps(aVal,
+                       _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
+        q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
+        r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+        s = _mm_div_ps(
+            s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+        s = _mm_mul_ps(s, s);
+        // Evaluate Taylor series
+        s = _mm_mul_ps(
+            _mm_add_ps(
+                _mm_mul_ps(
+                    _mm_sub_ps(
+                        _mm_mul_ps(
+                            _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
+                                       cp3),
+                            s),
+                        cp2),
+                    s),
+                cp1),
+            s);
+
+        for (i = 0; i < 3; i++) {
+            s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+        }
+        s = _mm_div_ps(s, ftwos);
+
+        sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+        cosine = _mm_sub_ps(fones, s);
+
+        condition1 = _mm_cmpneq_ps(
+            _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
+        condition2 = _mm_cmpneq_ps(
+            _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
+            _mm_cmplt_ps(aVal, fzeroes));
+        condition3 = _mm_cmpneq_ps(
+            _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
+
+        __m128 temp = cosine;
+        cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
+        sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
+        sine =
+            _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
+        cosine = _mm_sub_ps(
+            cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
+        tangent = _mm_div_ps(sine, cosine);
+        _mm_storeu_ps(bPtr, tangent);
+        aPtr += 4;
+        bPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *bPtr++ = tanf(*aPtr++);
      }
-    s = _mm_div_ps(s, ftwos);
-
-    sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
-    cosine = _mm_sub_ps(fones, s);
-
-    condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
-    condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
-    condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
-
-    __m128 temp = cosine;
-    cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
-    sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
-    sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
-    cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
-    tangent = _mm_div_ps(sine, cosine);
-    _mm_storeu_ps(bPtr, tangent);
-    aPtr += 4;
-    bPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    *bPtr++ = tanf(*aPtr++);
-  }
  }
  
  #endif /* LV_HAVE_SSE4_1 for unaligned */
@@ -568,16 +717,15 @@ volk_32f_tan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num
  #ifdef LV_HAVE_GENERIC
  
  static inline void
-volk_32f_tan_32f_generic(float* bVector, const float* aVector,
-                         unsigned int num_points)
+volk_32f_tan_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
  {
-  float* bPtr = bVector;
-  const float* aPtr = aVector;
-  unsigned int number = 0;
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
  
-  for(; number < num_points; number++){
-    *bPtr++ = tanf(*aPtr++);
-  }
+    for (; number < num_points; number++) {
+        *bPtr++ = tanf(*aPtr++);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -587,30 +735,29 @@ volk_32f_tan_32f_generic(float* bVector, const float* aVector,
  #include <volk/volk_neon_intrinsics.h>
  
  static inline void
-volk_32f_tan_32f_neon(float* bVector, const float* aVector,
-                      unsigned int num_points)
+volk_32f_tan_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
  {
      unsigned int number = 0;
      unsigned int quarter_points = num_points / 4;
      float* bVectorPtr = bVector;
      const float* aVectorPtr = aVector;
-    
+
      float32x4_t b_vec;
      float32x4_t a_vec;
-    
-    for(number = 0; number < quarter_points; number++) {
+
+    for (number = 0; number < quarter_points; number++) {
          a_vec = vld1q_f32(aVectorPtr);
          // Prefetch next one, speeds things up
-        __VOLK_PREFETCH(aVectorPtr+4);
+        __VOLK_PREFETCH(aVectorPtr + 4);
          b_vec = _vtanq_f32(a_vec);
          vst1q_f32(bVectorPtr, b_vec);
          // move pointers ahead
-        bVectorPtr+=4;
-        aVectorPtr+=4;
+        bVectorPtr += 4;
+        aVectorPtr += 4;
      }
-    
+
      // Deal with the rest
-    for(number = quarter_points * 4; number < num_points; number++) {
+    for (number = quarter_points * 4; number < num_points; number++) {
          *bVectorPtr++ = tanf(*aVectorPtr++);
      }
  }
diff --git a/kernels/volk/volk_32f_tanh_32f.h b/kernels/volk/volk_32f_tanh_32f.h

index d49432d74d10760aa1bd0505fa5e5e1ef9cf343e..f157d3928b233a4c02afaa9e968fb170aea7dbdb 100644 (file)
--- a/kernels/volk/volk_32f_tanh_32f.h
+++ b/kernels/volk/volk_32f_tanh_32f.h
@@ -69,22 +69,21 @@
  #define INCLUDED_volk_32f_tanh_32f_a_H
  
  #include <inttypes.h>
-#include <stdio.h>
  #include <math.h>
+#include <stdio.h>
  #include <string.h>
  
  #ifdef LV_HAVE_GENERIC
  
  static inline void
-volk_32f_tanh_32f_generic(float* cVector, const float* aVector,
-                          unsigned int num_points)
+volk_32f_tanh_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
  {
-  unsigned int number = 0;
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  for(; number < num_points; number++) {
-    *cPtr++ = tanhf(*aPtr++);
-  }
+    unsigned int number = 0;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    for (; number < num_points; number++) {
+        *cPtr++ = tanhf(*aPtr++);
+    }
  }
  
  #endif /* LV_HAVE_GENERIC */
@@ -93,81 +92,88 @@ volk_32f_tanh_32f_generic(float* cVector, const float* aVector,
  #ifdef LV_HAVE_GENERIC
  
  static inline void
-volk_32f_tanh_32f_series(float* cVector, const float* aVector,
-                         unsigned int num_points)
+volk_32f_tanh_32f_series(float* cVector, const float* aVector, unsigned int num_points)
  {
-  unsigned int number = 0;
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  for(; number < num_points; number++) {
-    if(*aPtr > 4.97)
-      *cPtr++ = 1;
-    else if(*aPtr <= -4.97)
-      *cPtr++ = -1;
-    else {
-      float x2 = (*aPtr) * (*aPtr);
-      float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
-      float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
-      *cPtr++ = a / b;
-      aPtr++;
+    unsigned int number = 0;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    for (; number < num_points; number++) {
+        if (*aPtr > 4.97)
+            *cPtr++ = 1;
+        else if (*aPtr <= -4.97)
+            *cPtr++ = -1;
+        else {
+            float x2 = (*aPtr) * (*aPtr);
+            float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+            float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+            *cPtr++ = a / b;
+            aPtr++;
+        }
      }
-  }
  }
  
  #endif /* LV_HAVE_GENERIC */
  
  
-
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
  static inline void
-volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector,
-                        unsigned int num_points)
+volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-
-  __m128 aVal, cVal, x2, a, b;
-  __m128 const1, const2, const3, const4, const5, const6;
-  const1 = _mm_set_ps1(135135.0f);
-  const2 = _mm_set_ps1(17325.0f);
-  const3 = _mm_set_ps1(378.0f);
-  const4 = _mm_set_ps1(62370.0f);
-  const5 = _mm_set_ps1(3150.0f);
-  const6 = _mm_set_ps1(28.0f);
-  for(;number < quarterPoints; number++){
-
-    aVal = _mm_load_ps(aPtr);
-    x2 = _mm_mul_ps(aVal, aVal);
-    a  = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
-    b  = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
-
-    cVal = _mm_div_ps(a, b);
-
-    _mm_store_ps(cPtr, cVal); // Store the results back into the C container
-
-    aPtr += 4;
-    cPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(;number < num_points; number++) {
-    if(*aPtr > 4.97)
-      *cPtr++ = 1;
-    else if(*aPtr <= -4.97)
-      *cPtr++ = -1;
-    else {
-      float x2 = (*aPtr) * (*aPtr);
-      float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
-      float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
-      *cPtr++ = a / b;
-      aPtr++;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+
+    __m128 aVal, cVal, x2, a, b;
+    __m128 const1, const2, const3, const4, const5, const6;
+    const1 = _mm_set_ps1(135135.0f);
+    const2 = _mm_set_ps1(17325.0f);
+    const3 = _mm_set_ps1(378.0f);
+    const4 = _mm_set_ps1(62370.0f);
+    const5 = _mm_set_ps1(3150.0f);
+    const6 = _mm_set_ps1(28.0f);
+    for (; number < quarterPoints; number++) {
+
+        aVal = _mm_load_ps(aPtr);
+        x2 = _mm_mul_ps(aVal, aVal);
+        a = _mm_mul_ps(
+            aVal,
+            _mm_add_ps(
+                const1,
+                _mm_mul_ps(x2,
+                           _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
+        b = _mm_add_ps(
+            const1,
+            _mm_mul_ps(
+                x2,
+                _mm_add_ps(const4,
+                           _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
+
+        cVal = _mm_div_ps(a, b);
+
+        _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+        aPtr += 4;
+        cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        if (*aPtr > 4.97)
+            *cPtr++ = 1;
+        else if (*aPtr <= -4.97)
+            *cPtr++ = -1;
+        else {
+            float x2 = (*aPtr) * (*aPtr);
+            float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+            float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+            *cPtr++ = a / b;
+            aPtr++;
+        }
      }
-  }
  }
  #endif /* LV_HAVE_SSE */
  
@@ -176,52 +182,65 @@ volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector,
  #include <immintrin.h>
  
  static inline void
-volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector,
-                        unsigned int num_points)
+volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-
-  __m256 aVal, cVal, x2, a, b;
-  __m256 const1, const2, const3, const4, const5, const6;
-  const1 = _mm256_set1_ps(135135.0f);
-  const2 = _mm256_set1_ps(17325.0f);
-  const3 = _mm256_set1_ps(378.0f);
-  const4 = _mm256_set1_ps(62370.0f);
-  const5 = _mm256_set1_ps(3150.0f);
-  const6 = _mm256_set1_ps(28.0f);
-  for(;number < eighthPoints; number++){
-
-    aVal = _mm256_load_ps(aPtr);
-    x2 = _mm256_mul_ps(aVal, aVal);
-    a  = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
-    b  = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
-
-    cVal = _mm256_div_ps(a, b);
-
-    _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
-
-    aPtr += 8;
-    cPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++) {
-    if(*aPtr > 4.97)
-      *cPtr++ = 1;
-    else if(*aPtr <= -4.97)
-      *cPtr++ = -1;
-    else {
-      float x2 = (*aPtr) * (*aPtr);
-      float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
-      float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
-      *cPtr++ = a / b;
-      aPtr++;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+
+    __m256 aVal, cVal, x2, a, b;
+    __m256 const1, const2, const3, const4, const5, const6;
+    const1 = _mm256_set1_ps(135135.0f);
+    const2 = _mm256_set1_ps(17325.0f);
+    const3 = _mm256_set1_ps(378.0f);
+    const4 = _mm256_set1_ps(62370.0f);
+    const5 = _mm256_set1_ps(3150.0f);
+    const6 = _mm256_set1_ps(28.0f);
+    for (; number < eighthPoints; number++) {
+
+        aVal = _mm256_load_ps(aPtr);
+        x2 = _mm256_mul_ps(aVal, aVal);
+        a = _mm256_mul_ps(
+            aVal,
+            _mm256_add_ps(
+                const1,
+                _mm256_mul_ps(
+                    x2,
+                    _mm256_add_ps(const2,
+                                  _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
+        b = _mm256_add_ps(
+            const1,
+            _mm256_mul_ps(
+                x2,
+                _mm256_add_ps(
+                    const4,
+                    _mm256_mul_ps(x2,
+                                  _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
+
+        cVal = _mm256_div_ps(a, b);
+
+        _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
+
+        aPtr += 8;
+        cPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        if (*aPtr > 4.97)
+            *cPtr++ = 1;
+        else if (*aPtr <= -4.97)
+            *cPtr++ = -1;
+        else {
+            float x2 = (*aPtr) * (*aPtr);
+            float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+            float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+            *cPtr++ = a / b;
+            aPtr++;
+        }
      }
-  }
  }
  #endif /* LV_HAVE_AVX */
  
@@ -229,52 +248,55 @@ volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector,
  #include <immintrin.h>
  
  static inline void
-volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector,
-                        unsigned int num_points)
+volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector, unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-
-  __m256 aVal, cVal, x2, a, b;
-  __m256 const1, const2, const3, const4, const5, const6;
-  const1 = _mm256_set1_ps(135135.0f);
-  const2 = _mm256_set1_ps(17325.0f);
-  const3 = _mm256_set1_ps(378.0f);
-  const4 = _mm256_set1_ps(62370.0f);
-  const5 = _mm256_set1_ps(3150.0f);
-  const6 = _mm256_set1_ps(28.0f);
-  for(;number < eighthPoints; number++){
-
-    aVal = _mm256_load_ps(aPtr);
-    x2 = _mm256_mul_ps(aVal, aVal);
-    a  = _mm256_mul_ps(aVal, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2),const1));
-    b  = _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);
-
-    cVal = _mm256_div_ps(a, b);
-
-    _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
-
-    aPtr += 8;
-    cPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++) {
-    if(*aPtr > 4.97)
-      *cPtr++ = 1;
-    else if(*aPtr <= -4.97)
-      *cPtr++ = -1;
-    else {
-      float x2 = (*aPtr) * (*aPtr);
-      float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
-      float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
-      *cPtr++ = a / b;
-      aPtr++;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+
+    __m256 aVal, cVal, x2, a, b;
+    __m256 const1, const2, const3, const4, const5, const6;
+    const1 = _mm256_set1_ps(135135.0f);
+    const2 = _mm256_set1_ps(17325.0f);
+    const3 = _mm256_set1_ps(378.0f);
+    const4 = _mm256_set1_ps(62370.0f);
+    const5 = _mm256_set1_ps(3150.0f);
+    const6 = _mm256_set1_ps(28.0f);
+    for (; number < eighthPoints; number++) {
+
+        aVal = _mm256_load_ps(aPtr);
+        x2 = _mm256_mul_ps(aVal, aVal);
+        a = _mm256_mul_ps(
+            aVal,
+            _mm256_fmadd_ps(
+                x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
+        b = _mm256_fmadd_ps(
+            x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);
+
+        cVal = _mm256_div_ps(a, b);
+
+        _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
+
+        aPtr += 8;
+        cPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        if (*aPtr > 4.97)
+            *cPtr++ = 1;
+        else if (*aPtr <= -4.97)
+            *cPtr++ = -1;
+        else {
+            float x2 = (*aPtr) * (*aPtr);
+            float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+            float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+            *cPtr++ = a / b;
+            aPtr++;
+        }
      }
-  }
  }
  #endif /* LV_HAVE_AVX && LV_HAVE_FMA */
  
@@ -285,8 +307,8 @@ volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector,
  #define INCLUDED_volk_32f_tanh_32f_u_H
  
  #include <inttypes.h>
-#include <stdio.h>
  #include <math.h>
+#include <stdio.h>
  #include <string.h>
  
  
@@ -294,52 +316,61 @@ volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector,
  #include <xmmintrin.h>
  
  static inline void
-volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector,
-                        unsigned int num_points)
+volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector, unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-
-  __m128 aVal, cVal, x2, a, b;
-  __m128 const1, const2, const3, const4, const5, const6;
-  const1 = _mm_set_ps1(135135.0f);
-  const2 = _mm_set_ps1(17325.0f);
-  const3 = _mm_set_ps1(378.0f);
-  const4 = _mm_set_ps1(62370.0f);
-  const5 = _mm_set_ps1(3150.0f);
-  const6 = _mm_set_ps1(28.0f);
-  for(;number < quarterPoints; number++){
-
-    aVal = _mm_loadu_ps(aPtr);
-    x2 = _mm_mul_ps(aVal, aVal);
-    a  = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
-    b  = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
-
-    cVal = _mm_div_ps(a, b);
-
-    _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
-
-    aPtr += 4;
-    cPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(;number < num_points; number++) {
-    if(*aPtr > 4.97)
-      *cPtr++ = 1;
-    else if(*aPtr <= -4.97)
-      *cPtr++ = -1;
-    else {
-      float x2 = (*aPtr) * (*aPtr);
-      float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
-      float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
-      *cPtr++ = a / b;
-      aPtr++;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+
+    __m128 aVal, cVal, x2, a, b;
+    __m128 const1, const2, const3, const4, const5, const6;
+    const1 = _mm_set_ps1(135135.0f);
+    const2 = _mm_set_ps1(17325.0f);
+    const3 = _mm_set_ps1(378.0f);
+    const4 = _mm_set_ps1(62370.0f);
+    const5 = _mm_set_ps1(3150.0f);
+    const6 = _mm_set_ps1(28.0f);
+    for (; number < quarterPoints; number++) {
+
+        aVal = _mm_loadu_ps(aPtr);
+        x2 = _mm_mul_ps(aVal, aVal);
+        a = _mm_mul_ps(
+            aVal,
+            _mm_add_ps(
+                const1,
+                _mm_mul_ps(x2,
+                           _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
+        b = _mm_add_ps(
+            const1,
+            _mm_mul_ps(
+                x2,
+                _mm_add_ps(const4,
+                           _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
+
+        cVal = _mm_div_ps(a, b);
+
+        _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+        aPtr += 4;
+        cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        if (*aPtr > 4.97)
+            *cPtr++ = 1;
+        else if (*aPtr <= -4.97)
+            *cPtr++ = -1;
+        else {
+            float x2 = (*aPtr) * (*aPtr);
+            float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+            float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+            *cPtr++ = a / b;
+            aPtr++;
+        }
      }
-  }
  }
  #endif /* LV_HAVE_SSE */
  
@@ -348,52 +379,65 @@ volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector,
  #include <immintrin.h>
  
  static inline void
-volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector,
-                        unsigned int num_points)
+volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-
-  __m256 aVal, cVal, x2, a, b;
-  __m256 const1, const2, const3, const4, const5, const6;
-  const1 = _mm256_set1_ps(135135.0f);
-  const2 = _mm256_set1_ps(17325.0f);
-  const3 = _mm256_set1_ps(378.0f);
-  const4 = _mm256_set1_ps(62370.0f);
-  const5 = _mm256_set1_ps(3150.0f);
-  const6 = _mm256_set1_ps(28.0f);
-  for(;number < eighthPoints; number++){
-
-    aVal = _mm256_loadu_ps(aPtr);
-    x2 = _mm256_mul_ps(aVal, aVal);
-    a  = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
-    b  = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
-
-    cVal = _mm256_div_ps(a, b);
-
-    _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
-
-    aPtr += 8;
-    cPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++) {
-    if(*aPtr > 4.97)
-      *cPtr++ = 1;
-    else if(*aPtr <= -4.97)
-      *cPtr++ = -1;
-    else {
-      float x2 = (*aPtr) * (*aPtr);
-      float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
-      float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
-      *cPtr++ = a / b;
-      aPtr++;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+
+    __m256 aVal, cVal, x2, a, b;
+    __m256 const1, const2, const3, const4, const5, const6;
+    const1 = _mm256_set1_ps(135135.0f);
+    const2 = _mm256_set1_ps(17325.0f);
+    const3 = _mm256_set1_ps(378.0f);
+    const4 = _mm256_set1_ps(62370.0f);
+    const5 = _mm256_set1_ps(3150.0f);
+    const6 = _mm256_set1_ps(28.0f);
+    for (; number < eighthPoints; number++) {
+
+        aVal = _mm256_loadu_ps(aPtr);
+        x2 = _mm256_mul_ps(aVal, aVal);
+        a = _mm256_mul_ps(
+            aVal,
+            _mm256_add_ps(
+                const1,
+                _mm256_mul_ps(
+                    x2,
+                    _mm256_add_ps(const2,
+                                  _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
+        b = _mm256_add_ps(
+            const1,
+            _mm256_mul_ps(
+                x2,
+                _mm256_add_ps(
+                    const4,
+                    _mm256_mul_ps(x2,
+                                  _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
+
+        cVal = _mm256_div_ps(a, b);
+
+        _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+        aPtr += 8;
+        cPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        if (*aPtr > 4.97)
+            *cPtr++ = 1;
+        else if (*aPtr <= -4.97)
+            *cPtr++ = -1;
+        else {
+            float x2 = (*aPtr) * (*aPtr);
+            float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+            float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+            *cPtr++ = a / b;
+            aPtr++;
+        }
      }
-  }
  }
  #endif /* LV_HAVE_AVX */
  
@@ -401,52 +445,55 @@ volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector,
  #include <immintrin.h>
  
  static inline void
-volk_32f_tanh_32f_u_avx_fma(float* cVector, const float* aVector,
-                        unsigned int num_points)
+volk_32f_tanh_32f_u_avx_fma(float* cVector, const float* aVector, unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-
-  __m256 aVal, cVal, x2, a, b;
-  __m256 const1, const2, const3, const4, const5, const6;
-  const1 = _mm256_set1_ps(135135.0f);
-  const2 = _mm256_set1_ps(17325.0f);
-  const3 = _mm256_set1_ps(378.0f);
-  const4 = _mm256_set1_ps(62370.0f);
-  const5 = _mm256_set1_ps(3150.0f);
-  const6 = _mm256_set1_ps(28.0f);
-  for(;number < eighthPoints; number++){
-
-    aVal = _mm256_loadu_ps(aPtr);
-    x2 = _mm256_mul_ps(aVal, aVal);
-    a  = _mm256_mul_ps(aVal, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2),const1));
-    b  = _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);
-
-    cVal = _mm256_div_ps(a, b);
-
-    _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
-
-    aPtr += 8;
-    cPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++) {
-    if(*aPtr > 4.97)
-      *cPtr++ = 1;
-    else if(*aPtr <= -4.97)
-      *cPtr++ = -1;
-    else {
-      float x2 = (*aPtr) * (*aPtr);
-      float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
-      float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
-      *cPtr++ = a / b;
-      aPtr++;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+
+    __m256 aVal, cVal, x2, a, b;
+    __m256 const1, const2, const3, const4, const5, const6;
+    const1 = _mm256_set1_ps(135135.0f);
+    const2 = _mm256_set1_ps(17325.0f);
+    const3 = _mm256_set1_ps(378.0f);
+    const4 = _mm256_set1_ps(62370.0f);
+    const5 = _mm256_set1_ps(3150.0f);
+    const6 = _mm256_set1_ps(28.0f);
+    for (; number < eighthPoints; number++) {
+
+        aVal = _mm256_loadu_ps(aPtr);
+        x2 = _mm256_mul_ps(aVal, aVal);
+        a = _mm256_mul_ps(
+            aVal,
+            _mm256_fmadd_ps(
+                x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
+        b = _mm256_fmadd_ps(
+            x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);
+
+        cVal = _mm256_div_ps(a, b);
+
+        _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+        aPtr += 8;
+        cPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        if (*aPtr > 4.97)
+            *cPtr++ = 1;
+        else if (*aPtr <= -4.97)
+            *cPtr++ = -1;
+        else {
+            float x2 = (*aPtr) * (*aPtr);
+            float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+            float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+            *cPtr++ = a / b;
+            aPtr++;
+        }
      }
-  }
  }
  #endif /* LV_HAVE_AVX && LV_HAVE_FMA */
  
diff --git a/kernels/volk/volk_32f_x2_add_32f.h b/kernels/volk/volk_32f_x2_add_32f.h

index ce18092877d48b5ea18a12df7966a472221afd1b..e4b7e93cfdb77753e9817621ac4739111d659498 100644 (file)
--- a/kernels/volk/volk_32f_x2_add_32f.h
+++ b/kernels/volk/volk_32f_x2_add_32f.h
@@ -31,8 +31,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32f_x2_add_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
- * \endcode
+ * void volk_32f_x2_add_32f(float* cVector, const float* aVector, const float* bVector,
+ * unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li aVector: First vector of input points.
@@ -44,7 +44,8 @@
   *
   * \b Example
   *
- * The follow example adds the increasing and decreasing vectors such that the result of every summation pair is 10
+ * The follow example adds the increasing and decreasing vectors such that the result of
+ * every summation pair is 10
   *
   * \code
   *   int N = 10;
@@ -79,37 +80,38 @@
  #ifdef LV_HAVE_AVX512F
  #include <immintrin.h>
  
-static inline void
-volk_32f_x2_add_32f_u_avx512f(float* cVector, const float* aVector,
-                          const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_add_32f_u_avx512f(float* cVector,
+                                                 const float* aVector,
+                                                 const float* bVector,
+                                                 unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m512 aVal, bVal, cVal;
-  for(;number < sixteenthPoints; number++){
+    __m512 aVal, bVal, cVal;
+    for (; number < sixteenthPoints; number++) {
  
-    aVal = _mm512_loadu_ps(aPtr);
-    bVal = _mm512_loadu_ps(bPtr);
+        aVal = _mm512_loadu_ps(aPtr);
+        bVal = _mm512_loadu_ps(bPtr);
  
-    cVal = _mm512_add_ps(aVal, bVal);
+        cVal = _mm512_add_ps(aVal, bVal);
  
-    _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container
+        _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 16;
-    bPtr += 16;
-    cPtr += 16;
-  }
+        aPtr += 16;
+        bPtr += 16;
+        cPtr += 16;
+    }
  
-  number = sixteenthPoints * 16;
+    number = sixteenthPoints * 16;
  
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) + (*bPtr++);
-  }
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX512F */
@@ -118,35 +120,36 @@ volk_32f_x2_add_32f_u_avx512f(float* cVector, const float* aVector,
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32f_x2_add_32f_u_avx(float* cVector, const float* aVector,
-                          const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_add_32f_u_avx(float* cVector,
+                                             const float* aVector,
+                                             const float* bVector,
+                                             unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
-  __m256 aVal, bVal, cVal;
-  for(;number < eighthPoints; number++){
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
+    __m256 aVal, bVal, cVal;
+    for (; number < eighthPoints; number++) {
  
-    aVal = _mm256_loadu_ps(aPtr);
-    bVal = _mm256_loadu_ps(bPtr);
+        aVal = _mm256_loadu_ps(aPtr);
+        bVal = _mm256_loadu_ps(bPtr);
  
-    cVal = _mm256_add_ps(aVal, bVal);
+        cVal = _mm256_add_ps(aVal, bVal);
  
-    _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
+        _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 8;
-    bPtr += 8;
-    cPtr += 8;
-  }
+        aPtr += 8;
+        bPtr += 8;
+        cPtr += 8;
+    }
  
-  number = eighthPoints * 8;
+    number = eighthPoints * 8;
  
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) + (*bPtr++);
-  }
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
@@ -154,54 +157,56 @@ volk_32f_x2_add_32f_u_avx(float* cVector, const float* aVector,
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_32f_x2_add_32f_u_sse(float* cVector, const float* aVector,
-                          const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_add_32f_u_sse(float* cVector,
+                                             const float* aVector,
+                                             const float* bVector,
+                                             unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m128 aVal, bVal, cVal;
-  for(;number < quarterPoints; number++){
+    __m128 aVal, bVal, cVal;
+    for (; number < quarterPoints; number++) {
  
-    aVal = _mm_loadu_ps(aPtr);
-    bVal = _mm_loadu_ps(bPtr);
+        aVal = _mm_loadu_ps(aPtr);
+        bVal = _mm_loadu_ps(bPtr);
  
-    cVal = _mm_add_ps(aVal, bVal);
+        cVal = _mm_add_ps(aVal, bVal);
  
-    _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container
+        _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 4;
-    bPtr += 4;
-    cPtr += 4;
-  }
+        aPtr += 4;
+        bPtr += 4;
+        cPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) + (*bPtr++);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_SSE */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32f_x2_add_32f_generic(float* cVector, const float* aVector,
-                            const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_add_32f_generic(float* cVector,
+                                               const float* aVector,
+                                               const float* bVector,
+                                               unsigned int num_points)
  {
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *cPtr++ = (*aPtr++) + (*bPtr++);
-  }
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -216,37 +221,38 @@ volk_32f_x2_add_32f_generic(float* cVector, const float* aVector,
  #ifdef LV_HAVE_AVX512F
  #include <immintrin.h>
  
-static inline void
-volk_32f_x2_add_32f_a_avx512f(float* cVector, const float* aVector,
-                          const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_add_32f_a_avx512f(float* cVector,
+                                                 const float* aVector,
+                                                 const float* bVector,
+                                                 unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m512 aVal, bVal, cVal;
-  for(;number < sixteenthPoints; number++){
+    __m512 aVal, bVal, cVal;
+    for (; number < sixteenthPoints; number++) {
  
-    aVal = _mm512_load_ps(aPtr);
-    bVal = _mm512_load_ps(bPtr);
+        aVal = _mm512_load_ps(aPtr);
+        bVal = _mm512_load_ps(bPtr);
  
-    cVal = _mm512_add_ps(aVal, bVal);
+        cVal = _mm512_add_ps(aVal, bVal);
  
-    _mm512_store_ps(cPtr,cVal); // Store the results back into the C container
+        _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 16;
-    bPtr += 16;
-    cPtr += 16;
-  }
+        aPtr += 16;
+        bPtr += 16;
+        cPtr += 16;
+    }
  
-  number = sixteenthPoints * 16;
+    number = sixteenthPoints * 16;
  
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) + (*bPtr++);
-  }
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX512F */
@@ -255,70 +261,73 @@ volk_32f_x2_add_32f_a_avx512f(float* cVector, const float* aVector,
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32f_x2_add_32f_a_avx(float* cVector, const float* aVector,
-                          const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_add_32f_a_avx(float* cVector,
+                                             const float* aVector,
+                                             const float* bVector,
+                                             unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m256 aVal, bVal, cVal;
-  for(;number < eighthPoints; number++){
+    __m256 aVal, bVal, cVal;
+    for (; number < eighthPoints; number++) {
  
-    aVal = _mm256_load_ps(aPtr);
-    bVal = _mm256_load_ps(bPtr);
+        aVal = _mm256_load_ps(aPtr);
+        bVal = _mm256_load_ps(bPtr);
  
-    cVal = _mm256_add_ps(aVal, bVal);
+        cVal = _mm256_add_ps(aVal, bVal);
  
-    _mm256_store_ps(cPtr,cVal); // Store the results back into the C container
+        _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 8;
-    bPtr += 8;
-    cPtr += 8;
-  }
+        aPtr += 8;
+        bPtr += 8;
+        cPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) + (*bPtr++);
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_32f_x2_add_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_add_32f_a_sse(float* cVector,
+                                             const float* aVector,
+                                             const float* bVector,
+                                             unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m128 aVal, bVal, cVal;
-  for(;number < quarterPoints; number++){
-    aVal = _mm_load_ps(aPtr);
-    bVal = _mm_load_ps(bPtr);
+    __m128 aVal, bVal, cVal;
+    for (; number < quarterPoints; number++) {
+        aVal = _mm_load_ps(aPtr);
+        bVal = _mm_load_ps(bPtr);
  
-    cVal = _mm_add_ps(aVal, bVal);
+        cVal = _mm_add_ps(aVal, bVal);
  
-    _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+        _mm_store_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 4;
-    bPtr += 4;
-    cPtr += 4;
-  }
+        aPtr += 4;
+        bPtr += 4;
+        cPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) + (*bPtr++);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_SSE */
  
@@ -326,78 +335,89 @@ volk_32f_x2_add_32f_a_sse(float* cVector, const float* aVector, const float* bVe
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void
-volk_32f_x2_add_32f_u_neon(float* cVector, const float* aVector,
-                           const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_add_32f_u_neon(float* cVector,
+                                              const float* aVector,
+                                              const float* bVector,
+                                              unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
-  float32x4_t aVal, bVal, cVal;
-  for(number=0; number < quarterPoints; number++){
-    // Load in to NEON registers
-    aVal = vld1q_f32(aPtr);
-    bVal = vld1q_f32(bPtr);
-    __VOLK_PREFETCH(aPtr+4);
-    __VOLK_PREFETCH(bPtr+4);
-
-    // vector add
-    cVal = vaddq_f32(aVal, bVal);
-    // Store the results back into the C container
-    vst1q_f32(cPtr,cVal);
-
-    aPtr += 4; // q uses quadwords, 4 floats per vadd
-    bPtr += 4;
-    cPtr += 4;
-  }
-
-  number = quarterPoints * 4; // should be = num_points
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) + (*bPtr++);
-  }
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
+    float32x4_t aVal, bVal, cVal;
+    for (number = 0; number < quarterPoints; number++) {
+        // Load in to NEON registers
+        aVal = vld1q_f32(aPtr);
+        bVal = vld1q_f32(bPtr);
+        __VOLK_PREFETCH(aPtr + 4);
+        __VOLK_PREFETCH(bPtr + 4);
+
+        // vector add
+        cVal = vaddq_f32(aVal, bVal);
+        // Store the results back into the C container
+        vst1q_f32(cPtr, cVal);
+
+        aPtr += 4; // q uses quadwords, 4 floats per vadd
+        bPtr += 4;
+        cPtr += 4;
+    }
+
+    number = quarterPoints * 4; // should be = num_points
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
  }
  
  #endif /* LV_HAVE_NEON */
  
  #ifdef LV_HAVE_NEONV7
-extern void volk_32f_x2_add_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+extern void volk_32f_x2_add_32f_a_neonasm(float* cVector,
+                                          const float* aVector,
+                                          const float* bVector,
+                                          unsigned int num_points);
  #endif /* LV_HAVE_NEONV7 */
  
  #ifdef LV_HAVE_NEONV7
-extern void volk_32f_x2_add_32f_a_neonpipeline(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+extern void volk_32f_x2_add_32f_a_neonpipeline(float* cVector,
+                                               const float* aVector,
+                                               const float* bVector,
+                                               unsigned int num_points);
  #endif /* LV_HAVE_NEONV7 */
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32f_x2_add_32f_a_generic(float* cVector, const float* aVector,
-                              const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_add_32f_a_generic(float* cVector,
+                                                 const float* aVector,
+                                                 const float* bVector,
+                                                 unsigned int num_points)
  {
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *cPtr++ = (*aPtr++) + (*bPtr++);
-  }
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  
  #ifdef LV_HAVE_ORC
  
-extern void
-volk_32f_x2_add_32f_a_orc_impl(float* cVector, const float* aVector,
-                               const float* bVector, unsigned int num_points);
+extern void volk_32f_x2_add_32f_a_orc_impl(float* cVector,
+                                           const float* aVector,
+                                           const float* bVector,
+                                           unsigned int num_points);
  
-static inline void
-volk_32f_x2_add_32f_u_orc(float* cVector, const float* aVector,
-                          const float* bVector, unsigned int num_points){
-  volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+static inline void volk_32f_x2_add_32f_u_orc(float* cVector,
+                                             const float* aVector,
+                                             const float* bVector,
+                                             unsigned int num_points)
+{
+    volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
  }
  
  #endif /* LV_HAVE_ORC */
diff --git a/kernels/volk/volk_32f_x2_divide_32f.h b/kernels/volk/volk_32f_x2_divide_32f.h

index 130767f2692d7b8c9ab6c0ad16f9d347ef72bd60..8b803654ee9fdbf6104ad4e524c1aee5534ce154 100644 (file)
--- a/kernels/volk/volk_32f_x2_divide_32f.h
+++ b/kernels/volk/volk_32f_x2_divide_32f.h
@@ -31,8 +31,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32f_x2_divide_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
- * \endcode
+ * void volk_32f_x2_divide_32f(float* cVector, const float* aVector, const float* bVector,
+ * unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li aVector: First vector of input points.
@@ -77,35 +77,36 @@
  #ifdef LV_HAVE_AVX512F
  #include <immintrin.h>
  
-static inline void
-volk_32f_x2_divide_32f_a_avx512f(float* cVector, const float* aVector,
-                             const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_divide_32f_a_avx512f(float* cVector,
+                                                    const float* aVector,
+                                                    const float* bVector,
+                                                    unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m512 aVal, bVal, cVal;
-  for(;number < sixteenthPoints; number++){
-    aVal = _mm512_load_ps(aPtr);
-    bVal = _mm512_load_ps(bPtr);
+    __m512 aVal, bVal, cVal;
+    for (; number < sixteenthPoints; number++) {
+        aVal = _mm512_load_ps(aPtr);
+        bVal = _mm512_load_ps(bPtr);
  
-    cVal = _mm512_div_ps(aVal, bVal);
+        cVal = _mm512_div_ps(aVal, bVal);
  
-    _mm512_store_ps(cPtr,cVal); // Store the results back into the C container
+        _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 16;
-    bPtr += 16;
-    cPtr += 16;
-  }
+        aPtr += 16;
+        bPtr += 16;
+        cPtr += 16;
+    }
  
-  number = sixteenthPoints * 16;
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) / (*bPtr++);
-  }
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) / (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_AVX512F */
  
@@ -113,35 +114,36 @@ volk_32f_x2_divide_32f_a_avx512f(float* cVector, const float* aVector,
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32f_x2_divide_32f_a_avx(float* cVector, const float* aVector,
-                             const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_divide_32f_a_avx(float* cVector,
+                                                const float* aVector,
+                                                const float* bVector,
+                                                unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m256 aVal, bVal, cVal;
-  for(;number < eighthPoints; number++){
-    aVal = _mm256_load_ps(aPtr);
-    bVal = _mm256_load_ps(bPtr);
+    __m256 aVal, bVal, cVal;
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_load_ps(aPtr);
+        bVal = _mm256_load_ps(bPtr);
  
-    cVal = _mm256_div_ps(aVal, bVal);
+        cVal = _mm256_div_ps(aVal, bVal);
  
-    _mm256_store_ps(cPtr,cVal); // Store the results back into the C container
+        _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 8;
-    bPtr += 8;
-    cPtr += 8;
-  }
+        aPtr += 8;
+        bPtr += 8;
+        cPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) / (*bPtr++);
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) / (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
@@ -149,35 +151,36 @@ volk_32f_x2_divide_32f_a_avx(float* cVector, const float* aVector,
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_32f_x2_divide_32f_a_sse(float* cVector, const float* aVector,
-                             const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_divide_32f_a_sse(float* cVector,
+                                                const float* aVector,
+                                                const float* bVector,
+                                                unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m128 aVal, bVal, cVal;
-  for(;number < quarterPoints; number++){
-    aVal = _mm_load_ps(aPtr);
-    bVal = _mm_load_ps(bPtr);
+    __m128 aVal, bVal, cVal;
+    for (; number < quarterPoints; number++) {
+        aVal = _mm_load_ps(aPtr);
+        bVal = _mm_load_ps(bPtr);
  
-    cVal = _mm_div_ps(aVal, bVal);
+        cVal = _mm_div_ps(aVal, bVal);
  
-    _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+        _mm_store_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 4;
-    bPtr += 4;
-    cPtr += 4;
-  }
+        aPtr += 4;
+        bPtr += 4;
+        cPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) / (*bPtr++);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) / (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_SSE */
  
@@ -185,54 +188,55 @@ volk_32f_x2_divide_32f_a_sse(float* cVector, const float* aVector,
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void
-volk_32f_x2_divide_32f_neon(float* cVector, const float* aVector,
-                           const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_divide_32f_neon(float* cVector,
+                                               const float* aVector,
+                                               const float* bVector,
+                                               unsigned int num_points)
  {
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr = bVector;
-
-  float32x4x4_t aVal, bVal, bInv, cVal;
-
-  const unsigned int eighthPoints = num_points / 16;
-  unsigned int number = 0;
-  for(; number < eighthPoints; number++){
-    aVal = vld4q_f32(aPtr);
-    aPtr += 16;
-    bVal = vld4q_f32(bPtr);
-    bPtr += 16;
-
-    __VOLK_PREFETCH(aPtr+16);
-    __VOLK_PREFETCH(bPtr+16);
-
-    bInv.val[0] = vrecpeq_f32(bVal.val[0]);
-    bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0]));
-    bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0]));
-    cVal.val[0] = vmulq_f32(aVal.val[0], bInv.val[0]);
-
-    bInv.val[1] = vrecpeq_f32(bVal.val[1]);
-    bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1]));
-    bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1]));
-    cVal.val[1] = vmulq_f32(aVal.val[1], bInv.val[1]);
-
-    bInv.val[2] = vrecpeq_f32(bVal.val[2]);
-    bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2]));
-    bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2]));
-    cVal.val[2] = vmulq_f32(aVal.val[2], bInv.val[2]);
-
-    bInv.val[3] = vrecpeq_f32(bVal.val[3]);
-    bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3]));
-    bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3]));
-    cVal.val[3] = vmulq_f32(aVal.val[3], bInv.val[3]);
-
-    vst4q_f32(cPtr, cVal);
-    cPtr += 16;
-  }
-
-  for(number = eighthPoints * 16; number < num_points; number++){
-    *cPtr++ = (*aPtr++) / (*bPtr++);
-  }
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
+
+    float32x4x4_t aVal, bVal, bInv, cVal;
+
+    const unsigned int eighthPoints = num_points / 16;
+    unsigned int number = 0;
+    for (; number < eighthPoints; number++) {
+        aVal = vld4q_f32(aPtr);
+        aPtr += 16;
+        bVal = vld4q_f32(bPtr);
+        bPtr += 16;
+
+        __VOLK_PREFETCH(aPtr + 16);
+        __VOLK_PREFETCH(bPtr + 16);
+
+        bInv.val[0] = vrecpeq_f32(bVal.val[0]);
+        bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0]));
+        bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0]));
+        cVal.val[0] = vmulq_f32(aVal.val[0], bInv.val[0]);
+
+        bInv.val[1] = vrecpeq_f32(bVal.val[1]);
+        bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1]));
+        bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1]));
+        cVal.val[1] = vmulq_f32(aVal.val[1], bInv.val[1]);
+
+        bInv.val[2] = vrecpeq_f32(bVal.val[2]);
+        bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2]));
+        bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2]));
+        cVal.val[2] = vmulq_f32(aVal.val[2], bInv.val[2]);
+
+        bInv.val[3] = vrecpeq_f32(bVal.val[3]);
+        bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3]));
+        bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3]));
+        cVal.val[3] = vmulq_f32(aVal.val[3], bInv.val[3]);
+
+        vst4q_f32(cPtr, cVal);
+        cPtr += 16;
+    }
+
+    for (number = eighthPoints * 16; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) / (*bPtr++);
+    }
  }
  
  #endif /* LV_HAVE_NEON */
@@ -240,38 +244,40 @@ volk_32f_x2_divide_32f_neon(float* cVector, const float* aVector,
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32f_x2_divide_32f_generic(float* cVector, const float* aVector,
-                               const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_divide_32f_generic(float* cVector,
+                                                  const float* aVector,
+                                                  const float* bVector,
+                                                  unsigned int num_points)
  {
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *cPtr++ = (*aPtr++) / (*bPtr++);
-  }
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) / (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  
  #ifdef LV_HAVE_ORC
  
-extern void
-volk_32f_x2_divide_32f_a_orc_impl(float* cVector, const float* aVector,
-                                  const float* bVector, unsigned int num_points);
+extern void volk_32f_x2_divide_32f_a_orc_impl(float* cVector,
+                                              const float* aVector,
+                                              const float* bVector,
+                                              unsigned int num_points);
  
-static inline void
-volk_32f_x2_divide_32f_u_orc(float* cVector, const float* aVector,
-                             const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_divide_32f_u_orc(float* cVector,
+                                                const float* aVector,
+                                                const float* bVector,
+                                                unsigned int num_points)
  {
-  volk_32f_x2_divide_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+    volk_32f_x2_divide_32f_a_orc_impl(cVector, aVector, bVector, num_points);
  }
  #endif /* LV_HAVE_ORC */
  
  
-
  #endif /* INCLUDED_volk_32f_x2_divide_32f_a_H */
  
  
@@ -284,35 +290,36 @@ volk_32f_x2_divide_32f_u_orc(float* cVector, const float* aVector,
  #ifdef LV_HAVE_AVX512F
  #include <immintrin.h>
  
-static inline void
-volk_32f_x2_divide_32f_u_avx512f(float* cVector, const float* aVector,
-                             const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_divide_32f_u_avx512f(float* cVector,
+                                                    const float* aVector,
+                                                    const float* bVector,
+                                                    unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m512 aVal, bVal, cVal;
-  for(;number < sixteenthPoints; number++){
-    aVal = _mm512_loadu_ps(aPtr);
-    bVal = _mm512_loadu_ps(bPtr);
+    __m512 aVal, bVal, cVal;
+    for (; number < sixteenthPoints; number++) {
+        aVal = _mm512_loadu_ps(aPtr);
+        bVal = _mm512_loadu_ps(bPtr);
  
-    cVal = _mm512_div_ps(aVal, bVal);
+        cVal = _mm512_div_ps(aVal, bVal);
  
-    _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container
+        _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 16;
-    bPtr += 16;
-    cPtr += 16;
-  }
+        aPtr += 16;
+        bPtr += 16;
+        cPtr += 16;
+    }
  
-  number = sixteenthPoints * 16;
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) / (*bPtr++);
-  }
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) / (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_AVX512F */
  
@@ -320,35 +327,36 @@ volk_32f_x2_divide_32f_u_avx512f(float* cVector, const float* aVector,
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32f_x2_divide_32f_u_avx(float* cVector, const float* aVector,
-                             const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_divide_32f_u_avx(float* cVector,
+                                                const float* aVector,
+                                                const float* bVector,
+                                                unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m256 aVal, bVal, cVal;
-  for(;number < eighthPoints; number++){
-    aVal = _mm256_loadu_ps(aPtr);
-    bVal = _mm256_loadu_ps(bPtr);
+    __m256 aVal, bVal, cVal;
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_loadu_ps(aPtr);
+        bVal = _mm256_loadu_ps(bPtr);
  
-    cVal = _mm256_div_ps(aVal, bVal);
+        cVal = _mm256_div_ps(aVal, bVal);
  
-    _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
+        _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 8;
-    bPtr += 8;
-    cPtr += 8;
-  }
+        aPtr += 8;
+        bPtr += 8;
+        cPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) / (*bPtr++);
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) / (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
diff --git a/kernels/volk/volk_32f_x2_dot_prod_16i.h b/kernels/volk/volk_32f_x2_dot_prod_16i.h

index c1b5a8227d73093fa2c0384120c9a6644eba098c..4da7db65986fcf676a6114a5ae2eb65e683f6b24 100644 (file)
--- a/kernels/volk/volk_32f_x2_dot_prod_16i.h
+++ b/kernels/volk/volk_32f_x2_dot_prod_16i.h
@@ -33,8 +33,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32f_x2_dot_prod_16i(int16_t* result, const float* input, const float* taps, unsigned int num_points)
- * \endcode
+ * void volk_32f_x2_dot_prod_16i(int16_t* result, const float* input, const float* taps,
+ * unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li input: vector of floats.
@@ -58,25 +58,29 @@
  #ifndef INCLUDED_volk_32f_x2_dot_prod_16i_H
  #define INCLUDED_volk_32f_x2_dot_prod_16i_H
  
-#include <volk/volk_common.h>
  #include <stdio.h>
+#include <volk/volk_common.h>
  
  
  #ifdef LV_HAVE_GENERIC
  
  
-static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result,
+                                                    const float* input,
+                                                    const float* taps,
+                                                    unsigned int num_points)
+{
  
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr=  taps;
-  unsigned int number = 0;
+    float dotProduct = 0;
+    const float* aPtr = input;
+    const float* bPtr = taps;
+    unsigned int number = 0;
  
-  for(number = 0; number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
+    for (number = 0; number < num_points; number++) {
+        dotProduct += ((*aPtr++) * (*bPtr++));
+    }
  
-  *result = (int16_t)dotProduct;
+    *result = (int16_t)dotProduct;
  }
  
  #endif /*LV_HAVE_GENERIC*/
@@ -84,68 +88,73 @@ static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result, const float
  
  #ifdef LV_HAVE_SSE
  
-static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const  float* input, const  float* taps, unsigned int num_points) {
-
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m128 a0Val, a1Val, a2Val, a3Val;
-  __m128 b0Val, b1Val, b2Val, b3Val;
-  __m128 c0Val, c1Val, c2Val, c3Val;
-
-  __m128 dotProdVal0 = _mm_setzero_ps();
-  __m128 dotProdVal1 = _mm_setzero_ps();
-  __m128 dotProdVal2 = _mm_setzero_ps();
-  __m128 dotProdVal3 = _mm_setzero_ps();
-
-  for(;number < sixteenthPoints; number++){
-
-    a0Val = _mm_load_ps(aPtr);
-    a1Val = _mm_load_ps(aPtr+4);
-    a2Val = _mm_load_ps(aPtr+8);
-    a3Val = _mm_load_ps(aPtr+12);
-    b0Val = _mm_load_ps(bPtr);
-    b1Val = _mm_load_ps(bPtr+4);
-    b2Val = _mm_load_ps(bPtr+8);
-    b3Val = _mm_load_ps(bPtr+12);
-
-    c0Val = _mm_mul_ps(a0Val, b0Val);
-    c1Val = _mm_mul_ps(a1Val, b1Val);
-    c2Val = _mm_mul_ps(a2Val, b2Val);
-    c3Val = _mm_mul_ps(a3Val, b3Val);
-
-    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
-    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
-    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
-    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
-
-    aPtr += 16;
-    bPtr += 16;
-  }
-
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
-
-  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
-
-  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-
-  number = sixteenthPoints*16;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = (short)dotProduct;
+static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result,
+                                                  const float* input,
+                                                  const float* taps,
+                                                  unsigned int num_points)
+{
+
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    float dotProduct = 0;
+    const float* aPtr = input;
+    const float* bPtr = taps;
+
+    __m128 a0Val, a1Val, a2Val, a3Val;
+    __m128 b0Val, b1Val, b2Val, b3Val;
+    __m128 c0Val, c1Val, c2Val, c3Val;
+
+    __m128 dotProdVal0 = _mm_setzero_ps();
+    __m128 dotProdVal1 = _mm_setzero_ps();
+    __m128 dotProdVal2 = _mm_setzero_ps();
+    __m128 dotProdVal3 = _mm_setzero_ps();
+
+    for (; number < sixteenthPoints; number++) {
+
+        a0Val = _mm_load_ps(aPtr);
+        a1Val = _mm_load_ps(aPtr + 4);
+        a2Val = _mm_load_ps(aPtr + 8);
+        a3Val = _mm_load_ps(aPtr + 12);
+        b0Val = _mm_load_ps(bPtr);
+        b1Val = _mm_load_ps(bPtr + 4);
+        b2Val = _mm_load_ps(bPtr + 8);
+        b3Val = _mm_load_ps(bPtr + 12);
+
+        c0Val = _mm_mul_ps(a0Val, b0Val);
+        c1Val = _mm_mul_ps(a1Val, b1Val);
+        c2Val = _mm_mul_ps(a2Val, b2Val);
+        c3Val = _mm_mul_ps(a3Val, b3Val);
+
+        dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+        dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+        dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+        dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+        aPtr += 16;
+        bPtr += 16;
+    }
+
+    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+    _mm_store_ps(dotProductVector,
+                 dotProdVal0); // Store the results back into the dot product vector
+
+    dotProduct = dotProductVector[0];
+    dotProduct += dotProductVector[1];
+    dotProduct += dotProductVector[2];
+    dotProduct += dotProductVector[3];
+
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        dotProduct += ((*aPtr++) * (*bPtr++));
+    }
+
+    *result = (short)dotProduct;
  }
  
  #endif /*LV_HAVE_SSE*/
@@ -153,66 +162,71 @@ static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const  float*
  
  #if LV_HAVE_AVX2 && LV_HAVE_FMA
  
-static inline void volk_32f_x2_dot_prod_16i_a_avx2_fma(int16_t* result, const  float* input, const  float* taps, unsigned int num_points) {
-
-  unsigned int number = 0;
-  const unsigned int thirtysecondPoints = num_points / 32;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m256 a0Val, a1Val, a2Val, a3Val;
-  __m256 b0Val, b1Val, b2Val, b3Val;
-
-  __m256 dotProdVal0 = _mm256_setzero_ps();
-  __m256 dotProdVal1 = _mm256_setzero_ps();
-  __m256 dotProdVal2 = _mm256_setzero_ps();
-  __m256 dotProdVal3 = _mm256_setzero_ps();
-
-  for(;number < thirtysecondPoints; number++){
-
-    a0Val = _mm256_load_ps(aPtr);
-    a1Val = _mm256_load_ps(aPtr+8);
-    a2Val = _mm256_load_ps(aPtr+16);
-    a3Val = _mm256_load_ps(aPtr+24);
-    b0Val = _mm256_load_ps(bPtr);
-    b1Val = _mm256_load_ps(bPtr+8);
-    b2Val = _mm256_load_ps(bPtr+16);
-    b3Val = _mm256_load_ps(bPtr+24);
-
-    dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
-    dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
-    dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
-    dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
-
-    aPtr += 32;
-    bPtr += 32;
-  }
-
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
-
-  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
-
-  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-  dotProduct += dotProductVector[4];
-  dotProduct += dotProductVector[5];
-  dotProduct += dotProductVector[6];
-  dotProduct += dotProductVector[7];
-
-  number = thirtysecondPoints*32;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = (short)dotProduct;
+static inline void volk_32f_x2_dot_prod_16i_a_avx2_fma(int16_t* result,
+                                                       const float* input,
+                                                       const float* taps,
+                                                       unsigned int num_points)
+{
+
+    unsigned int number = 0;
+    const unsigned int thirtysecondPoints = num_points / 32;
+
+    float dotProduct = 0;
+    const float* aPtr = input;
+    const float* bPtr = taps;
+
+    __m256 a0Val, a1Val, a2Val, a3Val;
+    __m256 b0Val, b1Val, b2Val, b3Val;
+
+    __m256 dotProdVal0 = _mm256_setzero_ps();
+    __m256 dotProdVal1 = _mm256_setzero_ps();
+    __m256 dotProdVal2 = _mm256_setzero_ps();
+    __m256 dotProdVal3 = _mm256_setzero_ps();
+
+    for (; number < thirtysecondPoints; number++) {
+
+        a0Val = _mm256_load_ps(aPtr);
+        a1Val = _mm256_load_ps(aPtr + 8);
+        a2Val = _mm256_load_ps(aPtr + 16);
+        a3Val = _mm256_load_ps(aPtr + 24);
+        b0Val = _mm256_load_ps(bPtr);
+        b1Val = _mm256_load_ps(bPtr + 8);
+        b2Val = _mm256_load_ps(bPtr + 16);
+        b3Val = _mm256_load_ps(bPtr + 24);
+
+        dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
+        dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
+        dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
+        dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
+
+        aPtr += 32;
+        bPtr += 32;
+    }
+
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+    _mm256_store_ps(dotProductVector,
+                    dotProdVal0); // Store the results back into the dot product vector
+
+    dotProduct = dotProductVector[0];
+    dotProduct += dotProductVector[1];
+    dotProduct += dotProductVector[2];
+    dotProduct += dotProductVector[3];
+    dotProduct += dotProductVector[4];
+    dotProduct += dotProductVector[5];
+    dotProduct += dotProductVector[6];
+    dotProduct += dotProductVector[7];
+
+    number = thirtysecondPoints * 32;
+    for (; number < num_points; number++) {
+        dotProduct += ((*aPtr++) * (*bPtr++));
+    }
+
+    *result = (short)dotProduct;
  }
  
  #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
@@ -220,146 +234,156 @@ static inline void volk_32f_x2_dot_prod_16i_a_avx2_fma(int16_t* result, const  f
  
  #ifdef LV_HAVE_AVX
  
-static inline void volk_32f_x2_dot_prod_16i_a_avx(int16_t* result, const  float* input, const  float* taps, unsigned int num_points) {
-
-  unsigned int number = 0;
-  const unsigned int thirtysecondPoints = num_points / 32;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m256 a0Val, a1Val, a2Val, a3Val;
-  __m256 b0Val, b1Val, b2Val, b3Val;
-  __m256 c0Val, c1Val, c2Val, c3Val;
-
-  __m256 dotProdVal0 = _mm256_setzero_ps();
-  __m256 dotProdVal1 = _mm256_setzero_ps();
-  __m256 dotProdVal2 = _mm256_setzero_ps();
-  __m256 dotProdVal3 = _mm256_setzero_ps();
-
-  for(;number < thirtysecondPoints; number++){
-
-    a0Val = _mm256_load_ps(aPtr);
-    a1Val = _mm256_load_ps(aPtr+8);
-    a2Val = _mm256_load_ps(aPtr+16);
-    a3Val = _mm256_load_ps(aPtr+24);
-    b0Val = _mm256_load_ps(bPtr);
-    b1Val = _mm256_load_ps(bPtr+8);
-    b2Val = _mm256_load_ps(bPtr+16);
-    b3Val = _mm256_load_ps(bPtr+24);
-
-    c0Val = _mm256_mul_ps(a0Val, b0Val);
-    c1Val = _mm256_mul_ps(a1Val, b1Val);
-    c2Val = _mm256_mul_ps(a2Val, b2Val);
-    c3Val = _mm256_mul_ps(a3Val, b3Val);
-
-    dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
-    dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
-    dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
-    dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
-
-    aPtr += 32;
-    bPtr += 32;
-  }
-
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
-
-  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
-
-  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-  dotProduct += dotProductVector[4];
-  dotProduct += dotProductVector[5];
-  dotProduct += dotProductVector[6];
-  dotProduct += dotProductVector[7];
-
-  number = thirtysecondPoints*32;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = (short)dotProduct;
+static inline void volk_32f_x2_dot_prod_16i_a_avx(int16_t* result,
+                                                  const float* input,
+                                                  const float* taps,
+                                                  unsigned int num_points)
+{
+
+    unsigned int number = 0;
+    const unsigned int thirtysecondPoints = num_points / 32;
+
+    float dotProduct = 0;
+    const float* aPtr = input;
+    const float* bPtr = taps;
+
+    __m256 a0Val, a1Val, a2Val, a3Val;
+    __m256 b0Val, b1Val, b2Val, b3Val;
+    __m256 c0Val, c1Val, c2Val, c3Val;
+
+    __m256 dotProdVal0 = _mm256_setzero_ps();
+    __m256 dotProdVal1 = _mm256_setzero_ps();
+    __m256 dotProdVal2 = _mm256_setzero_ps();
+    __m256 dotProdVal3 = _mm256_setzero_ps();
+
+    for (; number < thirtysecondPoints; number++) {
+
+        a0Val = _mm256_load_ps(aPtr);
+        a1Val = _mm256_load_ps(aPtr + 8);
+        a2Val = _mm256_load_ps(aPtr + 16);
+        a3Val = _mm256_load_ps(aPtr + 24);
+        b0Val = _mm256_load_ps(bPtr);
+        b1Val = _mm256_load_ps(bPtr + 8);
+        b2Val = _mm256_load_ps(bPtr + 16);
+        b3Val = _mm256_load_ps(bPtr + 24);
+
+        c0Val = _mm256_mul_ps(a0Val, b0Val);
+        c1Val = _mm256_mul_ps(a1Val, b1Val);
+        c2Val = _mm256_mul_ps(a2Val, b2Val);
+        c3Val = _mm256_mul_ps(a3Val, b3Val);
+
+        dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+        dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+        dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
+        dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
+
+        aPtr += 32;
+        bPtr += 32;
+    }
+
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+    _mm256_store_ps(dotProductVector,
+                    dotProdVal0); // Store the results back into the dot product vector
+
+    dotProduct = dotProductVector[0];
+    dotProduct += dotProductVector[1];
+    dotProduct += dotProductVector[2];
+    dotProduct += dotProductVector[3];
+    dotProduct += dotProductVector[4];
+    dotProduct += dotProductVector[5];
+    dotProduct += dotProductVector[6];
+    dotProduct += dotProductVector[7];
+
+    number = thirtysecondPoints * 32;
+    for (; number < num_points; number++) {
+        dotProduct += ((*aPtr++) * (*bPtr++));
+    }
+
+    *result = (short)dotProduct;
  }
  
  #endif /*LV_HAVE_AVX*/
  
  #ifdef LV_HAVE_AVX512F
  
-static inline void volk_32f_x2_dot_prod_16i_a_avx512f(int16_t* result, const  float* input, const  float* taps, unsigned int num_points) {
-
-  unsigned int number = 0;
-  const unsigned int sixtyfourthPoints = num_points / 64;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m512 a0Val, a1Val, a2Val, a3Val;
-  __m512 b0Val, b1Val, b2Val, b3Val;
-
-  __m512 dotProdVal0 = _mm512_setzero_ps();
-  __m512 dotProdVal1 = _mm512_setzero_ps();
-  __m512 dotProdVal2 = _mm512_setzero_ps();
-  __m512 dotProdVal3 = _mm512_setzero_ps();
-
-  for(;number < sixtyfourthPoints; number++){
-
-    a0Val = _mm512_load_ps(aPtr);
-    a1Val = _mm512_load_ps(aPtr+16);
-    a2Val = _mm512_load_ps(aPtr+32);
-    a3Val = _mm512_load_ps(aPtr+48);
-    b0Val = _mm512_load_ps(bPtr);
-    b1Val = _mm512_load_ps(bPtr+16);
-    b2Val = _mm512_load_ps(bPtr+32);
-    b3Val = _mm512_load_ps(bPtr+48);
-
-    dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
-    dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
-    dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
-    dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
-
-    aPtr += 64;
-    bPtr += 64;
-  }
-
-  dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
-  dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
-  dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
-
-  __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
-
-  _mm512_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-  dotProduct += dotProductVector[4];
-  dotProduct += dotProductVector[5];
-  dotProduct += dotProductVector[6];
-  dotProduct += dotProductVector[7];
-  dotProduct += dotProductVector[8];
-  dotProduct += dotProductVector[9];
-  dotProduct += dotProductVector[10];
-  dotProduct += dotProductVector[11];
-  dotProduct += dotProductVector[12];
-  dotProduct += dotProductVector[13];
-  dotProduct += dotProductVector[14];
-  dotProduct += dotProductVector[15];
-
-  number = sixtyfourthPoints*64;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = (short)dotProduct;
+static inline void volk_32f_x2_dot_prod_16i_a_avx512f(int16_t* result,
+                                                      const float* input,
+                                                      const float* taps,
+                                                      unsigned int num_points)
+{
+
+    unsigned int number = 0;
+    const unsigned int sixtyfourthPoints = num_points / 64;
+
+    float dotProduct = 0;
+    const float* aPtr = input;
+    const float* bPtr = taps;
+
+    __m512 a0Val, a1Val, a2Val, a3Val;
+    __m512 b0Val, b1Val, b2Val, b3Val;
+
+    __m512 dotProdVal0 = _mm512_setzero_ps();
+    __m512 dotProdVal1 = _mm512_setzero_ps();
+    __m512 dotProdVal2 = _mm512_setzero_ps();
+    __m512 dotProdVal3 = _mm512_setzero_ps();
+
+    for (; number < sixtyfourthPoints; number++) {
+
+        a0Val = _mm512_load_ps(aPtr);
+        a1Val = _mm512_load_ps(aPtr + 16);
+        a2Val = _mm512_load_ps(aPtr + 32);
+        a3Val = _mm512_load_ps(aPtr + 48);
+        b0Val = _mm512_load_ps(bPtr);
+        b1Val = _mm512_load_ps(bPtr + 16);
+        b2Val = _mm512_load_ps(bPtr + 32);
+        b3Val = _mm512_load_ps(bPtr + 48);
+
+        dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
+        dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
+        dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
+        dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
+
+        aPtr += 64;
+        bPtr += 64;
+    }
+
+    dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
+    dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
+    dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
+
+    __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
+
+    _mm512_store_ps(dotProductVector,
+                    dotProdVal0); // Store the results back into the dot product vector
+
+    dotProduct = dotProductVector[0];
+    dotProduct += dotProductVector[1];
+    dotProduct += dotProductVector[2];
+    dotProduct += dotProductVector[3];
+    dotProduct += dotProductVector[4];
+    dotProduct += dotProductVector[5];
+    dotProduct += dotProductVector[6];
+    dotProduct += dotProductVector[7];
+    dotProduct += dotProductVector[8];
+    dotProduct += dotProductVector[9];
+    dotProduct += dotProductVector[10];
+    dotProduct += dotProductVector[11];
+    dotProduct += dotProductVector[12];
+    dotProduct += dotProductVector[13];
+    dotProduct += dotProductVector[14];
+    dotProduct += dotProductVector[15];
+
+    number = sixtyfourthPoints * 64;
+    for (; number < num_points; number++) {
+        dotProduct += ((*aPtr++) * (*bPtr++));
+    }
+
+    *result = (short)dotProduct;
  }
  
  #endif /*LV_HAVE_AVX512F*/
@@ -367,68 +391,73 @@ static inline void volk_32f_x2_dot_prod_16i_a_avx512f(int16_t* result, const  fl
  
  #ifdef LV_HAVE_SSE
  
-static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result, const  float* input, const  float* taps, unsigned int num_points) {
-
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m128 a0Val, a1Val, a2Val, a3Val;
-  __m128 b0Val, b1Val, b2Val, b3Val;
-  __m128 c0Val, c1Val, c2Val, c3Val;
-
-  __m128 dotProdVal0 = _mm_setzero_ps();
-  __m128 dotProdVal1 = _mm_setzero_ps();
-  __m128 dotProdVal2 = _mm_setzero_ps();
-  __m128 dotProdVal3 = _mm_setzero_ps();
-
-  for(;number < sixteenthPoints; number++){
-
-    a0Val = _mm_loadu_ps(aPtr);
-    a1Val = _mm_loadu_ps(aPtr+4);
-    a2Val = _mm_loadu_ps(aPtr+8);
-    a3Val = _mm_loadu_ps(aPtr+12);
-    b0Val = _mm_loadu_ps(bPtr);
-    b1Val = _mm_loadu_ps(bPtr+4);
-    b2Val = _mm_loadu_ps(bPtr+8);
-    b3Val = _mm_loadu_ps(bPtr+12);
-
-    c0Val = _mm_mul_ps(a0Val, b0Val);
-    c1Val = _mm_mul_ps(a1Val, b1Val);
-    c2Val = _mm_mul_ps(a2Val, b2Val);
-    c3Val = _mm_mul_ps(a3Val, b3Val);
-
-    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
-    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
-    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
-    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
-
-    aPtr += 16;
-    bPtr += 16;
-  }
-
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
-
-  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
-
-  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-
-  number = sixteenthPoints*16;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = (short)dotProduct;
+static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result,
+                                                  const float* input,
+                                                  const float* taps,
+                                                  unsigned int num_points)
+{
+
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    float dotProduct = 0;
+    const float* aPtr = input;
+    const float* bPtr = taps;
+
+    __m128 a0Val, a1Val, a2Val, a3Val;
+    __m128 b0Val, b1Val, b2Val, b3Val;
+    __m128 c0Val, c1Val, c2Val, c3Val;
+
+    __m128 dotProdVal0 = _mm_setzero_ps();
+    __m128 dotProdVal1 = _mm_setzero_ps();
+    __m128 dotProdVal2 = _mm_setzero_ps();
+    __m128 dotProdVal3 = _mm_setzero_ps();
+
+    for (; number < sixteenthPoints; number++) {
+
+        a0Val = _mm_loadu_ps(aPtr);
+        a1Val = _mm_loadu_ps(aPtr + 4);
+        a2Val = _mm_loadu_ps(aPtr + 8);
+        a3Val = _mm_loadu_ps(aPtr + 12);
+        b0Val = _mm_loadu_ps(bPtr);
+        b1Val = _mm_loadu_ps(bPtr + 4);
+        b2Val = _mm_loadu_ps(bPtr + 8);
+        b3Val = _mm_loadu_ps(bPtr + 12);
+
+        c0Val = _mm_mul_ps(a0Val, b0Val);
+        c1Val = _mm_mul_ps(a1Val, b1Val);
+        c2Val = _mm_mul_ps(a2Val, b2Val);
+        c3Val = _mm_mul_ps(a3Val, b3Val);
+
+        dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+        dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+        dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+        dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+        aPtr += 16;
+        bPtr += 16;
+    }
+
+    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+    _mm_store_ps(dotProductVector,
+                 dotProdVal0); // Store the results back into the dot product vector
+
+    dotProduct = dotProductVector[0];
+    dotProduct += dotProductVector[1];
+    dotProduct += dotProductVector[2];
+    dotProduct += dotProductVector[3];
+
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        dotProduct += ((*aPtr++) * (*bPtr++));
+    }
+
+    *result = (short)dotProduct;
  }
  
  #endif /*LV_HAVE_SSE*/
@@ -436,66 +465,71 @@ static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result, const  float*
  
  #if LV_HAVE_AVX2 && LV_HAVE_FMA
  
-static inline void volk_32f_x2_dot_prod_16i_u_avx2_fma(int16_t* result, const  float* input, const  float* taps, unsigned int num_points) {
-
-  unsigned int number = 0;
-  const unsigned int thirtysecondPoints = num_points / 32;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m256 a0Val, a1Val, a2Val, a3Val;
-  __m256 b0Val, b1Val, b2Val, b3Val;
-
-  __m256 dotProdVal0 = _mm256_setzero_ps();
-  __m256 dotProdVal1 = _mm256_setzero_ps();
-  __m256 dotProdVal2 = _mm256_setzero_ps();
-  __m256 dotProdVal3 = _mm256_setzero_ps();
-
-  for(;number < thirtysecondPoints; number++){
-
-    a0Val = _mm256_loadu_ps(aPtr);
-    a1Val = _mm256_loadu_ps(aPtr+8);
-    a2Val = _mm256_loadu_ps(aPtr+16);
-    a3Val = _mm256_loadu_ps(aPtr+24);
-    b0Val = _mm256_loadu_ps(bPtr);
-    b1Val = _mm256_loadu_ps(bPtr+8);
-    b2Val = _mm256_loadu_ps(bPtr+16);
-    b3Val = _mm256_loadu_ps(bPtr+24);
-
-    dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
-    dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
-    dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
-    dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
-
-    aPtr += 32;
-    bPtr += 32;
-  }
-
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
-
-  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
-
-  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-  dotProduct += dotProductVector[4];
-  dotProduct += dotProductVector[5];
-  dotProduct += dotProductVector[6];
-  dotProduct += dotProductVector[7];
-
-  number = thirtysecondPoints*32;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = (short)dotProduct;
+static inline void volk_32f_x2_dot_prod_16i_u_avx2_fma(int16_t* result,
+                                                       const float* input,
+                                                       const float* taps,
+                                                       unsigned int num_points)
+{
+
+    unsigned int number = 0;
+    const unsigned int thirtysecondPoints = num_points / 32;
+
+    float dotProduct = 0;
+    const float* aPtr = input;
+    const float* bPtr = taps;
+
+    __m256 a0Val, a1Val, a2Val, a3Val;
+    __m256 b0Val, b1Val, b2Val, b3Val;
+
+    __m256 dotProdVal0 = _mm256_setzero_ps();
+    __m256 dotProdVal1 = _mm256_setzero_ps();
+    __m256 dotProdVal2 = _mm256_setzero_ps();
+    __m256 dotProdVal3 = _mm256_setzero_ps();
+
+    for (; number < thirtysecondPoints; number++) {
+
+        a0Val = _mm256_loadu_ps(aPtr);
+        a1Val = _mm256_loadu_ps(aPtr + 8);
+        a2Val = _mm256_loadu_ps(aPtr + 16);
+        a3Val = _mm256_loadu_ps(aPtr + 24);
+        b0Val = _mm256_loadu_ps(bPtr);
+        b1Val = _mm256_loadu_ps(bPtr + 8);
+        b2Val = _mm256_loadu_ps(bPtr + 16);
+        b3Val = _mm256_loadu_ps(bPtr + 24);
+
+        dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
+        dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
+        dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
+        dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
+
+        aPtr += 32;
+        bPtr += 32;
+    }
+
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+    _mm256_store_ps(dotProductVector,
+                    dotProdVal0); // Store the results back into the dot product vector
+
+    dotProduct = dotProductVector[0];
+    dotProduct += dotProductVector[1];
+    dotProduct += dotProductVector[2];
+    dotProduct += dotProductVector[3];
+    dotProduct += dotProductVector[4];
+    dotProduct += dotProductVector[5];
+    dotProduct += dotProductVector[6];
+    dotProduct += dotProductVector[7];
+
+    number = thirtysecondPoints * 32;
+    for (; number < num_points; number++) {
+        dotProduct += ((*aPtr++) * (*bPtr++));
+    }
+
+    *result = (short)dotProduct;
  }
  
  #endif /*LV_HAVE_AVX2 && lV_HAVE_FMA*/
@@ -503,146 +537,156 @@ static inline void volk_32f_x2_dot_prod_16i_u_avx2_fma(int16_t* result, const  f
  
  #ifdef LV_HAVE_AVX
  
-static inline void volk_32f_x2_dot_prod_16i_u_avx(int16_t* result, const  float* input, const  float* taps, unsigned int num_points) {
-
-  unsigned int number = 0;
-  const unsigned int thirtysecondPoints = num_points / 32;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m256 a0Val, a1Val, a2Val, a3Val;
-  __m256 b0Val, b1Val, b2Val, b3Val;
-  __m256 c0Val, c1Val, c2Val, c3Val;
-
-  __m256 dotProdVal0 = _mm256_setzero_ps();
-  __m256 dotProdVal1 = _mm256_setzero_ps();
-  __m256 dotProdVal2 = _mm256_setzero_ps();
-  __m256 dotProdVal3 = _mm256_setzero_ps();
-
-  for(;number < thirtysecondPoints; number++){
-
-    a0Val = _mm256_loadu_ps(aPtr);
-    a1Val = _mm256_loadu_ps(aPtr+8);
-    a2Val = _mm256_loadu_ps(aPtr+16);
-    a3Val = _mm256_loadu_ps(aPtr+24);
-    b0Val = _mm256_loadu_ps(bPtr);
-    b1Val = _mm256_loadu_ps(bPtr+8);
-    b2Val = _mm256_loadu_ps(bPtr+16);
-    b3Val = _mm256_loadu_ps(bPtr+24);
-
-    c0Val = _mm256_mul_ps(a0Val, b0Val);
-    c1Val = _mm256_mul_ps(a1Val, b1Val);
-    c2Val = _mm256_mul_ps(a2Val, b2Val);
-    c3Val = _mm256_mul_ps(a3Val, b3Val);
-
-    dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
-    dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
-    dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
-    dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
-
-    aPtr += 32;
-    bPtr += 32;
-  }
-
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
-
-  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
-
-  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-  dotProduct += dotProductVector[4];
-  dotProduct += dotProductVector[5];
-  dotProduct += dotProductVector[6];
-  dotProduct += dotProductVector[7];
-
-  number = thirtysecondPoints*32;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = (short)dotProduct;
+static inline void volk_32f_x2_dot_prod_16i_u_avx(int16_t* result,
+                                                  const float* input,
+                                                  const float* taps,
+                                                  unsigned int num_points)
+{
+
+    unsigned int number = 0;
+    const unsigned int thirtysecondPoints = num_points / 32;
+
+    float dotProduct = 0;
+    const float* aPtr = input;
+    const float* bPtr = taps;
+
+    __m256 a0Val, a1Val, a2Val, a3Val;
+    __m256 b0Val, b1Val, b2Val, b3Val;
+    __m256 c0Val, c1Val, c2Val, c3Val;
+
+    __m256 dotProdVal0 = _mm256_setzero_ps();
+    __m256 dotProdVal1 = _mm256_setzero_ps();
+    __m256 dotProdVal2 = _mm256_setzero_ps();
+    __m256 dotProdVal3 = _mm256_setzero_ps();
+
+    for (; number < thirtysecondPoints; number++) {
+
+        a0Val = _mm256_loadu_ps(aPtr);
+        a1Val = _mm256_loadu_ps(aPtr + 8);
+        a2Val = _mm256_loadu_ps(aPtr + 16);
+        a3Val = _mm256_loadu_ps(aPtr + 24);
+        b0Val = _mm256_loadu_ps(bPtr);
+        b1Val = _mm256_loadu_ps(bPtr + 8);
+        b2Val = _mm256_loadu_ps(bPtr + 16);
+        b3Val = _mm256_loadu_ps(bPtr + 24);
+
+        c0Val = _mm256_mul_ps(a0Val, b0Val);
+        c1Val = _mm256_mul_ps(a1Val, b1Val);
+        c2Val = _mm256_mul_ps(a2Val, b2Val);
+        c3Val = _mm256_mul_ps(a3Val, b3Val);
+
+        dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+        dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+        dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
+        dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
+
+        aPtr += 32;
+        bPtr += 32;
+    }
+
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+    _mm256_store_ps(dotProductVector,
+                    dotProdVal0); // Store the results back into the dot product vector
+
+    dotProduct = dotProductVector[0];
+    dotProduct += dotProductVector[1];
+    dotProduct += dotProductVector[2];
+    dotProduct += dotProductVector[3];
+    dotProduct += dotProductVector[4];
+    dotProduct += dotProductVector[5];
+    dotProduct += dotProductVector[6];
+    dotProduct += dotProductVector[7];
+
+    number = thirtysecondPoints * 32;
+    for (; number < num_points; number++) {
+        dotProduct += ((*aPtr++) * (*bPtr++));
+    }
+
+    *result = (short)dotProduct;
  }
  
  #endif /*LV_HAVE_AVX*/
  
  #ifdef LV_HAVE_AVX512F
  
-static inline void volk_32f_x2_dot_prod_16i_u_avx512f(int16_t* result, const  float* input, const  float* taps, unsigned int num_points) {
-
-  unsigned int number = 0;
-  const unsigned int sixtyfourthPoints = num_points / 64;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m512 a0Val, a1Val, a2Val, a3Val;
-  __m512 b0Val, b1Val, b2Val, b3Val;
-
-  __m512 dotProdVal0 = _mm512_setzero_ps();
-  __m512 dotProdVal1 = _mm512_setzero_ps();
-  __m512 dotProdVal2 = _mm512_setzero_ps();
-  __m512 dotProdVal3 = _mm512_setzero_ps();
-
-  for(;number < sixtyfourthPoints; number++){
-
-    a0Val = _mm512_loadu_ps(aPtr);
-    a1Val = _mm512_loadu_ps(aPtr+16);
-    a2Val = _mm512_loadu_ps(aPtr+32);
-    a3Val = _mm512_loadu_ps(aPtr+48);
-    b0Val = _mm512_loadu_ps(bPtr);
-    b1Val = _mm512_loadu_ps(bPtr+16);
-    b2Val = _mm512_loadu_ps(bPtr+32);
-    b3Val = _mm512_loadu_ps(bPtr+48);
-
-    dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
-    dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
-    dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
-    dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
-
-    aPtr += 64;
-    bPtr += 64;
-  }
-
-  dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
-  dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
-  dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
-
-  __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
-
-  _mm512_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-  dotProduct += dotProductVector[4];
-  dotProduct += dotProductVector[5];
-  dotProduct += dotProductVector[6];
-  dotProduct += dotProductVector[7];
-  dotProduct += dotProductVector[8];
-  dotProduct += dotProductVector[9];
-  dotProduct += dotProductVector[10];
-  dotProduct += dotProductVector[11];
-  dotProduct += dotProductVector[12];
-  dotProduct += dotProductVector[13];
-  dotProduct += dotProductVector[14];
-  dotProduct += dotProductVector[15];
-
-  number = sixtyfourthPoints*64;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = (short)dotProduct;
+static inline void volk_32f_x2_dot_prod_16i_u_avx512f(int16_t* result,
+                                                      const float* input,
+                                                      const float* taps,
+                                                      unsigned int num_points)
+{
+
+    unsigned int number = 0;
+    const unsigned int sixtyfourthPoints = num_points / 64;
+
+    float dotProduct = 0;
+    const float* aPtr = input;
+    const float* bPtr = taps;
+
+    __m512 a0Val, a1Val, a2Val, a3Val;
+    __m512 b0Val, b1Val, b2Val, b3Val;
+
+    __m512 dotProdVal0 = _mm512_setzero_ps();
+    __m512 dotProdVal1 = _mm512_setzero_ps();
+    __m512 dotProdVal2 = _mm512_setzero_ps();
+    __m512 dotProdVal3 = _mm512_setzero_ps();
+
+    for (; number < sixtyfourthPoints; number++) {
+
+        a0Val = _mm512_loadu_ps(aPtr);
+        a1Val = _mm512_loadu_ps(aPtr + 16);
+        a2Val = _mm512_loadu_ps(aPtr + 32);
+        a3Val = _mm512_loadu_ps(aPtr + 48);
+        b0Val = _mm512_loadu_ps(bPtr);
+        b1Val = _mm512_loadu_ps(bPtr + 16);
+        b2Val = _mm512_loadu_ps(bPtr + 32);
+        b3Val = _mm512_loadu_ps(bPtr + 48);
+
+        dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
+        dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
+        dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
+        dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
+
+        aPtr += 64;
+        bPtr += 64;
+    }
+
+    dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
+    dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
+    dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
+
+    __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
+
+    _mm512_storeu_ps(dotProductVector,
+                     dotProdVal0); // Store the results back into the dot product vector
+
+    dotProduct = dotProductVector[0];
+    dotProduct += dotProductVector[1];
+    dotProduct += dotProductVector[2];
+    dotProduct += dotProductVector[3];
+    dotProduct += dotProductVector[4];
+    dotProduct += dotProductVector[5];
+    dotProduct += dotProductVector[6];
+    dotProduct += dotProductVector[7];
+    dotProduct += dotProductVector[8];
+    dotProduct += dotProductVector[9];
+    dotProduct += dotProductVector[10];
+    dotProduct += dotProductVector[11];
+    dotProduct += dotProductVector[12];
+    dotProduct += dotProductVector[13];
+    dotProduct += dotProductVector[14];
+    dotProduct += dotProductVector[15];
+
+    number = sixtyfourthPoints * 64;
+    for (; number < num_points; number++) {
+        dotProduct += ((*aPtr++) * (*bPtr++));
+    }
+
+    *result = (short)dotProduct;
  }
  
  #endif /*LV_HAVE_AVX512F*/
diff --git a/kernels/volk/volk_32f_x2_dot_prod_32f.h b/kernels/volk/volk_32f_x2_dot_prod_32f.h

index ea0f7ba94e77ed294c70c3fc6d0f4d4f8bcd8202..7854031631eeda33da61550cff241d42668abd2d 100644 (file)
--- a/kernels/volk/volk_32f_x2_dot_prod_32f.h
+++ b/kernels/volk/volk_32f_x2_dot_prod_32f.h
@@ -33,8 +33,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32f_x2_dot_prod_32f(float* result, const float* input, const float* taps, unsigned int num_points)
- * \endcode
+ * void volk_32f_x2_dot_prod_32f(float* result, const float* input, const float* taps,
+ * unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li input: vector of floats.
@@ -45,10 +45,8 @@
   * \li result: pointer to a float value to hold the dot product result.
   *
   * \b Example
- * Take the dot product of an increasing vector and a vector of ones. The result is the sum of integers (0,9).
- * \code
- *   int N = 10;
- *   unsigned int alignment = volk_get_alignment();
+ * Take the dot product of an increasing vector and a vector of ones. The result is the
+ * sum of integers (0,9). \code int N = 10; unsigned int alignment = volk_get_alignment();
   *   float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
   *   float* ones = (float*)volk_malloc(sizeof(float)*N, alignment);
   *   float* out = (float*)volk_malloc(sizeof(float)*1, alignment);
@@ -73,25 +71,29 @@
  #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
  #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
  
+#include <stdio.h>
  #include <volk/volk_common.h>
-#include<stdio.h>
  
  
  #ifdef LV_HAVE_GENERIC
  
  
-static inline void volk_32f_x2_dot_prod_32f_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_generic(float* result,
+                                                    const float* input,
+                                                    const float* taps,
+                                                    unsigned int num_points)
+{
  
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr=  taps;
-  unsigned int number = 0;
+    float dotProduct = 0;
+    const float* aPtr = input;
+    const float* bPtr = taps;
+    unsigned int number = 0;
  
-  for(number = 0; number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
+    for (number = 0; number < num_points; number++) {
+        dotProduct += ((*aPtr++) * (*bPtr++));
+    }
  
-  *result = dotProduct;
+    *result = dotProduct;
  }
  
  #endif /*LV_HAVE_GENERIC*/
@@ -100,69 +102,73 @@ static inline void volk_32f_x2_dot_prod_32f_generic(float * result, const float
  #ifdef LV_HAVE_SSE
  
  
-static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {
-
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
+static inline void volk_32f_x2_dot_prod_32f_u_sse(float* result,
+                                                  const float* input,
+                                                  const float* taps,
+                                                  unsigned int num_points)
+{
  
-  __m128 a0Val, a1Val, a2Val, a3Val;
-  __m128 b0Val, b1Val, b2Val, b3Val;
-  __m128 c0Val, c1Val, c2Val, c3Val;
-
-  __m128 dotProdVal0 = _mm_setzero_ps();
-  __m128 dotProdVal1 = _mm_setzero_ps();
-  __m128 dotProdVal2 = _mm_setzero_ps();
-  __m128 dotProdVal3 = _mm_setzero_ps();
-
-  for(;number < sixteenthPoints; number++){
-
-    a0Val = _mm_loadu_ps(aPtr);
-    a1Val = _mm_loadu_ps(aPtr+4);
-    a2Val = _mm_loadu_ps(aPtr+8);
-    a3Val = _mm_loadu_ps(aPtr+12);
-    b0Val = _mm_loadu_ps(bPtr);
-    b1Val = _mm_loadu_ps(bPtr+4);
-    b2Val = _mm_loadu_ps(bPtr+8);
-    b3Val = _mm_loadu_ps(bPtr+12);
-
-    c0Val = _mm_mul_ps(a0Val, b0Val);
-    c1Val = _mm_mul_ps(a1Val, b1Val);
-    c2Val = _mm_mul_ps(a2Val, b2Val);
-    c3Val = _mm_mul_ps(a3Val, b3Val);
-
-    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
-    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
-    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
-    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
  
-    aPtr += 16;
-    bPtr += 16;
-  }
+    float dotProduct = 0;
+    const float* aPtr = input;
+    const float* bPtr = taps;
+
+    __m128 a0Val, a1Val, a2Val, a3Val;
+    __m128 b0Val, b1Val, b2Val, b3Val;
+    __m128 c0Val, c1Val, c2Val, c3Val;
+
+    __m128 dotProdVal0 = _mm_setzero_ps();
+    __m128 dotProdVal1 = _mm_setzero_ps();
+    __m128 dotProdVal2 = _mm_setzero_ps();
+    __m128 dotProdVal3 = _mm_setzero_ps();
+
+    for (; number < sixteenthPoints; number++) {
+
+        a0Val = _mm_loadu_ps(aPtr);
+        a1Val = _mm_loadu_ps(aPtr + 4);
+        a2Val = _mm_loadu_ps(aPtr + 8);
+        a3Val = _mm_loadu_ps(aPtr + 12);
+        b0Val = _mm_loadu_ps(bPtr);
+        b1Val = _mm_loadu_ps(bPtr + 4);
+        b2Val = _mm_loadu_ps(bPtr + 8);
+        b3Val = _mm_loadu_ps(bPtr + 12);
+
+        c0Val = _mm_mul_ps(a0Val, b0Val);
+        c1Val = _mm_mul_ps(a1Val, b1Val);
+        c2Val = _mm_mul_ps(a2Val, b2Val);
+        c3Val = _mm_mul_ps(a3Val, b3Val);
+
+        dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+        dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+        dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+        dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
  
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+        aPtr += 16;
+        bPtr += 16;
+    }
  
-  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
  
-  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
  
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
+    _mm_store_ps(dotProductVector,
+                 dotProdVal0); // Store the results back into the dot product vector
  
-  number = sixteenthPoints*16;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
+    dotProduct = dotProductVector[0];
+    dotProduct += dotProductVector[1];
+    dotProduct += dotProductVector[2];
+    dotProduct += dotProductVector[3];
  
-  *result = dotProduct;
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        dotProduct += ((*aPtr++) * (*bPtr++));
+    }
  
+    *result = dotProduct;
  }
  
  #endif /*LV_HAVE_SSE*/
@@ -171,127 +177,145 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const  float*
  
  #include <pmmintrin.h>
  
-static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m128 a0Val, a1Val, a2Val, a3Val;
-  __m128 b0Val, b1Val, b2Val, b3Val;
-  __m128 c0Val, c1Val, c2Val, c3Val;
-
-  __m128 dotProdVal0 = _mm_setzero_ps();
-  __m128 dotProdVal1 = _mm_setzero_ps();
-  __m128 dotProdVal2 = _mm_setzero_ps();
-  __m128 dotProdVal3 = _mm_setzero_ps();
-
-  for(;number < sixteenthPoints; number++){
-
-    a0Val = _mm_loadu_ps(aPtr);
-    a1Val = _mm_loadu_ps(aPtr+4);
-    a2Val = _mm_loadu_ps(aPtr+8);
-    a3Val = _mm_loadu_ps(aPtr+12);
-    b0Val = _mm_loadu_ps(bPtr);
-    b1Val = _mm_loadu_ps(bPtr+4);
-    b2Val = _mm_loadu_ps(bPtr+8);
-    b3Val = _mm_loadu_ps(bPtr+12);
-
-    c0Val = _mm_mul_ps(a0Val, b0Val);
-    c1Val = _mm_mul_ps(a1Val, b1Val);
-    c2Val = _mm_mul_ps(a2Val, b2Val);
-    c3Val = _mm_mul_ps(a3Val, b3Val);
-
-    dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
-    dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
-    dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
-    dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
-
-    aPtr += 16;
-    bPtr += 16;
-  }
-
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
-
-  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
-  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-
-  number = sixteenthPoints*16;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = dotProduct;
-}
-
-#endif /*LV_HAVE_SSE3*/
+static inline void volk_32f_x2_dot_prod_32f_u_sse3(float* result,
+                                                   const float* input,
+                                                   const float* taps,
+                                                   unsigned int num_points)
+{
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
  
-#ifdef LV_HAVE_SSE4_1
+    float dotProduct = 0;
+    const float* aPtr = input;
+    const float* bPtr = taps;
+
+    __m128 a0Val, a1Val, a2Val, a3Val;
+    __m128 b0Val, b1Val, b2Val, b3Val;
+    __m128 c0Val, c1Val, c2Val, c3Val;
+
+    __m128 dotProdVal0 = _mm_setzero_ps();
+    __m128 dotProdVal1 = _mm_setzero_ps();
+    __m128 dotProdVal2 = _mm_setzero_ps();
+    __m128 dotProdVal3 = _mm_setzero_ps();
+
+    for (; number < sixteenthPoints; number++) {
+
+        a0Val = _mm_loadu_ps(aPtr);
+        a1Val = _mm_loadu_ps(aPtr + 4);
+        a2Val = _mm_loadu_ps(aPtr + 8);
+        a3Val = _mm_loadu_ps(aPtr + 12);
+        b0Val = _mm_loadu_ps(bPtr);
+        b1Val = _mm_loadu_ps(bPtr + 4);
+        b2Val = _mm_loadu_ps(bPtr + 8);
+        b3Val = _mm_loadu_ps(bPtr + 12);
+
+        c0Val = _mm_mul_ps(a0Val, b0Val);
+        c1Val = _mm_mul_ps(a1Val, b1Val);
+        c2Val = _mm_mul_ps(a2Val, b2Val);
+        c3Val = _mm_mul_ps(a3Val, b3Val);
+
+        dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
+        dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
+        dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
+        dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
  
-#include <smmintrin.h>
+        aPtr += 16;
+        bPtr += 16;
+    }
  
-static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
+    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
  
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
+    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+    _mm_store_ps(dotProductVector,
+                 dotProdVal0); // Store the results back into the dot product vector
  
-  __m128 aVal1, bVal1, cVal1;
-  __m128 aVal2, bVal2, cVal2;
-  __m128 aVal3, bVal3, cVal3;
-  __m128 aVal4, bVal4, cVal4;
+    dotProduct = dotProductVector[0];
+    dotProduct += dotProductVector[1];
+    dotProduct += dotProductVector[2];
+    dotProduct += dotProductVector[3];
  
-  __m128 dotProdVal = _mm_setzero_ps();
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        dotProduct += ((*aPtr++) * (*bPtr++));
+    }
  
-  for(;number < sixteenthPoints; number++){
+    *result = dotProduct;
+}
  
-    aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
-    aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
-    aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
-    aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
+#endif /*LV_HAVE_SSE3*/
  
-    bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
-    bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
-    bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
-    bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
+#ifdef LV_HAVE_SSE4_1
  
-    cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
-    cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
-    cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
-    cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
+#include <smmintrin.h>
  
-    cVal1 = _mm_or_ps(cVal1, cVal2);
-    cVal3 = _mm_or_ps(cVal3, cVal4);
-    cVal1 = _mm_or_ps(cVal1, cVal3);
+static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float* result,
+                                                     const float* input,
+                                                     const float* taps,
+                                                     unsigned int num_points)
+{
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
  
-    dotProdVal = _mm_add_ps(dotProdVal, cVal1);
-  }
+    float dotProduct = 0;
+    const float* aPtr = input;
+    const float* bPtr = taps;
+
+    __m128 aVal1, bVal1, cVal1;
+    __m128 aVal2, bVal2, cVal2;
+    __m128 aVal3, bVal3, cVal3;
+    __m128 aVal4, bVal4, cVal4;
+
+    __m128 dotProdVal = _mm_setzero_ps();
+
+    for (; number < sixteenthPoints; number++) {
+
+        aVal1 = _mm_loadu_ps(aPtr);
+        aPtr += 4;
+        aVal2 = _mm_loadu_ps(aPtr);
+        aPtr += 4;
+        aVal3 = _mm_loadu_ps(aPtr);
+        aPtr += 4;
+        aVal4 = _mm_loadu_ps(aPtr);
+        aPtr += 4;
+
+        bVal1 = _mm_loadu_ps(bPtr);
+        bPtr += 4;
+        bVal2 = _mm_loadu_ps(bPtr);
+        bPtr += 4;
+        bVal3 = _mm_loadu_ps(bPtr);
+        bPtr += 4;
+        bVal4 = _mm_loadu_ps(bPtr);
+        bPtr += 4;
+
+        cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
+        cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
+        cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
+        cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
+
+        cVal1 = _mm_or_ps(cVal1, cVal2);
+        cVal3 = _mm_or_ps(cVal3, cVal4);
+        cVal1 = _mm_or_ps(cVal1, cVal3);
+
+        dotProdVal = _mm_add_ps(dotProdVal, cVal1);
+    }
  
-  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
-  _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
+    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+    _mm_store_ps(dotProductVector,
+                 dotProdVal); // Store the results back into the dot product vector
  
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
+    dotProduct = dotProductVector[0];
+    dotProduct += dotProductVector[1];
+    dotProduct += dotProductVector[2];
+    dotProduct += dotProductVector[3];
  
-  number = sixteenthPoints * 16;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        dotProduct += ((*aPtr++) * (*bPtr++));
+    }
  
-  *result = dotProduct;
+    *result = dotProduct;
  }
  
  #endif /*LV_HAVE_SSE4_1*/
@@ -300,147 +324,154 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float
  
  #include <immintrin.h>
  
-static inline void volk_32f_x2_dot_prod_32f_u_avx( float* result, const  float* input, const  float* taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_u_avx(float* result,
+                                                  const float* input,
+                                                  const float* taps,
+                                                  unsigned int num_points)
+{
  
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
  
-  __m256 a0Val, a1Val;
-  __m256 b0Val, b1Val;
-  __m256 c0Val, c1Val;
+    float dotProduct = 0;
+    const float* aPtr = input;
+    const float* bPtr = taps;
  
-  __m256 dotProdVal0 = _mm256_setzero_ps();
-  __m256 dotProdVal1 = _mm256_setzero_ps();
+    __m256 a0Val, a1Val;
+    __m256 b0Val, b1Val;
+    __m256 c0Val, c1Val;
  
-  for(;number < sixteenthPoints; number++){
+    __m256 dotProdVal0 = _mm256_setzero_ps();
+    __m256 dotProdVal1 = _mm256_setzero_ps();
  
-    a0Val = _mm256_loadu_ps(aPtr);
-    a1Val = _mm256_loadu_ps(aPtr+8);
-    b0Val = _mm256_loadu_ps(bPtr);
-    b1Val = _mm256_loadu_ps(bPtr+8);
+    for (; number < sixteenthPoints; number++) {
  
-    c0Val = _mm256_mul_ps(a0Val, b0Val);
-    c1Val = _mm256_mul_ps(a1Val, b1Val);
+        a0Val = _mm256_loadu_ps(aPtr);
+        a1Val = _mm256_loadu_ps(aPtr + 8);
+        b0Val = _mm256_loadu_ps(bPtr);
+        b1Val = _mm256_loadu_ps(bPtr + 8);
  
-    dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
-    dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+        c0Val = _mm256_mul_ps(a0Val, b0Val);
+        c1Val = _mm256_mul_ps(a1Val, b1Val);
  
-    aPtr += 16;
-    bPtr += 16;
-  }
+        dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+        dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
  
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+        aPtr += 16;
+        bPtr += 16;
+    }
  
-  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
  
-  _mm256_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
  
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-  dotProduct += dotProductVector[4];
-  dotProduct += dotProductVector[5];
-  dotProduct += dotProductVector[6];
-  dotProduct += dotProductVector[7];
+    _mm256_storeu_ps(dotProductVector,
+                     dotProdVal0); // Store the results back into the dot product vector
  
-  number = sixteenthPoints*16;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
+    dotProduct = dotProductVector[0];
+    dotProduct += dotProductVector[1];
+    dotProduct += dotProductVector[2];
+    dotProduct += dotProductVector[3];
+    dotProduct += dotProductVector[4];
+    dotProduct += dotProductVector[5];
+    dotProduct += dotProductVector[6];
+    dotProduct += dotProductVector[7];
  
-  *result = dotProduct;
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        dotProduct += ((*aPtr++) * (*bPtr++));
+    }
  
+    *result = dotProduct;
  }
  
  #endif /*LV_HAVE_AVX*/
  
  #if LV_HAVE_AVX2 && LV_HAVE_FMA
  #include <immintrin.h>
-static inline void volk_32f_x2_dot_prod_32f_u_avx2_fma(float * result, const float * input, const float* taps, unsigned int num_points){
-  unsigned int number;
-  const unsigned int eighthPoints = num_points / 8;
+static inline void volk_32f_x2_dot_prod_32f_u_avx2_fma(float* result,
+                                                       const float* input,
+                                                       const float* taps,
+                                                       unsigned int num_points)
+{
+    unsigned int number;
+    const unsigned int eighthPoints = num_points / 8;
  
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m256 dotProdVal = _mm256_setzero_ps();
-  __m256 aVal1, bVal1;
+    const float* aPtr = input;
+    const float* bPtr = taps;
  
-  for (number = 0; number < eighthPoints; number++ ) {
+    __m256 dotProdVal = _mm256_setzero_ps();
+    __m256 aVal1, bVal1;
  
-    aVal1 = _mm256_loadu_ps(aPtr);
-    bVal1 = _mm256_loadu_ps(bPtr);
-    aPtr += 8;
-    bPtr += 8;
+    for (number = 0; number < eighthPoints; number++) {
  
-    dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
-  }
+        aVal1 = _mm256_loadu_ps(aPtr);
+        bVal1 = _mm256_loadu_ps(bPtr);
+        aPtr += 8;
+        bPtr += 8;
  
-  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
-  _mm256_storeu_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
-  _mm256_zeroupper();
+        dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
+    }
  
-  float dotProduct =
-    dotProductVector[0] + dotProductVector[1] +
-    dotProductVector[2] + dotProductVector[3] +
-    dotProductVector[4] + dotProductVector[5] +
-    dotProductVector[6] + dotProductVector[7];
+    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+    _mm256_storeu_ps(dotProductVector,
+                     dotProdVal); // Store the results back into the dot product vector
+    _mm256_zeroupper();
  
-  for(number = eighthPoints * 8; number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
+    float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
+                       dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
+                       dotProductVector[6] + dotProductVector[7];
  
-  *result = dotProduct;
+    for (number = eighthPoints * 8; number < num_points; number++) {
+        dotProduct += ((*aPtr++) * (*bPtr++));
+    }
  
+    *result = dotProduct;
  }
  #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
  
  #if LV_HAVE_AVX512F
  #include <immintrin.h>
-static inline void volk_32f_x2_dot_prod_32f_u_avx512f(float * result, const float * input, const float* taps, unsigned int num_points){
-  unsigned int number;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  const float* aPtr = input;
-  const float* bPtr = taps;
+static inline void volk_32f_x2_dot_prod_32f_u_avx512f(float* result,
+                                                      const float* input,
+                                                      const float* taps,
+                                                      unsigned int num_points)
+{
+    unsigned int number;
+    const unsigned int sixteenthPoints = num_points / 16;
  
-  __m512 dotProdVal = _mm512_setzero_ps();
-  __m512 aVal1, bVal1;
+    const float* aPtr = input;
+    const float* bPtr = taps;
  
-  for (number = 0; number < sixteenthPoints; number++ ) {
+    __m512 dotProdVal = _mm512_setzero_ps();
+    __m512 aVal1, bVal1;
  
-    aVal1 = _mm512_loadu_ps(aPtr);
-    bVal1 = _mm512_loadu_ps(bPtr);
-    aPtr += 16;
-    bPtr += 16;
+    for (number = 0; number < sixteenthPoints; number++) {
  
-    dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
-  }
+        aVal1 = _mm512_loadu_ps(aPtr);
+        bVal1 = _mm512_loadu_ps(bPtr);
+        aPtr += 16;
+        bPtr += 16;
  
-  __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
-  _mm512_storeu_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
+        dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
+    }
  
-  float dotProduct =
-    dotProductVector[0] + dotProductVector[1] +
-    dotProductVector[2] + dotProductVector[3] +
-    dotProductVector[4] + dotProductVector[5] +
-    dotProductVector[6] + dotProductVector[7] +
-    dotProductVector[8] + dotProductVector[9] +
-    dotProductVector[10] + dotProductVector[11] +
-    dotProductVector[12] + dotProductVector[13] +
-    dotProductVector[14] + dotProductVector[15];
+    __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
+    _mm512_storeu_ps(dotProductVector,
+                     dotProdVal); // Store the results back into the dot product vector
  
-  for(number = sixteenthPoints * 16; number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
+    float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
+                       dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
+                       dotProductVector[6] + dotProductVector[7] + dotProductVector[8] +
+                       dotProductVector[9] + dotProductVector[10] + dotProductVector[11] +
+                       dotProductVector[12] + dotProductVector[13] +
+                       dotProductVector[14] + dotProductVector[15];
  
-  *result = dotProduct;
+    for (number = sixteenthPoints * 16; number < num_points; number++) {
+        dotProduct += ((*aPtr++) * (*bPtr++));
+    }
  
+    *result = dotProduct;
  }
  #endif /* LV_HAVE_AVX512F */
  
@@ -449,25 +480,29 @@ static inline void volk_32f_x2_dot_prod_32f_u_avx512f(float * result, const floa
  #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
  #define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
  
+#include <stdio.h>
  #include <volk/volk_common.h>
-#include<stdio.h>
  
  
  #ifdef LV_HAVE_GENERIC
  
  
-static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_a_generic(float* result,
+                                                      const float* input,
+                                                      const float* taps,
+                                                      unsigned int num_points)
+{
  
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr=  taps;
-  unsigned int number = 0;
+    float dotProduct = 0;
+    const float* aPtr = input;
+    const float* bPtr = taps;
+    unsigned int number = 0;
  
-  for(number = 0; number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
+    for (number = 0; number < num_points; number++) {
+        dotProduct += ((*aPtr++) * (*bPtr++));
+    }
  
-  *result = dotProduct;
+    *result = dotProduct;
  }
  
  #endif /*LV_HAVE_GENERIC*/
@@ -476,69 +511,73 @@ static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const floa
  #ifdef LV_HAVE_SSE
  
  
-static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {
-
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m128 a0Val, a1Val, a2Val, a3Val;
-  __m128 b0Val, b1Val, b2Val, b3Val;
-  __m128 c0Val, c1Val, c2Val, c3Val;
-
-  __m128 dotProdVal0 = _mm_setzero_ps();
-  __m128 dotProdVal1 = _mm_setzero_ps();
-  __m128 dotProdVal2 = _mm_setzero_ps();
-  __m128 dotProdVal3 = _mm_setzero_ps();
+static inline void volk_32f_x2_dot_prod_32f_a_sse(float* result,
+                                                  const float* input,
+                                                  const float* taps,
+                                                  unsigned int num_points)
+{
  
-  for(;number < sixteenthPoints; number++){
-
-    a0Val = _mm_load_ps(aPtr);
-    a1Val = _mm_load_ps(aPtr+4);
-    a2Val = _mm_load_ps(aPtr+8);
-    a3Val = _mm_load_ps(aPtr+12);
-    b0Val = _mm_load_ps(bPtr);
-    b1Val = _mm_load_ps(bPtr+4);
-    b2Val = _mm_load_ps(bPtr+8);
-    b3Val = _mm_load_ps(bPtr+12);
-
-    c0Val = _mm_mul_ps(a0Val, b0Val);
-    c1Val = _mm_mul_ps(a1Val, b1Val);
-    c2Val = _mm_mul_ps(a2Val, b2Val);
-    c3Val = _mm_mul_ps(a3Val, b3Val);
-
-    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
-    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
-    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
-    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
  
-    aPtr += 16;
-    bPtr += 16;
-  }
+    float dotProduct = 0;
+    const float* aPtr = input;
+    const float* bPtr = taps;
+
+    __m128 a0Val, a1Val, a2Val, a3Val;
+    __m128 b0Val, b1Val, b2Val, b3Val;
+    __m128 c0Val, c1Val, c2Val, c3Val;
+
+    __m128 dotProdVal0 = _mm_setzero_ps();
+    __m128 dotProdVal1 = _mm_setzero_ps();
+    __m128 dotProdVal2 = _mm_setzero_ps();
+    __m128 dotProdVal3 = _mm_setzero_ps();
+
+    for (; number < sixteenthPoints; number++) {
+
+        a0Val = _mm_load_ps(aPtr);
+        a1Val = _mm_load_ps(aPtr + 4);
+        a2Val = _mm_load_ps(aPtr + 8);
+        a3Val = _mm_load_ps(aPtr + 12);
+        b0Val = _mm_load_ps(bPtr);
+        b1Val = _mm_load_ps(bPtr + 4);
+        b2Val = _mm_load_ps(bPtr + 8);
+        b3Val = _mm_load_ps(bPtr + 12);
+
+        c0Val = _mm_mul_ps(a0Val, b0Val);
+        c1Val = _mm_mul_ps(a1Val, b1Val);
+        c2Val = _mm_mul_ps(a2Val, b2Val);
+        c3Val = _mm_mul_ps(a3Val, b3Val);
+
+        dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+        dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+        dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+        dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
  
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+        aPtr += 16;
+        bPtr += 16;
+    }
  
-  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
  
-  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
  
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
+    _mm_store_ps(dotProductVector,
+                 dotProdVal0); // Store the results back into the dot product vector
  
-  number = sixteenthPoints*16;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
+    dotProduct = dotProductVector[0];
+    dotProduct += dotProductVector[1];
+    dotProduct += dotProductVector[2];
+    dotProduct += dotProductVector[3];
  
-  *result = dotProduct;
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        dotProduct += ((*aPtr++) * (*bPtr++));
+    }
  
+    *result = dotProduct;
  }
  
  #endif /*LV_HAVE_SSE*/
@@ -547,127 +586,145 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const  float*
  
  #include <pmmintrin.h>
  
-static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m128 a0Val, a1Val, a2Val, a3Val;
-  __m128 b0Val, b1Val, b2Val, b3Val;
-  __m128 c0Val, c1Val, c2Val, c3Val;
-
-  __m128 dotProdVal0 = _mm_setzero_ps();
-  __m128 dotProdVal1 = _mm_setzero_ps();
-  __m128 dotProdVal2 = _mm_setzero_ps();
-  __m128 dotProdVal3 = _mm_setzero_ps();
-
-  for(;number < sixteenthPoints; number++){
-
-    a0Val = _mm_load_ps(aPtr);
-    a1Val = _mm_load_ps(aPtr+4);
-    a2Val = _mm_load_ps(aPtr+8);
-    a3Val = _mm_load_ps(aPtr+12);
-    b0Val = _mm_load_ps(bPtr);
-    b1Val = _mm_load_ps(bPtr+4);
-    b2Val = _mm_load_ps(bPtr+8);
-    b3Val = _mm_load_ps(bPtr+12);
-
-    c0Val = _mm_mul_ps(a0Val, b0Val);
-    c1Val = _mm_mul_ps(a1Val, b1Val);
-    c2Val = _mm_mul_ps(a2Val, b2Val);
-    c3Val = _mm_mul_ps(a3Val, b3Val);
-
-    dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
-    dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
-    dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
-    dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
-
-    aPtr += 16;
-    bPtr += 16;
-  }
-
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
-
-  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
-  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-
-  number = sixteenthPoints*16;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = dotProduct;
-}
-
-#endif /*LV_HAVE_SSE3*/
+static inline void volk_32f_x2_dot_prod_32f_a_sse3(float* result,
+                                                   const float* input,
+                                                   const float* taps,
+                                                   unsigned int num_points)
+{
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
  
-#ifdef LV_HAVE_SSE4_1
+    float dotProduct = 0;
+    const float* aPtr = input;
+    const float* bPtr = taps;
+
+    __m128 a0Val, a1Val, a2Val, a3Val;
+    __m128 b0Val, b1Val, b2Val, b3Val;
+    __m128 c0Val, c1Val, c2Val, c3Val;
+
+    __m128 dotProdVal0 = _mm_setzero_ps();
+    __m128 dotProdVal1 = _mm_setzero_ps();
+    __m128 dotProdVal2 = _mm_setzero_ps();
+    __m128 dotProdVal3 = _mm_setzero_ps();
+
+    for (; number < sixteenthPoints; number++) {
+
+        a0Val = _mm_load_ps(aPtr);
+        a1Val = _mm_load_ps(aPtr + 4);
+        a2Val = _mm_load_ps(aPtr + 8);
+        a3Val = _mm_load_ps(aPtr + 12);
+        b0Val = _mm_load_ps(bPtr);
+        b1Val = _mm_load_ps(bPtr + 4);
+        b2Val = _mm_load_ps(bPtr + 8);
+        b3Val = _mm_load_ps(bPtr + 12);
+
+        c0Val = _mm_mul_ps(a0Val, b0Val);
+        c1Val = _mm_mul_ps(a1Val, b1Val);
+        c2Val = _mm_mul_ps(a2Val, b2Val);
+        c3Val = _mm_mul_ps(a3Val, b3Val);
+
+        dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
+        dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
+        dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
+        dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
  
-#include <smmintrin.h>
+        aPtr += 16;
+        bPtr += 16;
+    }
  
-static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
+    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
  
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
+    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+    _mm_store_ps(dotProductVector,
+                 dotProdVal0); // Store the results back into the dot product vector
  
-  __m128 aVal1, bVal1, cVal1;
-  __m128 aVal2, bVal2, cVal2;
-  __m128 aVal3, bVal3, cVal3;
-  __m128 aVal4, bVal4, cVal4;
+    dotProduct = dotProductVector[0];
+    dotProduct += dotProductVector[1];
+    dotProduct += dotProductVector[2];
+    dotProduct += dotProductVector[3];
  
-  __m128 dotProdVal = _mm_setzero_ps();
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        dotProduct += ((*aPtr++) * (*bPtr++));
+    }
  
-  for(;number < sixteenthPoints; number++){
+    *result = dotProduct;
+}
  
-    aVal1 = _mm_load_ps(aPtr); aPtr += 4;
-    aVal2 = _mm_load_ps(aPtr); aPtr += 4;
-    aVal3 = _mm_load_ps(aPtr); aPtr += 4;
-    aVal4 = _mm_load_ps(aPtr); aPtr += 4;
+#endif /*LV_HAVE_SSE3*/
  
-    bVal1 = _mm_load_ps(bPtr); bPtr += 4;
-    bVal2 = _mm_load_ps(bPtr); bPtr += 4;
-    bVal3 = _mm_load_ps(bPtr); bPtr += 4;
-    bVal4 = _mm_load_ps(bPtr); bPtr += 4;
+#ifdef LV_HAVE_SSE4_1
  
-    cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
-    cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
-    cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
-    cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
+#include <smmintrin.h>
  
-    cVal1 = _mm_or_ps(cVal1, cVal2);
-    cVal3 = _mm_or_ps(cVal3, cVal4);
-    cVal1 = _mm_or_ps(cVal1, cVal3);
+static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float* result,
+                                                     const float* input,
+                                                     const float* taps,
+                                                     unsigned int num_points)
+{
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
  
-    dotProdVal = _mm_add_ps(dotProdVal, cVal1);
-  }
+    float dotProduct = 0;
+    const float* aPtr = input;
+    const float* bPtr = taps;
+
+    __m128 aVal1, bVal1, cVal1;
+    __m128 aVal2, bVal2, cVal2;
+    __m128 aVal3, bVal3, cVal3;
+    __m128 aVal4, bVal4, cVal4;
+
+    __m128 dotProdVal = _mm_setzero_ps();
+
+    for (; number < sixteenthPoints; number++) {
+
+        aVal1 = _mm_load_ps(aPtr);
+        aPtr += 4;
+        aVal2 = _mm_load_ps(aPtr);
+        aPtr += 4;
+        aVal3 = _mm_load_ps(aPtr);
+        aPtr += 4;
+        aVal4 = _mm_load_ps(aPtr);
+        aPtr += 4;
+
+        bVal1 = _mm_load_ps(bPtr);
+        bPtr += 4;
+        bVal2 = _mm_load_ps(bPtr);
+        bPtr += 4;
+        bVal3 = _mm_load_ps(bPtr);
+        bPtr += 4;
+        bVal4 = _mm_load_ps(bPtr);
+        bPtr += 4;
+
+        cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
+        cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
+        cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
+        cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
+
+        cVal1 = _mm_or_ps(cVal1, cVal2);
+        cVal3 = _mm_or_ps(cVal3, cVal4);
+        cVal1 = _mm_or_ps(cVal1, cVal3);
+
+        dotProdVal = _mm_add_ps(dotProdVal, cVal1);
+    }
  
-  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
-  _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
+    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+    _mm_store_ps(dotProductVector,
+                 dotProdVal); // Store the results back into the dot product vector
  
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
+    dotProduct = dotProductVector[0];
+    dotProduct += dotProductVector[1];
+    dotProduct += dotProductVector[2];
+    dotProduct += dotProductVector[3];
  
-  number = sixteenthPoints * 16;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        dotProduct += ((*aPtr++) * (*bPtr++));
+    }
  
-  *result = dotProduct;
+    *result = dotProduct;
  }
  
  #endif /*LV_HAVE_SSE4_1*/
@@ -676,159 +733,170 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float
  
  #include <immintrin.h>
  
-static inline void volk_32f_x2_dot_prod_32f_a_avx( float* result, const  float* input, const  float* taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_a_avx(float* result,
+                                                  const float* input,
+                                                  const float* taps,
+                                                  unsigned int num_points)
+{
  
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
  
-  __m256 a0Val, a1Val;
-  __m256 b0Val, b1Val;
-  __m256 c0Val, c1Val;
+    float dotProduct = 0;
+    const float* aPtr = input;
+    const float* bPtr = taps;
  
-  __m256 dotProdVal0 = _mm256_setzero_ps();
-  __m256 dotProdVal1 = _mm256_setzero_ps();
+    __m256 a0Val, a1Val;
+    __m256 b0Val, b1Val;
+    __m256 c0Val, c1Val;
  
-  for(;number < sixteenthPoints; number++){
+    __m256 dotProdVal0 = _mm256_setzero_ps();
+    __m256 dotProdVal1 = _mm256_setzero_ps();
  
-    a0Val = _mm256_load_ps(aPtr);
-    a1Val = _mm256_load_ps(aPtr+8);
-    b0Val = _mm256_load_ps(bPtr);
-    b1Val = _mm256_load_ps(bPtr+8);
+    for (; number < sixteenthPoints; number++) {
  
-    c0Val = _mm256_mul_ps(a0Val, b0Val);
-    c1Val = _mm256_mul_ps(a1Val, b1Val);
+        a0Val = _mm256_load_ps(aPtr);
+        a1Val = _mm256_load_ps(aPtr + 8);
+        b0Val = _mm256_load_ps(bPtr);
+        b1Val = _mm256_load_ps(bPtr + 8);
  
-    dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
-    dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+        c0Val = _mm256_mul_ps(a0Val, b0Val);
+        c1Val = _mm256_mul_ps(a1Val, b1Val);
  
-    aPtr += 16;
-    bPtr += 16;
-  }
+        dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+        dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
  
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+        aPtr += 16;
+        bPtr += 16;
+    }
  
-  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
  
-  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
  
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-  dotProduct += dotProductVector[4];
-  dotProduct += dotProductVector[5];
-  dotProduct += dotProductVector[6];
-  dotProduct += dotProductVector[7];
+    _mm256_store_ps(dotProductVector,
+                    dotProdVal0); // Store the results back into the dot product vector
  
-  number = sixteenthPoints*16;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
+    dotProduct = dotProductVector[0];
+    dotProduct += dotProductVector[1];
+    dotProduct += dotProductVector[2];
+    dotProduct += dotProductVector[3];
+    dotProduct += dotProductVector[4];
+    dotProduct += dotProductVector[5];
+    dotProduct += dotProductVector[6];
+    dotProduct += dotProductVector[7];
  
-  *result = dotProduct;
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        dotProduct += ((*aPtr++) * (*bPtr++));
+    }
  
+    *result = dotProduct;
  }
  #endif /*LV_HAVE_AVX*/
  
  
  #if LV_HAVE_AVX2 && LV_HAVE_FMA
  #include <immintrin.h>
-static inline void volk_32f_x2_dot_prod_32f_a_avx2_fma(float * result, const float * input, const float* taps, unsigned int num_points){
-  unsigned int number;
-  const unsigned int eighthPoints = num_points / 8;
-
-  const float* aPtr = input;
-  const float* bPtr = taps;
+static inline void volk_32f_x2_dot_prod_32f_a_avx2_fma(float* result,
+                                                       const float* input,
+                                                       const float* taps,
+                                                       unsigned int num_points)
+{
+    unsigned int number;
+    const unsigned int eighthPoints = num_points / 8;
  
-  __m256 dotProdVal = _mm256_setzero_ps();
-  __m256 aVal1, bVal1;
+    const float* aPtr = input;
+    const float* bPtr = taps;
  
-  for (number = 0; number < eighthPoints; number++ ) {
+    __m256 dotProdVal = _mm256_setzero_ps();
+    __m256 aVal1, bVal1;
  
-    aVal1 = _mm256_load_ps(aPtr);
-    bVal1 = _mm256_load_ps(bPtr);
-    aPtr += 8;
-    bPtr += 8;
+    for (number = 0; number < eighthPoints; number++) {
  
-    dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
-  }
+        aVal1 = _mm256_load_ps(aPtr);
+        bVal1 = _mm256_load_ps(bPtr);
+        aPtr += 8;
+        bPtr += 8;
  
-  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
-  _mm256_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
-  _mm256_zeroupper();
+        dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
+    }
  
-  float dotProduct =
-    dotProductVector[0] + dotProductVector[1] +
-    dotProductVector[2] + dotProductVector[3] +
-    dotProductVector[4] + dotProductVector[5] +
-    dotProductVector[6] + dotProductVector[7];
+    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+    _mm256_store_ps(dotProductVector,
+                    dotProdVal); // Store the results back into the dot product vector
+    _mm256_zeroupper();
  
-  for(number = eighthPoints * 8; number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
+    float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
+                       dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
+                       dotProductVector[6] + dotProductVector[7];
  
-  *result = dotProduct;
+    for (number = eighthPoints * 8; number < num_points; number++) {
+        dotProduct += ((*aPtr++) * (*bPtr++));
+    }
  
+    *result = dotProduct;
  }
  #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
  
  #if LV_HAVE_AVX512F
  #include <immintrin.h>
-static inline void volk_32f_x2_dot_prod_32f_a_avx512f(float * result, const float * input, const float* taps, unsigned int num_points){
-  unsigned int number;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  const float* aPtr = input;
-  const float* bPtr = taps;
+static inline void volk_32f_x2_dot_prod_32f_a_avx512f(float* result,
+                                                      const float* input,
+                                                      const float* taps,
+                                                      unsigned int num_points)
+{
+    unsigned int number;
+    const unsigned int sixteenthPoints = num_points / 16;
  
-  __m512 dotProdVal = _mm512_setzero_ps();
-  __m512 aVal1, bVal1;
+    const float* aPtr = input;
+    const float* bPtr = taps;
  
-  for (number = 0; number < sixteenthPoints; number++ ) {
+    __m512 dotProdVal = _mm512_setzero_ps();
+    __m512 aVal1, bVal1;
  
-    aVal1 = _mm512_load_ps(aPtr);
-    bVal1 = _mm512_load_ps(bPtr);
-    aPtr += 16;
-    bPtr += 16;
+    for (number = 0; number < sixteenthPoints; number++) {
  
-    dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
-  }
+        aVal1 = _mm512_load_ps(aPtr);
+        bVal1 = _mm512_load_ps(bPtr);
+        aPtr += 16;
+        bPtr += 16;
  
-  __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
-  _mm512_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
+        dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
+    }
  
-  float dotProduct =
-    dotProductVector[0] + dotProductVector[1] +
-    dotProductVector[2] + dotProductVector[3] +
-    dotProductVector[4] + dotProductVector[5] +
-    dotProductVector[6] + dotProductVector[7] +
-    dotProductVector[8] + dotProductVector[9] +
-    dotProductVector[10] + dotProductVector[11] +
-    dotProductVector[12] + dotProductVector[13] +
-    dotProductVector[14] + dotProductVector[15];
+    __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
+    _mm512_store_ps(dotProductVector,
+                    dotProdVal); // Store the results back into the dot product vector
  
-  for(number = sixteenthPoints * 16; number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
+    float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
+                       dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
+                       dotProductVector[6] + dotProductVector[7] + dotProductVector[8] +
+                       dotProductVector[9] + dotProductVector[10] + dotProductVector[11] +
+                       dotProductVector[12] + dotProductVector[13] +
+                       dotProductVector[14] + dotProductVector[15];
  
-  *result = dotProduct;
+    for (number = sixteenthPoints * 16; number < num_points; number++) {
+        dotProduct += ((*aPtr++) * (*bPtr++));
+    }
  
+    *result = dotProduct;
  }
  #endif /* LV_HAVE_AVX512F */
  
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void volk_32f_x2_dot_prod_32f_neonopts(float * result, const float * input, const float * taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_neonopts(float* result,
+                                                     const float* input,
+                                                     const float* taps,
+                                                     unsigned int num_points)
+{
  
      unsigned int quarter_points = num_points / 16;
      float dotProduct = 0;
      const float* aPtr = input;
-    const float* bPtr=  taps;
+    const float* bPtr = taps;
      unsigned int number = 0;
  
      float32x4x4_t a_val, b_val, accumulator0;
@@ -838,7 +906,7 @@ static inline void volk_32f_x2_dot_prod_32f_neonopts(float * result, const float
      accumulator0.val[3] = vdupq_n_f32(0);
      // factor of 4 loop unroll with independent accumulators
      // uses 12 out of 16 neon q registers
-    for( number = 0; number < quarter_points; ++number) {
+    for (number = 0; number < quarter_points; ++number) {
          a_val = vld4q_f32(aPtr);
          b_val = vld4q_f32(bPtr);
          accumulator0.val[0] = vmlaq_f32(accumulator0.val[0], a_val.val[0], b_val.val[0]);
@@ -855,8 +923,8 @@ static inline void volk_32f_x2_dot_prod_32f_neonopts(float * result, const float
      vst1q_f32(accumulator, accumulator0.val[0]);
      dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
  
-    for(number = quarter_points*16; number < num_points; number++){
-      dotProduct += ((*aPtr++) * (*bPtr++));
+    for (number = quarter_points * 16; number < num_points; number++) {
+        dotProduct += ((*aPtr++) * (*bPtr++));
      }
  
      *result = dotProduct;
@@ -865,26 +933,30 @@ static inline void volk_32f_x2_dot_prod_32f_neonopts(float * result, const float
  #endif
  
  
-
-
  #ifdef LV_HAVE_NEON
-static inline void volk_32f_x2_dot_prod_32f_neon(float * result, const float * input, const float * taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_neon(float* result,
+                                                 const float* input,
+                                                 const float* taps,
+                                                 unsigned int num_points)
+{
  
      unsigned int quarter_points = num_points / 8;
      float dotProduct = 0;
      const float* aPtr = input;
-    const float* bPtr=  taps;
+    const float* bPtr = taps;
      unsigned int number = 0;
  
      float32x4x2_t a_val, b_val, accumulator_val;
      accumulator_val.val[0] = vdupq_n_f32(0);
      accumulator_val.val[1] = vdupq_n_f32(0);
      // factor of 2 loop unroll with independent accumulators
-    for( number = 0; number < quarter_points; ++number) {
+    for (number = 0; number < quarter_points; ++number) {
          a_val = vld2q_f32(aPtr);
          b_val = vld2q_f32(bPtr);
-        accumulator_val.val[0] = vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]);
-        accumulator_val.val[1] = vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]);
+        accumulator_val.val[0] =
+            vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]);
+        accumulator_val.val[1] =
+            vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]);
          aPtr += 8;
          bPtr += 8;
      }
@@ -893,8 +965,8 @@ static inline void volk_32f_x2_dot_prod_32f_neon(float * result, const float * i
      vst1q_f32(accumulator, accumulator_val.val[0]);
      dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
  
-    for(number = quarter_points*8; number < num_points; number++){
-      dotProduct += ((*aPtr++) * (*bPtr++));
+    for (number = quarter_points * 8; number < num_points; number++) {
+        dotProduct += ((*aPtr++) * (*bPtr++));
      }
  
      *result = dotProduct;
@@ -903,11 +975,17 @@ static inline void volk_32f_x2_dot_prod_32f_neon(float * result, const float * i
  #endif /* LV_HAVE_NEON */
  
  #ifdef LV_HAVE_NEONV7
-extern void volk_32f_x2_dot_prod_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+extern void volk_32f_x2_dot_prod_32f_a_neonasm(float* cVector,
+                                               const float* aVector,
+                                               const float* bVector,
+                                               unsigned int num_points);
  #endif /* LV_HAVE_NEONV7 */
  
  #ifdef LV_HAVE_NEONV7
-extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector,
+                                                    const float* aVector,
+                                                    const float* bVector,
+                                                    unsigned int num_points);
  #endif /* LV_HAVE_NEONV7 */
  
  #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/
diff --git a/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h b/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h

index e1da185575265206377ac399bd4d9dc843bd79bc..3a3caca34b4abb4aaa55499e0eff5e096d8c86e8 100644 (file)
--- a/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h
+++ b/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h
@@ -28,32 +28,44 @@
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void volk_32f_x2_fm_detectpuppet_32f_a_avx(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points)
+static inline void volk_32f_x2_fm_detectpuppet_32f_a_avx(float* outputVector,
+                                                         const float* inputVector,
+                                                         float* saveValue,
+                                                         unsigned int num_points)
  {
-  const float bound = 1.0f;
+    const float bound = 1.0f;
  
-  volk_32f_s32f_32f_fm_detect_32f_a_avx(outputVector, inputVector, bound, saveValue, num_points);
+    volk_32f_s32f_32f_fm_detect_32f_a_avx(
+        outputVector, inputVector, bound, saveValue, num_points);
  }
  #endif /* LV_HAVE_AVX */
  
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void volk_32f_x2_fm_detectpuppet_32f_a_sse(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points)
+static inline void volk_32f_x2_fm_detectpuppet_32f_a_sse(float* outputVector,
+                                                         const float* inputVector,
+                                                         float* saveValue,
+                                                         unsigned int num_points)
  {
-  const float bound = 1.0f;
+    const float bound = 1.0f;
  
-  volk_32f_s32f_32f_fm_detect_32f_a_sse(outputVector, inputVector, bound, saveValue, num_points);
+    volk_32f_s32f_32f_fm_detect_32f_a_sse(
+        outputVector, inputVector, bound, saveValue, num_points);
  }
  #endif /* LV_HAVE_SSE */
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void volk_32f_x2_fm_detectpuppet_32f_generic(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points)
+static inline void volk_32f_x2_fm_detectpuppet_32f_generic(float* outputVector,
+                                                           const float* inputVector,
+                                                           float* saveValue,
+                                                           unsigned int num_points)
  {
-  const float bound = 1.0f;
+    const float bound = 1.0f;
  
-  volk_32f_s32f_32f_fm_detect_32f_generic(outputVector, inputVector, bound, saveValue, num_points);
+    volk_32f_s32f_32f_fm_detect_32f_generic(
+        outputVector, inputVector, bound, saveValue, num_points);
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -69,11 +81,15 @@ static inline void volk_32f_x2_fm_detectpuppet_32f_generic(float* outputVector,
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void volk_32f_x2_fm_detectpuppet_32f_u_avx(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points)
+static inline void volk_32f_x2_fm_detectpuppet_32f_u_avx(float* outputVector,
+                                                         const float* inputVector,
+                                                         float* saveValue,
+                                                         unsigned int num_points)
  {
-  const float bound = 1.0f;
+    const float bound = 1.0f;
  
-  volk_32f_s32f_32f_fm_detect_32f_u_avx(outputVector, inputVector, bound, saveValue, num_points);
+    volk_32f_s32f_32f_fm_detect_32f_u_avx(
+        outputVector, inputVector, bound, saveValue, num_points);
  }
  #endif /* LV_HAVE_AVX */
  #endif /* INCLUDED_volk_32f_x2_fm_detectpuppet_32f_u_H */
diff --git a/kernels/volk/volk_32f_x2_interleave_32fc.h b/kernels/volk/volk_32f_x2_interleave_32fc.h

index ef8ada2da272fd2fe8a0c5a6d42704f912f9771d..d0cc6dd467f2cdb83277001db4d43f5a3db721bc 100644 (file)
--- a/kernels/volk/volk_32f_x2_interleave_32fc.h
+++ b/kernels/volk/volk_32f_x2_interleave_32fc.h
@@ -33,8 +33,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32f_x2_interleave_32fc(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points)
- * \endcode
+ * void volk_32f_x2_interleave_32fc(lv_32fc_t* complexVector, const float* iBuffer, const
+ * float* qBuffer, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li iBuffer: Input vector of samples for the real part.
@@ -79,44 +79,45 @@
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32f_x2_interleave_32fc_a_avx(lv_32fc_t* complexVector, const float* iBuffer,
-                                  const float* qBuffer, unsigned int num_points)
+static inline void volk_32f_x2_interleave_32fc_a_avx(lv_32fc_t* complexVector,
+                                                     const float* iBuffer,
+                                                     const float* qBuffer,
+                                                     unsigned int num_points)
  {
-  unsigned int number = 0;
-  float* complexVectorPtr = (float*)complexVector;
-  const float* iBufferPtr = iBuffer;
-  const float* qBufferPtr = qBuffer;
-
-  const uint64_t eighthPoints = num_points / 8;
-
-  __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue;
-  for(;number < eighthPoints; number++){
-    iValue = _mm256_load_ps(iBufferPtr);
-    qValue = _mm256_load_ps(qBufferPtr);
-
-    // Interleaves the lower two values in the i and q variables into one buffer
-    cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
-    // Interleaves the upper two values in the i and q variables into one buffer
-    cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
-
-    cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
-    _mm256_store_ps(complexVectorPtr, cplxValue);
-    complexVectorPtr += 8;
-
-    cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
-    _mm256_store_ps(complexVectorPtr, cplxValue);
-    complexVectorPtr += 8;
-
-    iBufferPtr += 8;
-    qBufferPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    *complexVectorPtr++ = *iBufferPtr++;
-    *complexVectorPtr++ = *qBufferPtr++;
-  }
+    unsigned int number = 0;
+    float* complexVectorPtr = (float*)complexVector;
+    const float* iBufferPtr = iBuffer;
+    const float* qBufferPtr = qBuffer;
+
+    const uint64_t eighthPoints = num_points / 8;
+
+    __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue;
+    for (; number < eighthPoints; number++) {
+        iValue = _mm256_load_ps(iBufferPtr);
+        qValue = _mm256_load_ps(qBufferPtr);
+
+        // Interleaves the lower two values in the i and q variables into one buffer
+        cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
+        // Interleaves the upper two values in the i and q variables into one buffer
+        cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
+
+        cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+        _mm256_store_ps(complexVectorPtr, cplxValue);
+        complexVectorPtr += 8;
+
+        cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+        _mm256_store_ps(complexVectorPtr, cplxValue);
+        complexVectorPtr += 8;
+
+        iBufferPtr += 8;
+        qBufferPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *complexVectorPtr++ = *iBufferPtr++;
+        *complexVectorPtr++ = *qBufferPtr++;
+    }
  }
  
  #endif /* LV_HAV_AVX */
@@ -124,41 +125,42 @@ volk_32f_x2_interleave_32fc_a_avx(lv_32fc_t* complexVector, const float* iBuffer
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector, const float* iBuffer,
-                                  const float* qBuffer, unsigned int num_points)
+static inline void volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector,
+                                                     const float* iBuffer,
+                                                     const float* qBuffer,
+                                                     unsigned int num_points)
  {
-  unsigned int number = 0;
-  float* complexVectorPtr = (float*)complexVector;
-  const float* iBufferPtr = iBuffer;
-  const float* qBufferPtr = qBuffer;
-
-  const uint64_t quarterPoints = num_points / 4;
-
-  __m128 iValue, qValue, cplxValue;
-  for(;number < quarterPoints; number++){
-    iValue = _mm_load_ps(iBufferPtr);
-    qValue = _mm_load_ps(qBufferPtr);
-
-    // Interleaves the lower two values in the i and q variables into one buffer
-    cplxValue = _mm_unpacklo_ps(iValue, qValue);
-    _mm_store_ps(complexVectorPtr, cplxValue);
-    complexVectorPtr += 4;
-
-    // Interleaves the upper two values in the i and q variables into one buffer
-    cplxValue = _mm_unpackhi_ps(iValue, qValue);
-    _mm_store_ps(complexVectorPtr, cplxValue);
-    complexVectorPtr += 4;
-
-    iBufferPtr += 4;
-    qBufferPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    *complexVectorPtr++ = *iBufferPtr++;
-    *complexVectorPtr++ = *qBufferPtr++;
-  }
+    unsigned int number = 0;
+    float* complexVectorPtr = (float*)complexVector;
+    const float* iBufferPtr = iBuffer;
+    const float* qBufferPtr = qBuffer;
+
+    const uint64_t quarterPoints = num_points / 4;
+
+    __m128 iValue, qValue, cplxValue;
+    for (; number < quarterPoints; number++) {
+        iValue = _mm_load_ps(iBufferPtr);
+        qValue = _mm_load_ps(qBufferPtr);
+
+        // Interleaves the lower two values in the i and q variables into one buffer
+        cplxValue = _mm_unpacklo_ps(iValue, qValue);
+        _mm_store_ps(complexVectorPtr, cplxValue);
+        complexVectorPtr += 4;
+
+        // Interleaves the upper two values in the i and q variables into one buffer
+        cplxValue = _mm_unpackhi_ps(iValue, qValue);
+        _mm_store_ps(complexVectorPtr, cplxValue);
+        complexVectorPtr += 4;
+
+        iBufferPtr += 4;
+        qBufferPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *complexVectorPtr++ = *iBufferPtr++;
+        *complexVectorPtr++ = *qBufferPtr++;
+    }
  }
  #endif /* LV_HAVE_SSE */
  
@@ -166,52 +168,53 @@ volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector, const float* iBuffer
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void
-volk_32f_x2_interleave_32fc_neon(lv_32fc_t* complexVector, const float* iBuffer,
-                                 const float* qBuffer, unsigned int num_points)
+static inline void volk_32f_x2_interleave_32fc_neon(lv_32fc_t* complexVector,
+                                                    const float* iBuffer,
+                                                    const float* qBuffer,
+                                                    unsigned int num_points)
  {
-  unsigned int quarter_points = num_points / 4;
-  unsigned int number;
-  float* complexVectorPtr = (float*) complexVector;
-
-  float32x4x2_t complex_vec;
-  for(number=0; number < quarter_points; ++number) {
-    complex_vec.val[0] = vld1q_f32(iBuffer);
-    complex_vec.val[1] = vld1q_f32(qBuffer);
-    vst2q_f32(complexVectorPtr, complex_vec);
-    iBuffer += 4;
-    qBuffer += 4;
-    complexVectorPtr += 8;
-  }
-
-  for(number=quarter_points * 4; number < num_points; ++number) {
-    *complexVectorPtr++ = *iBuffer++;
-    *complexVectorPtr++ = *qBuffer++;
-  }
+    unsigned int quarter_points = num_points / 4;
+    unsigned int number;
+    float* complexVectorPtr = (float*)complexVector;
+
+    float32x4x2_t complex_vec;
+    for (number = 0; number < quarter_points; ++number) {
+        complex_vec.val[0] = vld1q_f32(iBuffer);
+        complex_vec.val[1] = vld1q_f32(qBuffer);
+        vst2q_f32(complexVectorPtr, complex_vec);
+        iBuffer += 4;
+        qBuffer += 4;
+        complexVectorPtr += 8;
+    }
+
+    for (number = quarter_points * 4; number < num_points; ++number) {
+        *complexVectorPtr++ = *iBuffer++;
+        *complexVectorPtr++ = *qBuffer++;
+    }
  }
  #endif /* LV_HAVE_NEON */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32f_x2_interleave_32fc_generic(lv_32fc_t* complexVector, const float* iBuffer,
-                                    const float* qBuffer, unsigned int num_points)
+static inline void volk_32f_x2_interleave_32fc_generic(lv_32fc_t* complexVector,
+                                                       const float* iBuffer,
+                                                       const float* qBuffer,
+                                                       unsigned int num_points)
  {
-  float* complexVectorPtr = (float*)complexVector;
-  const float* iBufferPtr = iBuffer;
-  const float* qBufferPtr = qBuffer;
-  unsigned int number;
-
-  for(number = 0; number < num_points; number++){
-    *complexVectorPtr++ = *iBufferPtr++;
-    *complexVectorPtr++ = *qBufferPtr++;
-  }
+    float* complexVectorPtr = (float*)complexVector;
+    const float* iBufferPtr = iBuffer;
+    const float* qBufferPtr = qBuffer;
+    unsigned int number;
+
+    for (number = 0; number < num_points; number++) {
+        *complexVectorPtr++ = *iBufferPtr++;
+        *complexVectorPtr++ = *qBufferPtr++;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  
-
  #endif /* INCLUDED_volk_32f_x2_interleave_32fc_a_H */
  
  #ifndef INCLUDED_volk_32f_x2_interleave_32fc_u_H
@@ -223,44 +226,45 @@ volk_32f_x2_interleave_32fc_generic(lv_32fc_t* complexVector, const float* iBuff
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32f_x2_interleave_32fc_u_avx(lv_32fc_t* complexVector, const float* iBuffer,
-                                  const float* qBuffer, unsigned int num_points)
+static inline void volk_32f_x2_interleave_32fc_u_avx(lv_32fc_t* complexVector,
+                                                     const float* iBuffer,
+                                                     const float* qBuffer,
+                                                     unsigned int num_points)
  {
-  unsigned int number = 0;
-  float* complexVectorPtr = (float*)complexVector;
-  const float* iBufferPtr = iBuffer;
-  const float* qBufferPtr = qBuffer;
-
-  const uint64_t eighthPoints = num_points / 8;
-
-  __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue;
-  for(;number < eighthPoints; number++){
-    iValue = _mm256_loadu_ps(iBufferPtr);
-    qValue = _mm256_loadu_ps(qBufferPtr);
-
-    // Interleaves the lower two values in the i and q variables into one buffer
-    cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
-    // Interleaves the upper two values in the i and q variables into one buffer
-    cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
-
-    cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
-    _mm256_storeu_ps(complexVectorPtr, cplxValue);
-    complexVectorPtr += 8;
-
-    cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
-    _mm256_storeu_ps(complexVectorPtr, cplxValue);
-    complexVectorPtr += 8;
-
-    iBufferPtr += 8;
-    qBufferPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    *complexVectorPtr++ = *iBufferPtr++;
-    *complexVectorPtr++ = *qBufferPtr++;
-  }
+    unsigned int number = 0;
+    float* complexVectorPtr = (float*)complexVector;
+    const float* iBufferPtr = iBuffer;
+    const float* qBufferPtr = qBuffer;
+
+    const uint64_t eighthPoints = num_points / 8;
+
+    __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue;
+    for (; number < eighthPoints; number++) {
+        iValue = _mm256_loadu_ps(iBufferPtr);
+        qValue = _mm256_loadu_ps(qBufferPtr);
+
+        // Interleaves the lower two values in the i and q variables into one buffer
+        cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
+        // Interleaves the upper two values in the i and q variables into one buffer
+        cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
+
+        cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+        _mm256_storeu_ps(complexVectorPtr, cplxValue);
+        complexVectorPtr += 8;
+
+        cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+        _mm256_storeu_ps(complexVectorPtr, cplxValue);
+        complexVectorPtr += 8;
+
+        iBufferPtr += 8;
+        qBufferPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *complexVectorPtr++ = *iBufferPtr++;
+        *complexVectorPtr++ = *qBufferPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX */
  
diff --git a/kernels/volk/volk_32f_x2_max_32f.h b/kernels/volk/volk_32f_x2_max_32f.h

index 82086a640bcdda49c4e0798a60ec4e24d3a05273..c7eb67f6480b0cdf040c7927232ca126ba7fa49e 100644 (file)
--- a/kernels/volk/volk_32f_x2_max_32f.h
+++ b/kernels/volk/volk_32f_x2_max_32f.h
@@ -32,8 +32,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32f_x2_max_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
- * \endcode
+ * void volk_32f_x2_max_32f(float* cVector, const float* aVector, const float* bVector,
+ * unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li aVector: First input vector.
@@ -77,176 +77,183 @@
  #ifdef LV_HAVE_AVX512F
  #include <immintrin.h>
  
-static inline void
-volk_32f_x2_max_32f_a_avx512f(float* cVector, const float* aVector,
-                          const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_max_32f_a_avx512f(float* cVector,
+                                                 const float* aVector,
+                                                 const float* bVector,
+                                                 unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m512 aVal, bVal, cVal;
-  for(;number < sixteenthPoints; number++){
-    aVal = _mm512_load_ps(aPtr);
-    bVal = _mm512_load_ps(bPtr);
+    __m512 aVal, bVal, cVal;
+    for (; number < sixteenthPoints; number++) {
+        aVal = _mm512_load_ps(aPtr);
+        bVal = _mm512_load_ps(bPtr);
  
-    cVal = _mm512_max_ps(aVal, bVal);
+        cVal = _mm512_max_ps(aVal, bVal);
  
-    _mm512_store_ps(cPtr,cVal); // Store the results back into the C container
+        _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 16;
-    bPtr += 16;
-    cPtr += 16;
-  }
+        aPtr += 16;
+        bPtr += 16;
+        cPtr += 16;
+    }
  
-  number = sixteenthPoints * 16;
-  for(;number < num_points; number++){
-    const float a = *aPtr++;
-    const float b = *bPtr++;
-    *cPtr++ = ( a > b ? a : b);
-  }
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        const float a = *aPtr++;
+        const float b = *bPtr++;
+        *cPtr++ = (a > b ? a : b);
+    }
  }
  #endif /* LV_HAVE_AVX512F */
  
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_32f_x2_max_32f_a_sse(float* cVector, const float* aVector,
-                          const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_max_32f_a_sse(float* cVector,
+                                             const float* aVector,
+                                             const float* bVector,
+                                             unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m128 aVal, bVal, cVal;
-  for(;number < quarterPoints; number++){
-    aVal = _mm_load_ps(aPtr);
-    bVal = _mm_load_ps(bPtr);
+    __m128 aVal, bVal, cVal;
+    for (; number < quarterPoints; number++) {
+        aVal = _mm_load_ps(aPtr);
+        bVal = _mm_load_ps(bPtr);
  
-    cVal = _mm_max_ps(aVal, bVal);
+        cVal = _mm_max_ps(aVal, bVal);
  
-    _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+        _mm_store_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 4;
-    bPtr += 4;
-    cPtr += 4;
-  }
+        aPtr += 4;
+        bPtr += 4;
+        cPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    const float a = *aPtr++;
-    const float b = *bPtr++;
-    *cPtr++ = ( a > b ? a : b);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        const float a = *aPtr++;
+        const float b = *bPtr++;
+        *cPtr++ = (a > b ? a : b);
+    }
  }
  #endif /* LV_HAVE_SSE */
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32f_x2_max_32f_a_avx(float* cVector, const float* aVector,
-                          const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_max_32f_a_avx(float* cVector,
+                                             const float* aVector,
+                                             const float* bVector,
+                                             unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m256 aVal, bVal, cVal;
-  for(;number < eighthPoints; number++){
-    aVal = _mm256_load_ps(aPtr);
-    bVal = _mm256_load_ps(bPtr);
+    __m256 aVal, bVal, cVal;
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_load_ps(aPtr);
+        bVal = _mm256_load_ps(bPtr);
  
-    cVal = _mm256_max_ps(aVal, bVal);
+        cVal = _mm256_max_ps(aVal, bVal);
  
-    _mm256_store_ps(cPtr,cVal); // Store the results back into the C container
+        _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 8;
-    bPtr += 8;
-    cPtr += 8;
-  }
+        aPtr += 8;
+        bPtr += 8;
+        cPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    const float a = *aPtr++;
-    const float b = *bPtr++;
-    *cPtr++ = ( a > b ? a : b);
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        const float a = *aPtr++;
+        const float b = *bPtr++;
+        *cPtr++ = (a > b ? a : b);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void
-volk_32f_x2_max_32f_neon(float* cVector, const float* aVector,
-                         const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_max_32f_neon(float* cVector,
+                                            const float* aVector,
+                                            const float* bVector,
+                                            unsigned int num_points)
  {
-  unsigned int quarter_points = num_points / 4;
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
-  unsigned int number = 0;
-
-  float32x4_t a_vec, b_vec, c_vec;
-  for(number = 0; number < quarter_points; number++){
-    a_vec = vld1q_f32(aPtr);
-    b_vec = vld1q_f32(bPtr);
-    c_vec = vmaxq_f32(a_vec, b_vec);
-    vst1q_f32(cPtr, c_vec);
-    aPtr += 4;
-    bPtr += 4;
-    cPtr += 4;
-  }
-
-  for(number = quarter_points*4; number < num_points; number++){
-    const float a = *aPtr++;
-    const float b = *bPtr++;
-    *cPtr++ = ( a > b ? a : b);
-  }
+    unsigned int quarter_points = num_points / 4;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
+    unsigned int number = 0;
+
+    float32x4_t a_vec, b_vec, c_vec;
+    for (number = 0; number < quarter_points; number++) {
+        a_vec = vld1q_f32(aPtr);
+        b_vec = vld1q_f32(bPtr);
+        c_vec = vmaxq_f32(a_vec, b_vec);
+        vst1q_f32(cPtr, c_vec);
+        aPtr += 4;
+        bPtr += 4;
+        cPtr += 4;
+    }
+
+    for (number = quarter_points * 4; number < num_points; number++) {
+        const float a = *aPtr++;
+        const float b = *bPtr++;
+        *cPtr++ = (a > b ? a : b);
+    }
  }
  #endif /* LV_HAVE_NEON */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32f_x2_max_32f_generic(float* cVector, const float* aVector,
-                            const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_max_32f_generic(float* cVector,
+                                               const float* aVector,
+                                               const float* bVector,
+                                               unsigned int num_points)
  {
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    const float a = *aPtr++;
-    const float b = *bPtr++;
-    *cPtr++ = ( a > b ? a : b);
-  }
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        const float a = *aPtr++;
+        const float b = *bPtr++;
+        *cPtr++ = (a > b ? a : b);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  #ifdef LV_HAVE_ORC
-extern void
-volk_32f_x2_max_32f_a_orc_impl(float* cVector, const float* aVector,
-                               const float* bVector, unsigned int num_points);
-
-static inline void
-volk_32f_x2_max_32f_u_orc(float* cVector, const float* aVector,
-                          const float* bVector, unsigned int num_points)
+extern void volk_32f_x2_max_32f_a_orc_impl(float* cVector,
+                                           const float* aVector,
+                                           const float* bVector,
+                                           unsigned int num_points);
+
+static inline void volk_32f_x2_max_32f_u_orc(float* cVector,
+                                             const float* aVector,
+                                             const float* bVector,
+                                             unsigned int num_points)
  {
-  volk_32f_x2_max_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+    volk_32f_x2_max_32f_a_orc_impl(cVector, aVector, bVector, num_points);
  }
  #endif /* LV_HAVE_ORC */
  
@@ -263,74 +270,76 @@ volk_32f_x2_max_32f_u_orc(float* cVector, const float* aVector,
  #ifdef LV_HAVE_AVX512F
  #include <immintrin.h>
  
-static inline void
-volk_32f_x2_max_32f_u_avx512f(float* cVector, const float* aVector,
-                          const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_max_32f_u_avx512f(float* cVector,
+                                                 const float* aVector,
+                                                 const float* bVector,
+                                                 unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m512 aVal, bVal, cVal;
-  for(;number < sixteenthPoints; number++){
-    aVal = _mm512_loadu_ps(aPtr);
-    bVal = _mm512_loadu_ps(bPtr);
+    __m512 aVal, bVal, cVal;
+    for (; number < sixteenthPoints; number++) {
+        aVal = _mm512_loadu_ps(aPtr);
+        bVal = _mm512_loadu_ps(bPtr);
  
-    cVal = _mm512_max_ps(aVal, bVal);
+        cVal = _mm512_max_ps(aVal, bVal);
  
-    _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container
+        _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 16;
-    bPtr += 16;
-    cPtr += 16;
-  }
+        aPtr += 16;
+        bPtr += 16;
+        cPtr += 16;
+    }
  
-  number = sixteenthPoints * 16;
-  for(;number < num_points; number++){
-    const float a = *aPtr++;
-    const float b = *bPtr++;
-    *cPtr++ = ( a > b ? a : b);
-  }
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        const float a = *aPtr++;
+        const float b = *bPtr++;
+        *cPtr++ = (a > b ? a : b);
+    }
  }
  #endif /* LV_HAVE_AVX512F */
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32f_x2_max_32f_u_avx(float* cVector, const float* aVector,
-                          const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_max_32f_u_avx(float* cVector,
+                                             const float* aVector,
+                                             const float* bVector,
+                                             unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m256 aVal, bVal, cVal;
-  for(;number < eighthPoints; number++){
-    aVal = _mm256_loadu_ps(aPtr);
-    bVal = _mm256_loadu_ps(bPtr);
+    __m256 aVal, bVal, cVal;
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_loadu_ps(aPtr);
+        bVal = _mm256_loadu_ps(bPtr);
  
-    cVal = _mm256_max_ps(aVal, bVal);
+        cVal = _mm256_max_ps(aVal, bVal);
  
-    _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
+        _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 8;
-    bPtr += 8;
-    cPtr += 8;
-  }
+        aPtr += 8;
+        bPtr += 8;
+        cPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    const float a = *aPtr++;
-    const float b = *bPtr++;
-    *cPtr++ = ( a > b ? a : b);
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        const float a = *aPtr++;
+        const float b = *bPtr++;
+        *cPtr++ = (a > b ? a : b);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
diff --git a/kernels/volk/volk_32f_x2_min_32f.h b/kernels/volk/volk_32f_x2_min_32f.h

index 454eb7665266e40149e21f63892418b124596abd..aecd11a87e5f5721129fc3a9c7b63927c61cc610 100644 (file)
--- a/kernels/volk/volk_32f_x2_min_32f.h
+++ b/kernels/volk/volk_32f_x2_min_32f.h
@@ -32,8 +32,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32f_x2_min_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
- * \endcode
+ * void volk_32f_x2_min_32f(float* cVector, const float* aVector, const float* bVector,
+ * unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li aVector: First input vector.
@@ -77,37 +77,38 @@
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_32f_x2_min_32f_a_sse(float* cVector, const float* aVector,
-                          const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_min_32f_a_sse(float* cVector,
+                                             const float* aVector,
+                                             const float* bVector,
+                                             unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m128 aVal, bVal, cVal;
-  for(;number < quarterPoints; number++){
-    aVal = _mm_load_ps(aPtr);
-    bVal = _mm_load_ps(bPtr);
+    __m128 aVal, bVal, cVal;
+    for (; number < quarterPoints; number++) {
+        aVal = _mm_load_ps(aPtr);
+        bVal = _mm_load_ps(bPtr);
  
-    cVal = _mm_min_ps(aVal, bVal);
+        cVal = _mm_min_ps(aVal, bVal);
  
-    _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+        _mm_store_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 4;
-    bPtr += 4;
-    cPtr += 4;
-  }
+        aPtr += 4;
+        bPtr += 4;
+        cPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    const float a = *aPtr++;
-    const float b = *bPtr++;
-    *cPtr++ = ( a < b ? a : b);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        const float a = *aPtr++;
+        const float b = *bPtr++;
+        *cPtr++ = (a < b ? a : b);
+    }
  }
  #endif /* LV_HAVE_SSE */
  
@@ -115,143 +116,149 @@ volk_32f_x2_min_32f_a_sse(float* cVector, const float* aVector,
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void
-volk_32f_x2_min_32f_neon(float* cVector, const float* aVector,
-                         const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_min_32f_neon(float* cVector,
+                                            const float* aVector,
+                                            const float* bVector,
+                                            unsigned int num_points)
  {
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
-  unsigned int number = 0;
-  unsigned int quarter_points = num_points / 4;
-
-  float32x4_t a_vec, b_vec, c_vec;
-  for(number = 0; number < quarter_points; number++){
-    a_vec = vld1q_f32(aPtr);
-    b_vec = vld1q_f32(bPtr);
-
-    c_vec = vminq_f32(a_vec, b_vec);
-
-    vst1q_f32(cPtr, c_vec);
-    aPtr += 4;
-    bPtr += 4;
-    cPtr += 4;
-  }
-
-  for(number = quarter_points*4; number < num_points; number++){
-    const float a = *aPtr++;
-    const float b = *bPtr++;
-    *cPtr++ = ( a < b ? a : b);
-  }
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
+    unsigned int number = 0;
+    unsigned int quarter_points = num_points / 4;
+
+    float32x4_t a_vec, b_vec, c_vec;
+    for (number = 0; number < quarter_points; number++) {
+        a_vec = vld1q_f32(aPtr);
+        b_vec = vld1q_f32(bPtr);
+
+        c_vec = vminq_f32(a_vec, b_vec);
+
+        vst1q_f32(cPtr, c_vec);
+        aPtr += 4;
+        bPtr += 4;
+        cPtr += 4;
+    }
+
+    for (number = quarter_points * 4; number < num_points; number++) {
+        const float a = *aPtr++;
+        const float b = *bPtr++;
+        *cPtr++ = (a < b ? a : b);
+    }
  }
  #endif /* LV_HAVE_NEON */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32f_x2_min_32f_generic(float* cVector, const float* aVector,
-                            const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_min_32f_generic(float* cVector,
+                                               const float* aVector,
+                                               const float* bVector,
+                                               unsigned int num_points)
  {
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    const float a = *aPtr++;
-    const float b = *bPtr++;
-    *cPtr++ = ( a < b ? a : b);
-  }
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        const float a = *aPtr++;
+        const float b = *bPtr++;
+        *cPtr++ = (a < b ? a : b);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  
  #ifdef LV_HAVE_ORC
  
-extern void
-volk_32f_x2_min_32f_a_orc_impl(float* cVector, const float* aVector,
-                               const float* bVector, unsigned int num_points);
+extern void volk_32f_x2_min_32f_a_orc_impl(float* cVector,
+                                           const float* aVector,
+                                           const float* bVector,
+                                           unsigned int num_points);
  
-static inline void
-volk_32f_x2_min_32f_u_orc(float* cVector, const float* aVector,
-                          const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_min_32f_u_orc(float* cVector,
+                                             const float* aVector,
+                                             const float* bVector,
+                                             unsigned int num_points)
  {
-  volk_32f_x2_min_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+    volk_32f_x2_min_32f_a_orc_impl(cVector, aVector, bVector, num_points);
  }
  #endif /* LV_HAVE_ORC */
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32f_x2_min_32f_a_avx(float* cVector, const float* aVector,
-                          const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_min_32f_a_avx(float* cVector,
+                                             const float* aVector,
+                                             const float* bVector,
+                                             unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m256 aVal, bVal, cVal;
-  for(;number < eighthPoints; number++){
-    aVal = _mm256_load_ps(aPtr);
-    bVal = _mm256_load_ps(bPtr);
+    __m256 aVal, bVal, cVal;
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_load_ps(aPtr);
+        bVal = _mm256_load_ps(bPtr);
  
-    cVal = _mm256_min_ps(aVal, bVal);
+        cVal = _mm256_min_ps(aVal, bVal);
  
-    _mm256_store_ps(cPtr,cVal); // Store the results back into the C container
+        _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 8;
-    bPtr += 8;
-    cPtr += 8;
-  }
+        aPtr += 8;
+        bPtr += 8;
+        cPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    const float a = *aPtr++;
-    const float b = *bPtr++;
-    *cPtr++ = ( a < b ? a : b);
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        const float a = *aPtr++;
+        const float b = *bPtr++;
+        *cPtr++ = (a < b ? a : b);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
  #ifdef LV_HAVE_AVX512F
  #include <immintrin.h>
  
-static inline void
-volk_32f_x2_min_32f_a_avx512f(float* cVector, const float* aVector,
-                          const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_min_32f_a_avx512f(float* cVector,
+                                                 const float* aVector,
+                                                 const float* bVector,
+                                                 unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m512 aVal, bVal, cVal;
-  for(;number < sixteenthPoints; number++){
-    aVal = _mm512_load_ps(aPtr);
-    bVal = _mm512_load_ps(bPtr);
+    __m512 aVal, bVal, cVal;
+    for (; number < sixteenthPoints; number++) {
+        aVal = _mm512_load_ps(aPtr);
+        bVal = _mm512_load_ps(bPtr);
  
-    cVal = _mm512_min_ps(aVal, bVal);
+        cVal = _mm512_min_ps(aVal, bVal);
  
-    _mm512_store_ps(cPtr,cVal); // Store the results back into the C container
+        _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 16;
-    bPtr += 16;
-    cPtr += 16;
-  }
+        aPtr += 16;
+        bPtr += 16;
+        cPtr += 16;
+    }
  
-  number = sixteenthPoints * 16;
-  for(;number < num_points; number++){
-    const float a = *aPtr++;
-    const float b = *bPtr++;
-    *cPtr++ = ( a < b ? a : b);
-  }
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        const float a = *aPtr++;
+        const float b = *bPtr++;
+        *cPtr++ = (a < b ? a : b);
+    }
  }
  #endif /* LV_HAVE_AVX512F */
  
@@ -267,74 +274,76 @@ volk_32f_x2_min_32f_a_avx512f(float* cVector, const float* aVector,
  #ifdef LV_HAVE_AVX512F
  #include <immintrin.h>
  
-static inline void
-volk_32f_x2_min_32f_u_avx512f(float* cVector, const float* aVector,
-                          const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_min_32f_u_avx512f(float* cVector,
+                                                 const float* aVector,
+                                                 const float* bVector,
+                                                 unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m512 aVal, bVal, cVal;
-  for(;number < sixteenthPoints; number++){
-    aVal = _mm512_loadu_ps(aPtr);
-    bVal = _mm512_loadu_ps(bPtr);
+    __m512 aVal, bVal, cVal;
+    for (; number < sixteenthPoints; number++) {
+        aVal = _mm512_loadu_ps(aPtr);
+        bVal = _mm512_loadu_ps(bPtr);
  
-    cVal = _mm512_min_ps(aVal, bVal);
+        cVal = _mm512_min_ps(aVal, bVal);
  
-    _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container
+        _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 16;
-    bPtr += 16;
-    cPtr += 16;
-  }
+        aPtr += 16;
+        bPtr += 16;
+        cPtr += 16;
+    }
  
-  number = sixteenthPoints * 16;
-  for(;number < num_points; number++){
-    const float a = *aPtr++;
-    const float b = *bPtr++;
-    *cPtr++ = ( a < b ? a : b);
-  }
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        const float a = *aPtr++;
+        const float b = *bPtr++;
+        *cPtr++ = (a < b ? a : b);
+    }
  }
  #endif /* LV_HAVE_AVX512F */
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32f_x2_min_32f_u_avx(float* cVector, const float* aVector,
-                          const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_min_32f_u_avx(float* cVector,
+                                             const float* aVector,
+                                             const float* bVector,
+                                             unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m256 aVal, bVal, cVal;
-  for(;number < eighthPoints; number++){
-    aVal = _mm256_loadu_ps(aPtr);
-    bVal = _mm256_loadu_ps(bPtr);
+    __m256 aVal, bVal, cVal;
+    for (; number < eighthPoints; number++) {
+        aVal = _mm256_loadu_ps(aPtr);
+        bVal = _mm256_loadu_ps(bPtr);
  
-    cVal = _mm256_min_ps(aVal, bVal);
+        cVal = _mm256_min_ps(aVal, bVal);
  
-    _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
+        _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 8;
-    bPtr += 8;
-    cPtr += 8;
-  }
+        aPtr += 8;
+        bPtr += 8;
+        cPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    const float a = *aPtr++;
-    const float b = *bPtr++;
-    *cPtr++ = ( a < b ? a : b);
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        const float a = *aPtr++;
+        const float b = *bPtr++;
+        *cPtr++ = (a < b ? a : b);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
diff --git a/kernels/volk/volk_32f_x2_multiply_32f.h b/kernels/volk/volk_32f_x2_multiply_32f.h

index deb9ae33dbf4fc55cf8c2e502eae4427976383f4..eebba188b06acc264f263d2f276354811edf9576 100644 (file)
--- a/kernels/volk/volk_32f_x2_multiply_32f.h
+++ b/kernels/volk/volk_32f_x2_multiply_32f.h
@@ -31,8 +31,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32f_x2_multiply_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
- * \endcode
+ * void volk_32f_x2_multiply_32f(float* cVector, const float* aVector, const float*
+ * bVector, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li aVector: First input vector.
@@ -77,126 +77,130 @@
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_32f_x2_multiply_32f_u_sse(float* cVector, const float* aVector,
-                               const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector,
+                                                  const float* aVector,
+                                                  const float* bVector,
+                                                  unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m128 aVal, bVal, cVal;
-  for(;number < quarterPoints; number++){
+    __m128 aVal, bVal, cVal;
+    for (; number < quarterPoints; number++) {
  
-    aVal = _mm_loadu_ps(aPtr);
-    bVal = _mm_loadu_ps(bPtr);
+        aVal = _mm_loadu_ps(aPtr);
+        bVal = _mm_loadu_ps(bPtr);
  
-    cVal = _mm_mul_ps(aVal, bVal);
+        cVal = _mm_mul_ps(aVal, bVal);
  
-    _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container
+        _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 4;
-    bPtr += 4;
-    cPtr += 4;
-  }
+        aPtr += 4;
+        bPtr += 4;
+        cPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) * (*bPtr++);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_SSE */
  
  #ifdef LV_HAVE_AVX512F
  #include <immintrin.h>
  
-static inline void
-volk_32f_x2_multiply_32f_u_avx512f(float* cVector, const float* aVector,
-                               const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_multiply_32f_u_avx512f(float* cVector,
+                                                      const float* aVector,
+                                                      const float* bVector,
+                                                      unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m512 aVal, bVal, cVal;
-  for(;number < sixteenthPoints; number++){
+    __m512 aVal, bVal, cVal;
+    for (; number < sixteenthPoints; number++) {
  
-    aVal = _mm512_loadu_ps(aPtr);
-    bVal = _mm512_loadu_ps(bPtr);
+        aVal = _mm512_loadu_ps(aPtr);
+        bVal = _mm512_loadu_ps(bPtr);
  
-    cVal = _mm512_mul_ps(aVal, bVal);
+        cVal = _mm512_mul_ps(aVal, bVal);
  
-    _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container
+        _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 16;
-    bPtr += 16;
-    cPtr += 16;
-  }
+        aPtr += 16;
+        bPtr += 16;
+        cPtr += 16;
+    }
  
-  number = sixteenthPoints * 16;
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) * (*bPtr++);
-  }
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_AVX512F */
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32f_x2_multiply_32f_u_avx(float* cVector, const float* aVector,
-                               const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector,
+                                                  const float* aVector,
+                                                  const float* bVector,
+                                                  unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m256 aVal, bVal, cVal;
-  for(;number < eighthPoints; number++){
+    __m256 aVal, bVal, cVal;
+    for (; number < eighthPoints; number++) {
  
-    aVal = _mm256_loadu_ps(aPtr);
-    bVal = _mm256_loadu_ps(bPtr);
+        aVal = _mm256_loadu_ps(aPtr);
+        bVal = _mm256_loadu_ps(bPtr);
  
-    cVal = _mm256_mul_ps(aVal, bVal);
+        cVal = _mm256_mul_ps(aVal, bVal);
  
-    _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
+        _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 8;
-    bPtr += 8;
-    cPtr += 8;
-  }
+        aPtr += 8;
+        bPtr += 8;
+        cPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) * (*bPtr++);
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32f_x2_multiply_32f_generic(float* cVector, const float* aVector,
-                                 const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_multiply_32f_generic(float* cVector,
+                                                    const float* aVector,
+                                                    const float* bVector,
+                                                    unsigned int num_points)
  {
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *cPtr++ = (*aPtr++) * (*bPtr++);
-  }
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -213,72 +217,74 @@ volk_32f_x2_multiply_32f_generic(float* cVector, const float* aVector,
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_32f_x2_multiply_32f_a_sse(float* cVector, const float* aVector,
-                               const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_multiply_32f_a_sse(float* cVector,
+                                                  const float* aVector,
+                                                  const float* bVector,
+                                                  unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m128 aVal, bVal, cVal;
-  for(;number < quarterPoints; number++){
+    __m128 aVal, bVal, cVal;
+    for (; number < quarterPoints; number++) {
  
-    aVal = _mm_load_ps(aPtr);
-    bVal = _mm_load_ps(bPtr);
+        aVal = _mm_load_ps(aPtr);
+        bVal = _mm_load_ps(bPtr);
  
-    cVal = _mm_mul_ps(aVal, bVal);
+        cVal = _mm_mul_ps(aVal, bVal);
  
-    _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+        _mm_store_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 4;
-    bPtr += 4;
-    cPtr += 4;
-  }
+        aPtr += 4;
+        bPtr += 4;
+        cPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) * (*bPtr++);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_SSE */
  
  #ifdef LV_HAVE_AVX512F
  #include <immintrin.h>
  
-static inline void
-volk_32f_x2_multiply_32f_a_avx512f(float* cVector, const float* aVector,
-                               const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_multiply_32f_a_avx512f(float* cVector,
+                                                      const float* aVector,
+                                                      const float* bVector,
+                                                      unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m512 aVal, bVal, cVal;
-  for(;number < sixteenthPoints; number++){
+    __m512 aVal, bVal, cVal;
+    for (; number < sixteenthPoints; number++) {
  
-    aVal = _mm512_load_ps(aPtr);
-    bVal = _mm512_load_ps(bPtr);
+        aVal = _mm512_load_ps(aPtr);
+        bVal = _mm512_load_ps(bPtr);
  
-    cVal = _mm512_mul_ps(aVal, bVal);
+        cVal = _mm512_mul_ps(aVal, bVal);
  
-    _mm512_store_ps(cPtr,cVal); // Store the results back into the C container
+        _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 16;
-    bPtr += 16;
-    cPtr += 16;
-  }
+        aPtr += 16;
+        bPtr += 16;
+        cPtr += 16;
+    }
  
-  number = sixteenthPoints * 16;
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) * (*bPtr++);
-  }
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_AVX512F */
  
@@ -286,36 +292,37 @@ volk_32f_x2_multiply_32f_a_avx512f(float* cVector, const float* aVector,
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32f_x2_multiply_32f_a_avx(float* cVector, const float* aVector,
-                               const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_multiply_32f_a_avx(float* cVector,
+                                                  const float* aVector,
+                                                  const float* bVector,
+                                                  unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m256 aVal, bVal, cVal;
-  for(;number < eighthPoints; number++){
+    __m256 aVal, bVal, cVal;
+    for (; number < eighthPoints; number++) {
  
-    aVal = _mm256_load_ps(aPtr);
-    bVal = _mm256_load_ps(bPtr);
+        aVal = _mm256_load_ps(aPtr);
+        bVal = _mm256_load_ps(bPtr);
  
-    cVal = _mm256_mul_ps(aVal, bVal);
+        cVal = _mm256_mul_ps(aVal, bVal);
  
-    _mm256_store_ps(cPtr,cVal); // Store the results back into the C container
+        _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 8;
-    bPtr += 8;
-    cPtr += 8;
-  }
+        aPtr += 8;
+        bPtr += 8;
+        cPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) * (*bPtr++);
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
@@ -323,57 +330,61 @@ volk_32f_x2_multiply_32f_a_avx(float* cVector, const float* aVector,
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void
-volk_32f_x2_multiply_32f_neon(float* cVector, const float* aVector,
-                              const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_multiply_32f_neon(float* cVector,
+                                                 const float* aVector,
+                                                 const float* bVector,
+                                                 unsigned int num_points)
  {
-  const unsigned int quarter_points = num_points / 4;
-  unsigned int number;
-  float32x4_t avec, bvec, cvec;
-  for(number=0; number < quarter_points; ++number) {
-    avec = vld1q_f32(aVector);
-    bvec = vld1q_f32(bVector);
-    cvec = vmulq_f32(avec, bvec);
-    vst1q_f32(cVector, cvec);
-    aVector += 4;
-    bVector += 4;
-    cVector += 4;
-  }
-  for(number=quarter_points*4; number < num_points; ++number) {
-    *cVector++ = *aVector++ * *bVector++;
-  }
+    const unsigned int quarter_points = num_points / 4;
+    unsigned int number;
+    float32x4_t avec, bvec, cvec;
+    for (number = 0; number < quarter_points; ++number) {
+        avec = vld1q_f32(aVector);
+        bvec = vld1q_f32(bVector);
+        cvec = vmulq_f32(avec, bvec);
+        vst1q_f32(cVector, cvec);
+        aVector += 4;
+        bVector += 4;
+        cVector += 4;
+    }
+    for (number = quarter_points * 4; number < num_points; ++number) {
+        *cVector++ = *aVector++ * *bVector++;
+    }
  }
  #endif /* LV_HAVE_NEON */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32f_x2_multiply_32f_a_generic(float* cVector, const float* aVector,
-                                   const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_multiply_32f_a_generic(float* cVector,
+                                                      const float* aVector,
+                                                      const float* bVector,
+                                                      unsigned int num_points)
  {
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr=  bVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *cPtr++ = (*aPtr++) * (*bPtr++);
-  }
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  
  #ifdef LV_HAVE_ORC
-extern void
-volk_32f_x2_multiply_32f_a_orc_impl(float* cVector, const float* aVector,
-                                    const float* bVector, unsigned int num_points);
-
-static inline void
-volk_32f_x2_multiply_32f_u_orc(float* cVector, const float* aVector,
-                               const float* bVector, unsigned int num_points)
+extern void volk_32f_x2_multiply_32f_a_orc_impl(float* cVector,
+                                                const float* aVector,
+                                                const float* bVector,
+                                                unsigned int num_points);
+
+static inline void volk_32f_x2_multiply_32f_u_orc(float* cVector,
+                                                  const float* aVector,
+                                                  const float* bVector,
+                                                  unsigned int num_points)
  {
-  volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+    volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points);
  }
  #endif /* LV_HAVE_ORC */
  
diff --git a/kernels/volk/volk_32f_x2_pow_32f.h b/kernels/volk/volk_32f_x2_pow_32f.h

index daa7f4e12438e94079ad1aa26e66e3c740a1b4a3..106c57bcd26fe616bb90e7387639c8f375d3f8c0 100644 (file)
--- a/kernels/volk/volk_32f_x2_pow_32f.h
+++ b/kernels/volk/volk_32f_x2_pow_32f.h
@@ -31,8 +31,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32f_x2_pow_32f(float* cVector, const float* bVector, const float* aVector, unsigned int num_points)
- * \endcode
+ * void volk_32f_x2_pow_32f(float* cVector, const float* bVector, const float* aVector,
+ * unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li bVector: The input vector of indices (power values).
@@ -71,10 +71,10 @@
  #ifndef INCLUDED_volk_32f_x2_pow_32f_a_H
  #define INCLUDED_volk_32f_x2_pow_32f_a_H
  
-#include <stdio.h>
-#include <stdlib.h>
  #include <inttypes.h>
  #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
  
  #define POW_POLY_DEGREE 3
  
@@ -82,99 +82,130 @@
  #include <immintrin.h>
  
  #define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
-#define POLY1_AVX2_FMA(x, c0, c1) _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
-#define POLY2_AVX2_FMA(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
-#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
-#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
-#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
-
-static inline void
-volk_32f_x2_pow_32f_a_avx2_fma(float* cVector, const float* bVector,
-                             const float* aVector, unsigned int num_points)
+#define POLY1_AVX2_FMA(x, c0, c1) \
+    _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
+#define POLY2_AVX2_FMA(x, c0, c1, c2) \
+    _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
+#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
+    _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
+#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
+    _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
+#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
+    _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
+
+static inline void volk_32f_x2_pow_32f_a_avx2_fma(float* cVector,
+                                                  const float* bVector,
+                                                  const float* aVector,
+                                                  unsigned int num_points)
  {
-  float* cPtr = cVector;
-  const float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-
-  __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
-  __m256 tmp, fx, mask, pow2n, z, y;
-  __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
-  __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
-  __m256i bias, exp, emm0, pi32_0x7f;
-
-  one = _mm256_set1_ps(1.0);
-  exp_hi = _mm256_set1_ps(88.3762626647949);
-  exp_lo = _mm256_set1_ps(-88.3762626647949);
-  ln2 = _mm256_set1_ps(0.6931471805);
-  log2EF = _mm256_set1_ps(1.44269504088896341);
-  half = _mm256_set1_ps(0.5);
-  exp_C1 = _mm256_set1_ps(0.693359375);
-  exp_C2 = _mm256_set1_ps(-2.12194440e-4);
-  pi32_0x7f = _mm256_set1_epi32(0x7f);
-
-  exp_p0 = _mm256_set1_ps(1.9875691500e-4);
-  exp_p1 = _mm256_set1_ps(1.3981999507e-3);
-  exp_p2 = _mm256_set1_ps(8.3334519073e-3);
-  exp_p3 = _mm256_set1_ps(4.1665795894e-2);
-  exp_p4 = _mm256_set1_ps(1.6666665459e-1);
-  exp_p5 = _mm256_set1_ps(5.0000001201e-1);
-
-  for(;number < eighthPoints; number++){
-    // First compute the logarithm
-    aVal = _mm256_load_ps(aPtr);
-    bias = _mm256_set1_epi32(127);
-    leadingOne = _mm256_set1_ps(1.0f);
-    exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
-    logarithm = _mm256_cvtepi32_ps(exp);
-
-    frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
+    float* cPtr = cVector;
+    const float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+    __m256 tmp, fx, mask, pow2n, z, y;
+    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+    __m256i bias, exp, emm0, pi32_0x7f;
+
+    one = _mm256_set1_ps(1.0);
+    exp_hi = _mm256_set1_ps(88.3762626647949);
+    exp_lo = _mm256_set1_ps(-88.3762626647949);
+    ln2 = _mm256_set1_ps(0.6931471805);
+    log2EF = _mm256_set1_ps(1.44269504088896341);
+    half = _mm256_set1_ps(0.5);
+    exp_C1 = _mm256_set1_ps(0.693359375);
+    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
+    pi32_0x7f = _mm256_set1_epi32(0x7f);
+
+    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
+    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
+    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
+    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
+    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
+    exp_p5 = _mm256_set1_ps(5.0000001201e-1);
+
+    for (; number < eighthPoints; number++) {
+        // First compute the logarithm
+        aVal = _mm256_load_ps(aPtr);
+        bias = _mm256_set1_epi32(127);
+        leadingOne = _mm256_set1_ps(1.0f);
+        exp = _mm256_sub_epi32(
+            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
+                                               _mm256_set1_epi32(0x7f800000)),
+                              23),
+            bias);
+        logarithm = _mm256_cvtepi32_ps(exp);
+
+        frac = _mm256_or_ps(
+            leadingOne,
+            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
  
  #if POW_POLY_DEGREE == 6
-    mantissa = POLY5_AVX2_FMA( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f,  3.1821337e-1f, -3.4436006e-2f);
+        mantissa = POLY5_AVX2_FMA(frac,
+                                  3.1157899f,
+                                  -3.3241990f,
+                                  2.5988452f,
+                                  -1.2315303f,
+                                  3.1821337e-1f,
+                                  -3.4436006e-2f);
  #elif POW_POLY_DEGREE == 5
-    mantissa = POLY4_AVX2_FMA( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+        mantissa = POLY4_AVX2_FMA(frac,
+                                  2.8882704548164776201f,
+                                  -2.52074962577807006663f,
+                                  1.48116647521213171641f,
+                                  -0.465725644288844778798f,
+                                  0.0596515482674574969533f);
  #elif POW_POLY_DEGREE == 4
-    mantissa = POLY3_AVX2_FMA( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+        mantissa = POLY3_AVX2_FMA(frac,
+                                  2.61761038894603480148f,
+                                  -1.75647175389045657003f,
+                                  0.688243882994381274313f,
+                                  -0.107254423828329604454f);
  #elif POW_POLY_DEGREE == 3
-    mantissa = POLY2_AVX2_FMA( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+        mantissa = POLY2_AVX2_FMA(frac,
+                                  2.28330284476918490682f,
+                                  -1.04913055217340124191f,
+                                  0.204446009836232697516f);
  #else
  #error
  #endif
  
-    logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
-    logarithm = _mm256_mul_ps(logarithm, ln2);
+        logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
+        logarithm = _mm256_mul_ps(logarithm, ln2);
  
-    // Now calculate b*lna
-    bVal = _mm256_load_ps(bPtr);
-    bVal = _mm256_mul_ps(bVal, logarithm);
+        // Now calculate b*lna
+        bVal = _mm256_load_ps(bPtr);
+        bVal = _mm256_mul_ps(bVal, logarithm);
  
-    // Now compute exp(b*lna)
-    bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
+        // Now compute exp(b*lna)
+        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
  
-    fx = _mm256_fmadd_ps(bVal, log2EF, half);
+        fx = _mm256_fmadd_ps(bVal, log2EF, half);
  
-    emm0 = _mm256_cvttps_epi32(fx);
-    tmp = _mm256_cvtepi32_ps(emm0);
+        emm0 = _mm256_cvttps_epi32(fx);
+        tmp = _mm256_cvtepi32_ps(emm0);
  
-    mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
-    fx = _mm256_sub_ps(tmp, mask);
+        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
+        fx = _mm256_sub_ps(tmp, mask);
  
-    tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
-    bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
-    z = _mm256_mul_ps(bVal, bVal);
+        tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
+        bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
+        z = _mm256_mul_ps(bVal, bVal);
  
-    y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
-    y = _mm256_fmadd_ps(y, bVal, exp_p2);
-    y = _mm256_fmadd_ps(y, bVal, exp_p3);
-    y = _mm256_fmadd_ps(y, bVal, exp_p4);
-    y = _mm256_fmadd_ps(y, bVal, exp_p5);
-    y = _mm256_fmadd_ps(y, z, bVal);
-    y = _mm256_add_ps(y, one);
+        y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
+        y = _mm256_fmadd_ps(y, bVal, exp_p2);
+        y = _mm256_fmadd_ps(y, bVal, exp_p3);
+        y = _mm256_fmadd_ps(y, bVal, exp_p4);
+        y = _mm256_fmadd_ps(y, bVal, exp_p5);
+        y = _mm256_fmadd_ps(y, z, bVal);
+        y = _mm256_add_ps(y, one);
  
-    emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
+        emm0 =
+            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
  
          pow2n = _mm256_castsi256_ps(emm0);
          cVal = _mm256_mul_ps(y, pow2n);
@@ -184,12 +215,12 @@ volk_32f_x2_pow_32f_a_avx2_fma(float* cVector, const float* bVector,
          aPtr += 8;
          bPtr += 8;
          cPtr += 8;
-  }
+    }
  
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *cPtr++ = pow(*aPtr++, *bPtr++);
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *cPtr++ = pow(*aPtr++, *bPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
@@ -198,99 +229,131 @@ volk_32f_x2_pow_32f_a_avx2_fma(float* cVector, const float* bVector,
  #include <immintrin.h>
  
  #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
-#define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
-#define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
-#define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
-#define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
-#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
-
-static inline void
-volk_32f_x2_pow_32f_a_avx2(float* cVector, const float* bVector,
-                             const float* aVector, unsigned int num_points)
+#define POLY1_AVX2(x, c0, c1) \
+    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
+#define POLY2_AVX2(x, c0, c1, c2) \
+    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
+#define POLY3_AVX2(x, c0, c1, c2, c3) \
+    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
+#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
+    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
+#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
+    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
+
+static inline void volk_32f_x2_pow_32f_a_avx2(float* cVector,
+                                              const float* bVector,
+                                              const float* aVector,
+                                              unsigned int num_points)
  {
-  float* cPtr = cVector;
-  const float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-
-  __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
-  __m256 tmp, fx, mask, pow2n, z, y;
-  __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
-  __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
-  __m256i bias, exp, emm0, pi32_0x7f;
-
-  one = _mm256_set1_ps(1.0);
-  exp_hi = _mm256_set1_ps(88.3762626647949);
-  exp_lo = _mm256_set1_ps(-88.3762626647949);
-  ln2 = _mm256_set1_ps(0.6931471805);
-  log2EF = _mm256_set1_ps(1.44269504088896341);
-  half = _mm256_set1_ps(0.5);
-  exp_C1 = _mm256_set1_ps(0.693359375);
-  exp_C2 = _mm256_set1_ps(-2.12194440e-4);
-  pi32_0x7f = _mm256_set1_epi32(0x7f);
-
-  exp_p0 = _mm256_set1_ps(1.9875691500e-4);
-  exp_p1 = _mm256_set1_ps(1.3981999507e-3);
-  exp_p2 = _mm256_set1_ps(8.3334519073e-3);
-  exp_p3 = _mm256_set1_ps(4.1665795894e-2);
-  exp_p4 = _mm256_set1_ps(1.6666665459e-1);
-  exp_p5 = _mm256_set1_ps(5.0000001201e-1);
-
-  for(;number < eighthPoints; number++){
-    // First compute the logarithm
-    aVal = _mm256_load_ps(aPtr);
-    bias = _mm256_set1_epi32(127);
-    leadingOne = _mm256_set1_ps(1.0f);
-    exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
-    logarithm = _mm256_cvtepi32_ps(exp);
-
-    frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
+    float* cPtr = cVector;
+    const float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+    __m256 tmp, fx, mask, pow2n, z, y;
+    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+    __m256i bias, exp, emm0, pi32_0x7f;
+
+    one = _mm256_set1_ps(1.0);
+    exp_hi = _mm256_set1_ps(88.3762626647949);
+    exp_lo = _mm256_set1_ps(-88.3762626647949);
+    ln2 = _mm256_set1_ps(0.6931471805);
+    log2EF = _mm256_set1_ps(1.44269504088896341);
+    half = _mm256_set1_ps(0.5);
+    exp_C1 = _mm256_set1_ps(0.693359375);
+    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
+    pi32_0x7f = _mm256_set1_epi32(0x7f);
+
+    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
+    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
+    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
+    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
+    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
+    exp_p5 = _mm256_set1_ps(5.0000001201e-1);
+
+    for (; number < eighthPoints; number++) {
+        // First compute the logarithm
+        aVal = _mm256_load_ps(aPtr);
+        bias = _mm256_set1_epi32(127);
+        leadingOne = _mm256_set1_ps(1.0f);
+        exp = _mm256_sub_epi32(
+            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
+                                               _mm256_set1_epi32(0x7f800000)),
+                              23),
+            bias);
+        logarithm = _mm256_cvtepi32_ps(exp);
+
+        frac = _mm256_or_ps(
+            leadingOne,
+            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
  
  #if POW_POLY_DEGREE == 6
-    mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f,  3.1821337e-1f, -3.4436006e-2f);
+        mantissa = POLY5_AVX2(frac,
+                              3.1157899f,
+                              -3.3241990f,
+                              2.5988452f,
+                              -1.2315303f,
+                              3.1821337e-1f,
+                              -3.4436006e-2f);
  #elif POW_POLY_DEGREE == 5
-    mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+        mantissa = POLY4_AVX2(frac,
+                              2.8882704548164776201f,
+                              -2.52074962577807006663f,
+                              1.48116647521213171641f,
+                              -0.465725644288844778798f,
+                              0.0596515482674574969533f);
  #elif POW_POLY_DEGREE == 4
-    mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+        mantissa = POLY3_AVX2(frac,
+                              2.61761038894603480148f,
+                              -1.75647175389045657003f,
+                              0.688243882994381274313f,
+                              -0.107254423828329604454f);
  #elif POW_POLY_DEGREE == 3
-    mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+        mantissa = POLY2_AVX2(frac,
+                              2.28330284476918490682f,
+                              -1.04913055217340124191f,
+                              0.204446009836232697516f);
  #else
  #error
  #endif
  
-    logarithm = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
-    logarithm = _mm256_mul_ps(logarithm, ln2);
+        logarithm = _mm256_add_ps(
+            _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
+        logarithm = _mm256_mul_ps(logarithm, ln2);
  
-    // Now calculate b*lna
-    bVal = _mm256_load_ps(bPtr);
-    bVal = _mm256_mul_ps(bVal, logarithm);
+        // Now calculate b*lna
+        bVal = _mm256_load_ps(bPtr);
+        bVal = _mm256_mul_ps(bVal, logarithm);
  
-    // Now compute exp(b*lna)
-    bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
+        // Now compute exp(b*lna)
+        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
  
-    fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
+        fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
  
-    emm0 = _mm256_cvttps_epi32(fx);
-    tmp = _mm256_cvtepi32_ps(emm0);
+        emm0 = _mm256_cvttps_epi32(fx);
+        tmp = _mm256_cvtepi32_ps(emm0);
  
-    mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
-    fx = _mm256_sub_ps(tmp, mask);
+        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
+        fx = _mm256_sub_ps(tmp, mask);
  
-    tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
-    bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
-    z = _mm256_mul_ps(bVal, bVal);
+        tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
+        bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
+        z = _mm256_mul_ps(bVal, bVal);
  
-    y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
-    y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
-    y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
-    y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
-    y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
-    y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
-    y = _mm256_add_ps(y, one);
+        y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
+        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
+        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
+        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
+        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
+        y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
+        y = _mm256_add_ps(y, one);
  
-    emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
+        emm0 =
+            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
  
          pow2n = _mm256_castsi256_ps(emm0);
          cVal = _mm256_mul_ps(y, pow2n);
@@ -300,12 +363,12 @@ volk_32f_x2_pow_32f_a_avx2(float* cVector, const float* bVector,
          aPtr += 8;
          bPtr += 8;
          cPtr += 8;
-  }
+    }
  
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *cPtr++ = pow(*aPtr++, *bPtr++);
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *cPtr++ = pow(*aPtr++, *bPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX2 for aligned */
@@ -317,97 +380,124 @@ volk_32f_x2_pow_32f_a_avx2(float* cVector, const float* bVector,
  #define POLY0(x, c0) _mm_set1_ps(c0)
  #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
  #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
-#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
-#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
-#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
-
-static inline void
-volk_32f_x2_pow_32f_a_sse4_1(float* cVector, const float* bVector,
-                             const float* aVector, unsigned int num_points)
+#define POLY3(x, c0, c1, c2, c3) \
+    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
+#define POLY4(x, c0, c1, c2, c3, c4) \
+    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) \
+    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
+
+static inline void volk_32f_x2_pow_32f_a_sse4_1(float* cVector,
+                                                const float* bVector,
+                                                const float* aVector,
+                                                unsigned int num_points)
  {
-  float* cPtr = cVector;
-  const float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
-  __m128 tmp, fx, mask, pow2n, z, y;
-  __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
-  __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
-  __m128i bias, exp, emm0, pi32_0x7f;
-
-  one = _mm_set1_ps(1.0);
-  exp_hi = _mm_set1_ps(88.3762626647949);
-  exp_lo = _mm_set1_ps(-88.3762626647949);
-  ln2 = _mm_set1_ps(0.6931471805);
-  log2EF = _mm_set1_ps(1.44269504088896341);
-  half = _mm_set1_ps(0.5);
-  exp_C1 = _mm_set1_ps(0.693359375);
-  exp_C2 = _mm_set1_ps(-2.12194440e-4);
-  pi32_0x7f = _mm_set1_epi32(0x7f);
-
-  exp_p0 = _mm_set1_ps(1.9875691500e-4);
-  exp_p1 = _mm_set1_ps(1.3981999507e-3);
-  exp_p2 = _mm_set1_ps(8.3334519073e-3);
-  exp_p3 = _mm_set1_ps(4.1665795894e-2);
-  exp_p4 = _mm_set1_ps(1.6666665459e-1);
-  exp_p5 = _mm_set1_ps(5.0000001201e-1);
-
-  for(;number < quarterPoints; number++){
-    // First compute the logarithm
-    aVal = _mm_load_ps(aPtr);
-    bias = _mm_set1_epi32(127);
-    leadingOne = _mm_set1_ps(1.0f);
-    exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
-    logarithm = _mm_cvtepi32_ps(exp);
-
-    frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
+    float* cPtr = cVector;
+    const float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+    __m128 tmp, fx, mask, pow2n, z, y;
+    __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+    __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+    __m128i bias, exp, emm0, pi32_0x7f;
+
+    one = _mm_set1_ps(1.0);
+    exp_hi = _mm_set1_ps(88.3762626647949);
+    exp_lo = _mm_set1_ps(-88.3762626647949);
+    ln2 = _mm_set1_ps(0.6931471805);
+    log2EF = _mm_set1_ps(1.44269504088896341);
+    half = _mm_set1_ps(0.5);
+    exp_C1 = _mm_set1_ps(0.693359375);
+    exp_C2 = _mm_set1_ps(-2.12194440e-4);
+    pi32_0x7f = _mm_set1_epi32(0x7f);
+
+    exp_p0 = _mm_set1_ps(1.9875691500e-4);
+    exp_p1 = _mm_set1_ps(1.3981999507e-3);
+    exp_p2 = _mm_set1_ps(8.3334519073e-3);
+    exp_p3 = _mm_set1_ps(4.1665795894e-2);
+    exp_p4 = _mm_set1_ps(1.6666665459e-1);
+    exp_p5 = _mm_set1_ps(5.0000001201e-1);
+
+    for (; number < quarterPoints; number++) {
+        // First compute the logarithm
+        aVal = _mm_load_ps(aPtr);
+        bias = _mm_set1_epi32(127);
+        leadingOne = _mm_set1_ps(1.0f);
+        exp = _mm_sub_epi32(
+            _mm_srli_epi32(
+                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
+            bias);
+        logarithm = _mm_cvtepi32_ps(exp);
+
+        frac = _mm_or_ps(leadingOne,
+                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
  
  #if POW_POLY_DEGREE == 6
-    mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f,  3.1821337e-1f, -3.4436006e-2f);
+        mantissa = POLY5(frac,
+                         3.1157899f,
+                         -3.3241990f,
+                         2.5988452f,
+                         -1.2315303f,
+                         3.1821337e-1f,
+                         -3.4436006e-2f);
  #elif POW_POLY_DEGREE == 5
-    mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+        mantissa = POLY4(frac,
+                         2.8882704548164776201f,
+                         -2.52074962577807006663f,
+                         1.48116647521213171641f,
+                         -0.465725644288844778798f,
+                         0.0596515482674574969533f);
  #elif POW_POLY_DEGREE == 4
-    mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+        mantissa = POLY3(frac,
+                         2.61761038894603480148f,
+                         -1.75647175389045657003f,
+                         0.688243882994381274313f,
+                         -0.107254423828329604454f);
  #elif POW_POLY_DEGREE == 3
-    mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+        mantissa = POLY2(frac,
+                         2.28330284476918490682f,
+                         -1.04913055217340124191f,
+                         0.204446009836232697516f);
  #else
  #error
  #endif
  
-    logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
-    logarithm = _mm_mul_ps(logarithm, ln2);
+        logarithm =
+            _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
+        logarithm = _mm_mul_ps(logarithm, ln2);
  
  
-    // Now calculate b*lna
-    bVal = _mm_load_ps(bPtr);
-    bVal = _mm_mul_ps(bVal, logarithm);
+        // Now calculate b*lna
+        bVal = _mm_load_ps(bPtr);
+        bVal = _mm_mul_ps(bVal, logarithm);
  
-    // Now compute exp(b*lna)
-    bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
+        // Now compute exp(b*lna)
+        bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
  
-    fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
+        fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
  
-    emm0 = _mm_cvttps_epi32(fx);
-    tmp = _mm_cvtepi32_ps(emm0);
+        emm0 = _mm_cvttps_epi32(fx);
+        tmp = _mm_cvtepi32_ps(emm0);
  
-    mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
-    fx = _mm_sub_ps(tmp, mask);
+        mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
+        fx = _mm_sub_ps(tmp, mask);
  
-    tmp = _mm_mul_ps(fx, exp_C1);
-    z = _mm_mul_ps(fx, exp_C2);
-    bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
-    z = _mm_mul_ps(bVal, bVal);
+        tmp = _mm_mul_ps(fx, exp_C1);
+        z = _mm_mul_ps(fx, exp_C2);
+        bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
+        z = _mm_mul_ps(bVal, bVal);
  
-    y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
-    y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
-    y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
-    y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
-    y = _mm_add_ps(y, one);
+        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
+        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
+        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
+        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
+        y = _mm_add_ps(y, one);
  
-    emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
+        emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
  
          pow2n = _mm_castsi128_ps(emm0);
          cVal = _mm_mul_ps(y, pow2n);
@@ -417,12 +507,12 @@ volk_32f_x2_pow_32f_a_sse4_1(float* cVector, const float* bVector,
          aPtr += 4;
          bPtr += 4;
          cPtr += 4;
-  }
+    }
  
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    *cPtr++ = powf(*aPtr++, *bPtr++);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *cPtr++ = powf(*aPtr++, *bPtr++);
+    }
  }
  
  #endif /* LV_HAVE_SSE4_1 for aligned */
@@ -432,27 +522,28 @@ volk_32f_x2_pow_32f_a_sse4_1(float* cVector, const float* bVector,
  #ifndef INCLUDED_volk_32f_x2_pow_32f_u_H
  #define INCLUDED_volk_32f_x2_pow_32f_u_H
  
-#include <stdio.h>
-#include <stdlib.h>
  #include <inttypes.h>
  #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
  
  #define POW_POLY_DEGREE 3
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32f_x2_pow_32f_generic(float* cVector, const float* bVector,
-                            const float* aVector, unsigned int num_points)
+static inline void volk_32f_x2_pow_32f_generic(float* cVector,
+                                               const float* bVector,
+                                               const float* aVector,
+                                               unsigned int num_points)
  {
-  float* cPtr = cVector;
-  const float* bPtr = bVector;
-  const float* aPtr = aVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *cPtr++ = powf(*aPtr++, *bPtr++);
-  }
+    float* cPtr = cVector;
+    const float* bPtr = bVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        *cPtr++ = powf(*aPtr++, *bPtr++);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -463,112 +554,139 @@ volk_32f_x2_pow_32f_generic(float* cVector, const float* bVector,
  #define POLY0(x, c0) _mm_set1_ps(c0)
  #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
  #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
-#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
-#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
-#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
-
-static inline void
-volk_32f_x2_pow_32f_u_sse4_1(float* cVector, const float* bVector,
-                             const float* aVector, unsigned int num_points)
+#define POLY3(x, c0, c1, c2, c3) \
+    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
+#define POLY4(x, c0, c1, c2, c3, c4) \
+    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) \
+    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
+
+static inline void volk_32f_x2_pow_32f_u_sse4_1(float* cVector,
+                                                const float* bVector,
+                                                const float* aVector,
+                                                unsigned int num_points)
  {
-  float* cPtr = cVector;
-  const float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
-  __m128 tmp, fx, mask, pow2n, z, y;
-  __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
-  __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
-  __m128i bias, exp, emm0, pi32_0x7f;
-
-  one = _mm_set1_ps(1.0);
-  exp_hi = _mm_set1_ps(88.3762626647949);
-  exp_lo = _mm_set1_ps(-88.3762626647949);
-  ln2 = _mm_set1_ps(0.6931471805);
-  log2EF = _mm_set1_ps(1.44269504088896341);
-  half = _mm_set1_ps(0.5);
-  exp_C1 = _mm_set1_ps(0.693359375);
-  exp_C2 = _mm_set1_ps(-2.12194440e-4);
-  pi32_0x7f = _mm_set1_epi32(0x7f);
-
-  exp_p0 = _mm_set1_ps(1.9875691500e-4);
-  exp_p1 = _mm_set1_ps(1.3981999507e-3);
-  exp_p2 = _mm_set1_ps(8.3334519073e-3);
-  exp_p3 = _mm_set1_ps(4.1665795894e-2);
-  exp_p4 = _mm_set1_ps(1.6666665459e-1);
-  exp_p5 = _mm_set1_ps(5.0000001201e-1);
-
-  for(;number < quarterPoints; number++){
-    // First compute the logarithm
-    aVal = _mm_loadu_ps(aPtr);
-    bias = _mm_set1_epi32(127);
-    leadingOne = _mm_set1_ps(1.0f);
-    exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
-    logarithm = _mm_cvtepi32_ps(exp);
-
-    frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
+    float* cPtr = cVector;
+    const float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+    __m128 tmp, fx, mask, pow2n, z, y;
+    __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+    __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+    __m128i bias, exp, emm0, pi32_0x7f;
+
+    one = _mm_set1_ps(1.0);
+    exp_hi = _mm_set1_ps(88.3762626647949);
+    exp_lo = _mm_set1_ps(-88.3762626647949);
+    ln2 = _mm_set1_ps(0.6931471805);
+    log2EF = _mm_set1_ps(1.44269504088896341);
+    half = _mm_set1_ps(0.5);
+    exp_C1 = _mm_set1_ps(0.693359375);
+    exp_C2 = _mm_set1_ps(-2.12194440e-4);
+    pi32_0x7f = _mm_set1_epi32(0x7f);
+
+    exp_p0 = _mm_set1_ps(1.9875691500e-4);
+    exp_p1 = _mm_set1_ps(1.3981999507e-3);
+    exp_p2 = _mm_set1_ps(8.3334519073e-3);
+    exp_p3 = _mm_set1_ps(4.1665795894e-2);
+    exp_p4 = _mm_set1_ps(1.6666665459e-1);
+    exp_p5 = _mm_set1_ps(5.0000001201e-1);
+
+    for (; number < quarterPoints; number++) {
+        // First compute the logarithm
+        aVal = _mm_loadu_ps(aPtr);
+        bias = _mm_set1_epi32(127);
+        leadingOne = _mm_set1_ps(1.0f);
+        exp = _mm_sub_epi32(
+            _mm_srli_epi32(
+                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
+            bias);
+        logarithm = _mm_cvtepi32_ps(exp);
+
+        frac = _mm_or_ps(leadingOne,
+                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
  
  #if POW_POLY_DEGREE == 6
-    mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f,  3.1821337e-1f, -3.4436006e-2f);
+        mantissa = POLY5(frac,
+                         3.1157899f,
+                         -3.3241990f,
+                         2.5988452f,
+                         -1.2315303f,
+                         3.1821337e-1f,
+                         -3.4436006e-2f);
  #elif POW_POLY_DEGREE == 5
-    mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+        mantissa = POLY4(frac,
+                         2.8882704548164776201f,
+                         -2.52074962577807006663f,
+                         1.48116647521213171641f,
+                         -0.465725644288844778798f,
+                         0.0596515482674574969533f);
  #elif POW_POLY_DEGREE == 4
-    mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+        mantissa = POLY3(frac,
+                         2.61761038894603480148f,
+                         -1.75647175389045657003f,
+                         0.688243882994381274313f,
+                         -0.107254423828329604454f);
  #elif POW_POLY_DEGREE == 3
-    mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+        mantissa = POLY2(frac,
+                         2.28330284476918490682f,
+                         -1.04913055217340124191f,
+                         0.204446009836232697516f);
  #else
  #error
  #endif
  
-    logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
-    logarithm = _mm_mul_ps(logarithm, ln2);
+        logarithm =
+            _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
+        logarithm = _mm_mul_ps(logarithm, ln2);
  
  
-    // Now calculate b*lna
-    bVal = _mm_loadu_ps(bPtr);
-    bVal = _mm_mul_ps(bVal, logarithm);
+        // Now calculate b*lna
+        bVal = _mm_loadu_ps(bPtr);
+        bVal = _mm_mul_ps(bVal, logarithm);
  
-    // Now compute exp(b*lna)
-    bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
+        // Now compute exp(b*lna)
+        bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
  
-    fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
+        fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
  
-    emm0 = _mm_cvttps_epi32(fx);
-    tmp = _mm_cvtepi32_ps(emm0);
+        emm0 = _mm_cvttps_epi32(fx);
+        tmp = _mm_cvtepi32_ps(emm0);
  
-    mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
-    fx = _mm_sub_ps(tmp, mask);
+        mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
+        fx = _mm_sub_ps(tmp, mask);
  
-    tmp = _mm_mul_ps(fx, exp_C1);
-    z = _mm_mul_ps(fx, exp_C2);
-    bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
-    z = _mm_mul_ps(bVal, bVal);
+        tmp = _mm_mul_ps(fx, exp_C1);
+        z = _mm_mul_ps(fx, exp_C2);
+        bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
+        z = _mm_mul_ps(bVal, bVal);
  
-    y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
-    y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
-    y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
-    y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
-    y = _mm_add_ps(y, one);
+        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
+        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
+        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
+        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
+        y = _mm_add_ps(y, one);
  
-    emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
+        emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
  
-    pow2n = _mm_castsi128_ps(emm0);
-    cVal = _mm_mul_ps(y, pow2n);
+        pow2n = _mm_castsi128_ps(emm0);
+        cVal = _mm_mul_ps(y, pow2n);
  
-    _mm_storeu_ps(cPtr, cVal);
+        _mm_storeu_ps(cPtr, cVal);
  
-    aPtr += 4;
-    bPtr += 4;
-    cPtr += 4;
-  }
+        aPtr += 4;
+        bPtr += 4;
+        cPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    *cPtr++ = powf(*aPtr++, *bPtr++);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *cPtr++ = powf(*aPtr++, *bPtr++);
+    }
  }
  
  #endif /* LV_HAVE_SSE4_1 for unaligned */
@@ -577,100 +695,131 @@ volk_32f_x2_pow_32f_u_sse4_1(float* cVector, const float* bVector,
  #include <immintrin.h>
  
  #define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
-#define POLY1_AVX2_FMA(x, c0, c1) _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
-#define POLY2_AVX2_FMA(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
-#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
-#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
-#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
-
-static inline void
-volk_32f_x2_pow_32f_u_avx2_fma(float* cVector, const float* bVector,
-                             const float* aVector, unsigned int num_points)
+#define POLY1_AVX2_FMA(x, c0, c1) \
+    _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
+#define POLY2_AVX2_FMA(x, c0, c1, c2) \
+    _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
+#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
+    _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
+#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
+    _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
+#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
+    _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
+
+static inline void volk_32f_x2_pow_32f_u_avx2_fma(float* cVector,
+                                                  const float* bVector,
+                                                  const float* aVector,
+                                                  unsigned int num_points)
  {
-  float* cPtr = cVector;
-  const float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-
-  __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
-  __m256 tmp, fx, mask, pow2n, z, y;
-  __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
-  __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
-  __m256i bias, exp, emm0, pi32_0x7f;
-
-  one = _mm256_set1_ps(1.0);
-  exp_hi = _mm256_set1_ps(88.3762626647949);
-  exp_lo = _mm256_set1_ps(-88.3762626647949);
-  ln2 = _mm256_set1_ps(0.6931471805);
-  log2EF = _mm256_set1_ps(1.44269504088896341);
-  half = _mm256_set1_ps(0.5);
-  exp_C1 = _mm256_set1_ps(0.693359375);
-  exp_C2 = _mm256_set1_ps(-2.12194440e-4);
-  pi32_0x7f = _mm256_set1_epi32(0x7f);
-
-  exp_p0 = _mm256_set1_ps(1.9875691500e-4);
-  exp_p1 = _mm256_set1_ps(1.3981999507e-3);
-  exp_p2 = _mm256_set1_ps(8.3334519073e-3);
-  exp_p3 = _mm256_set1_ps(4.1665795894e-2);
-  exp_p4 = _mm256_set1_ps(1.6666665459e-1);
-  exp_p5 = _mm256_set1_ps(5.0000001201e-1);
-
-  for(;number < eighthPoints; number++){
-    // First compute the logarithm
-    aVal = _mm256_loadu_ps(aPtr);
-    bias = _mm256_set1_epi32(127);
-    leadingOne = _mm256_set1_ps(1.0f);
-    exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
-    logarithm = _mm256_cvtepi32_ps(exp);
-
-    frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
+    float* cPtr = cVector;
+    const float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+    __m256 tmp, fx, mask, pow2n, z, y;
+    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+    __m256i bias, exp, emm0, pi32_0x7f;
+
+    one = _mm256_set1_ps(1.0);
+    exp_hi = _mm256_set1_ps(88.3762626647949);
+    exp_lo = _mm256_set1_ps(-88.3762626647949);
+    ln2 = _mm256_set1_ps(0.6931471805);
+    log2EF = _mm256_set1_ps(1.44269504088896341);
+    half = _mm256_set1_ps(0.5);
+    exp_C1 = _mm256_set1_ps(0.693359375);
+    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
+    pi32_0x7f = _mm256_set1_epi32(0x7f);
+
+    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
+    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
+    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
+    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
+    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
+    exp_p5 = _mm256_set1_ps(5.0000001201e-1);
+
+    for (; number < eighthPoints; number++) {
+        // First compute the logarithm
+        aVal = _mm256_loadu_ps(aPtr);
+        bias = _mm256_set1_epi32(127);
+        leadingOne = _mm256_set1_ps(1.0f);
+        exp = _mm256_sub_epi32(
+            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
+                                               _mm256_set1_epi32(0x7f800000)),
+                              23),
+            bias);
+        logarithm = _mm256_cvtepi32_ps(exp);
+
+        frac = _mm256_or_ps(
+            leadingOne,
+            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
  
  #if POW_POLY_DEGREE == 6
-    mantissa = POLY5_AVX2_FMA( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f,  3.1821337e-1f, -3.4436006e-2f);
+        mantissa = POLY5_AVX2_FMA(frac,
+                                  3.1157899f,
+                                  -3.3241990f,
+                                  2.5988452f,
+                                  -1.2315303f,
+                                  3.1821337e-1f,
+                                  -3.4436006e-2f);
  #elif POW_POLY_DEGREE == 5
-    mantissa = POLY4_AVX2_FMA( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+        mantissa = POLY4_AVX2_FMA(frac,
+                                  2.8882704548164776201f,
+                                  -2.52074962577807006663f,
+                                  1.48116647521213171641f,
+                                  -0.465725644288844778798f,
+                                  0.0596515482674574969533f);
  #elif POW_POLY_DEGREE == 4
-    mantissa = POLY3_AVX2_FMA( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+        mantissa = POLY3_AVX2_FMA(frac,
+                                  2.61761038894603480148f,
+                                  -1.75647175389045657003f,
+                                  0.688243882994381274313f,
+                                  -0.107254423828329604454f);
  #elif POW_POLY_DEGREE == 3
-    mantissa = POLY2_AVX2_FMA( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+        mantissa = POLY2_AVX2_FMA(frac,
+                                  2.28330284476918490682f,
+                                  -1.04913055217340124191f,
+                                  0.204446009836232697516f);
  #else
  #error
  #endif
  
-    logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
-    logarithm = _mm256_mul_ps(logarithm, ln2);
+        logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
+        logarithm = _mm256_mul_ps(logarithm, ln2);
  
  
-    // Now calculate b*lna
-    bVal = _mm256_loadu_ps(bPtr);
-    bVal = _mm256_mul_ps(bVal, logarithm);
+        // Now calculate b*lna
+        bVal = _mm256_loadu_ps(bPtr);
+        bVal = _mm256_mul_ps(bVal, logarithm);
  
-    // Now compute exp(b*lna)
-    bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
+        // Now compute exp(b*lna)
+        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
  
-    fx = _mm256_fmadd_ps(bVal, log2EF, half);
+        fx = _mm256_fmadd_ps(bVal, log2EF, half);
  
-    emm0 = _mm256_cvttps_epi32(fx);
-    tmp = _mm256_cvtepi32_ps(emm0);
+        emm0 = _mm256_cvttps_epi32(fx);
+        tmp = _mm256_cvtepi32_ps(emm0);
  
-    mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
-    fx = _mm256_sub_ps(tmp, mask);
+        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
+        fx = _mm256_sub_ps(tmp, mask);
  
-    tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
-    bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
-    z = _mm256_mul_ps(bVal, bVal);
+        tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
+        bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
+        z = _mm256_mul_ps(bVal, bVal);
  
-    y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
-    y = _mm256_fmadd_ps(y, bVal, exp_p2);
-    y = _mm256_fmadd_ps(y, bVal, exp_p3);
-    y = _mm256_fmadd_ps(y, bVal, exp_p4);
-    y = _mm256_fmadd_ps(y, bVal, exp_p5);
-    y = _mm256_fmadd_ps(y, z, bVal);
-    y = _mm256_add_ps(y, one);
+        y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
+        y = _mm256_fmadd_ps(y, bVal, exp_p2);
+        y = _mm256_fmadd_ps(y, bVal, exp_p3);
+        y = _mm256_fmadd_ps(y, bVal, exp_p4);
+        y = _mm256_fmadd_ps(y, bVal, exp_p5);
+        y = _mm256_fmadd_ps(y, z, bVal);
+        y = _mm256_add_ps(y, one);
  
-    emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
+        emm0 =
+            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
  
          pow2n = _mm256_castsi256_ps(emm0);
          cVal = _mm256_mul_ps(y, pow2n);
@@ -680,12 +829,12 @@ volk_32f_x2_pow_32f_u_avx2_fma(float* cVector, const float* bVector,
          aPtr += 8;
          bPtr += 8;
          cPtr += 8;
-  }
+    }
  
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *cPtr++ = pow(*aPtr++, *bPtr++);
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *cPtr++ = pow(*aPtr++, *bPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
@@ -694,99 +843,131 @@ volk_32f_x2_pow_32f_u_avx2_fma(float* cVector, const float* bVector,
  #include <immintrin.h>
  
  #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
-#define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
-#define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
-#define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
-#define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
-#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
-
-static inline void
-volk_32f_x2_pow_32f_u_avx2(float* cVector, const float* bVector,
-                             const float* aVector, unsigned int num_points)
+#define POLY1_AVX2(x, c0, c1) \
+    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
+#define POLY2_AVX2(x, c0, c1, c2) \
+    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
+#define POLY3_AVX2(x, c0, c1, c2, c3) \
+    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
+#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
+    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
+#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
+    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
+
+static inline void volk_32f_x2_pow_32f_u_avx2(float* cVector,
+                                              const float* bVector,
+                                              const float* aVector,
+                                              unsigned int num_points)
  {
-  float* cPtr = cVector;
-  const float* bPtr = bVector;
-  const float* aPtr = aVector;
-
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-
-  __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
-  __m256 tmp, fx, mask, pow2n, z, y;
-  __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
-  __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
-  __m256i bias, exp, emm0, pi32_0x7f;
-
-  one = _mm256_set1_ps(1.0);
-  exp_hi = _mm256_set1_ps(88.3762626647949);
-  exp_lo = _mm256_set1_ps(-88.3762626647949);
-  ln2 = _mm256_set1_ps(0.6931471805);
-  log2EF = _mm256_set1_ps(1.44269504088896341);
-  half = _mm256_set1_ps(0.5);
-  exp_C1 = _mm256_set1_ps(0.693359375);
-  exp_C2 = _mm256_set1_ps(-2.12194440e-4);
-  pi32_0x7f = _mm256_set1_epi32(0x7f);
-
-  exp_p0 = _mm256_set1_ps(1.9875691500e-4);
-  exp_p1 = _mm256_set1_ps(1.3981999507e-3);
-  exp_p2 = _mm256_set1_ps(8.3334519073e-3);
-  exp_p3 = _mm256_set1_ps(4.1665795894e-2);
-  exp_p4 = _mm256_set1_ps(1.6666665459e-1);
-  exp_p5 = _mm256_set1_ps(5.0000001201e-1);
-
-  for(;number < eighthPoints; number++){
-    // First compute the logarithm
-    aVal = _mm256_loadu_ps(aPtr);
-    bias = _mm256_set1_epi32(127);
-    leadingOne = _mm256_set1_ps(1.0f);
-    exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
-    logarithm = _mm256_cvtepi32_ps(exp);
-
-    frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
+    float* cPtr = cVector;
+    const float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+    __m256 tmp, fx, mask, pow2n, z, y;
+    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+    __m256i bias, exp, emm0, pi32_0x7f;
+
+    one = _mm256_set1_ps(1.0);
+    exp_hi = _mm256_set1_ps(88.3762626647949);
+    exp_lo = _mm256_set1_ps(-88.3762626647949);
+    ln2 = _mm256_set1_ps(0.6931471805);
+    log2EF = _mm256_set1_ps(1.44269504088896341);
+    half = _mm256_set1_ps(0.5);
+    exp_C1 = _mm256_set1_ps(0.693359375);
+    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
+    pi32_0x7f = _mm256_set1_epi32(0x7f);
+
+    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
+    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
+    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
+    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
+    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
+    exp_p5 = _mm256_set1_ps(5.0000001201e-1);
+
+    for (; number < eighthPoints; number++) {
+        // First compute the logarithm
+        aVal = _mm256_loadu_ps(aPtr);
+        bias = _mm256_set1_epi32(127);
+        leadingOne = _mm256_set1_ps(1.0f);
+        exp = _mm256_sub_epi32(
+            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
+                                               _mm256_set1_epi32(0x7f800000)),
+                              23),
+            bias);
+        logarithm = _mm256_cvtepi32_ps(exp);
+
+        frac = _mm256_or_ps(
+            leadingOne,
+            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
  
  #if POW_POLY_DEGREE == 6
-    mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f,  3.1821337e-1f, -3.4436006e-2f);
+        mantissa = POLY5_AVX2(frac,
+                              3.1157899f,
+                              -3.3241990f,
+                              2.5988452f,
+                              -1.2315303f,
+                              3.1821337e-1f,
+                              -3.4436006e-2f);
  #elif POW_POLY_DEGREE == 5
-    mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+        mantissa = POLY4_AVX2(frac,
+                              2.8882704548164776201f,
+                              -2.52074962577807006663f,
+                              1.48116647521213171641f,
+                              -0.465725644288844778798f,
+                              0.0596515482674574969533f);
  #elif POW_POLY_DEGREE == 4
-    mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+        mantissa = POLY3_AVX2(frac,
+                              2.61761038894603480148f,
+                              -1.75647175389045657003f,
+                              0.688243882994381274313f,
+                              -0.107254423828329604454f);
  #elif POW_POLY_DEGREE == 3
-    mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+        mantissa = POLY2_AVX2(frac,
+                              2.28330284476918490682f,
+                              -1.04913055217340124191f,
+                              0.204446009836232697516f);
  #else
  #error
  #endif
  
-    logarithm = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
-    logarithm = _mm256_mul_ps(logarithm, ln2);
+        logarithm = _mm256_add_ps(
+            _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
+        logarithm = _mm256_mul_ps(logarithm, ln2);
  
-    // Now calculate b*lna
-    bVal = _mm256_loadu_ps(bPtr);
-    bVal = _mm256_mul_ps(bVal, logarithm);
+        // Now calculate b*lna
+        bVal = _mm256_loadu_ps(bPtr);
+        bVal = _mm256_mul_ps(bVal, logarithm);
  
-    // Now compute exp(b*lna)
-    bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
+        // Now compute exp(b*lna)
+        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
  
-    fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
+        fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
  
-    emm0 = _mm256_cvttps_epi32(fx);
-    tmp = _mm256_cvtepi32_ps(emm0);
+        emm0 = _mm256_cvttps_epi32(fx);
+        tmp = _mm256_cvtepi32_ps(emm0);
  
-    mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
-    fx = _mm256_sub_ps(tmp, mask);
+        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
+        fx = _mm256_sub_ps(tmp, mask);
  
-    tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
-    bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
-    z = _mm256_mul_ps(bVal, bVal);
+        tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
+        bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
+        z = _mm256_mul_ps(bVal, bVal);
  
-    y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
-    y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
-    y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
-    y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
-    y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
-    y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
-    y = _mm256_add_ps(y, one);
+        y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
+        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
+        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
+        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
+        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
+        y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
+        y = _mm256_add_ps(y, one);
  
-    emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
+        emm0 =
+            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
  
          pow2n = _mm256_castsi256_ps(emm0);
          cVal = _mm256_mul_ps(y, pow2n);
@@ -796,12 +977,12 @@ volk_32f_x2_pow_32f_u_avx2(float* cVector, const float* bVector,
          aPtr += 8;
          bPtr += 8;
          cPtr += 8;
-  }
+    }
  
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *cPtr++ = pow(*aPtr++, *bPtr++);
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *cPtr++ = pow(*aPtr++, *bPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX2 for unaligned */
diff --git a/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h b/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h

index 8021fafd7b549d0eacb5719edb0851ac43ea9710..04e58924ab8fd541f8446bee9fa9f8d10666ed2b 100644 (file)
--- a/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h
+++ b/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h
@@ -32,8 +32,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32f_x2_s32f_interleave_16ic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points)
- * \endcode
+ * void volk_32f_x2_s32f_interleave_16ic(lv_16sc_t* complexVector, const float* iBuffer,
+ * const float* qBuffer, const float scalar, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li iBuffer: Input vector of samples for the real part.
@@ -75,60 +75,62 @@
  #ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H
  #define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H
  
-#include <volk/volk_common.h>
  #include <inttypes.h>
  #include <stdio.h>
+#include <volk/volk_common.h>
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_32f_x2_s32f_interleave_16ic_a_avx2(lv_16sc_t* complexVector, const float* iBuffer,
-                                        const float* qBuffer, const float scalar, unsigned int num_points)
+static inline void volk_32f_x2_s32f_interleave_16ic_a_avx2(lv_16sc_t* complexVector,
+                                                           const float* iBuffer,
+                                                           const float* qBuffer,
+                                                           const float scalar,
+                                                           unsigned int num_points)
  {
-  unsigned int number = 0;
-  const float* iBufferPtr = iBuffer;
-  const float* qBufferPtr = qBuffer;
+    unsigned int number = 0;
+    const float* iBufferPtr = iBuffer;
+    const float* qBufferPtr = qBuffer;
  
-  __m256 vScalar = _mm256_set1_ps(scalar);
+    __m256 vScalar = _mm256_set1_ps(scalar);
  
-  const unsigned int eighthPoints = num_points / 8;
+    const unsigned int eighthPoints = num_points / 8;
  
-  __m256 iValue, qValue, cplxValue1, cplxValue2;
-  __m256i intValue1, intValue2;
+    __m256 iValue, qValue, cplxValue1, cplxValue2;
+    __m256i intValue1, intValue2;
  
-  int16_t* complexVectorPtr = (int16_t*)complexVector;
+    int16_t* complexVectorPtr = (int16_t*)complexVector;
  
-  for(;number < eighthPoints; number++){
-    iValue = _mm256_load_ps(iBufferPtr);
-    qValue = _mm256_load_ps(qBufferPtr);
+    for (; number < eighthPoints; number++) {
+        iValue = _mm256_load_ps(iBufferPtr);
+        qValue = _mm256_load_ps(qBufferPtr);
  
-    // Interleaves the lower two values in the i and q variables into one buffer
-    cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
-    cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
+        // Interleaves the lower two values in the i and q variables into one buffer
+        cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
+        cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
  
-    // Interleaves the upper two values in the i and q variables into one buffer
-    cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
-    cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
+        // Interleaves the upper two values in the i and q variables into one buffer
+        cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
+        cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
  
-    intValue1 = _mm256_cvtps_epi32(cplxValue1);
-    intValue2 = _mm256_cvtps_epi32(cplxValue2);
+        intValue1 = _mm256_cvtps_epi32(cplxValue1);
+        intValue2 = _mm256_cvtps_epi32(cplxValue2);
  
-    intValue1 = _mm256_packs_epi32(intValue1, intValue2);
+        intValue1 = _mm256_packs_epi32(intValue1, intValue2);
  
-    _mm256_store_si256((__m256i*)complexVectorPtr, intValue1);
-    complexVectorPtr += 16;
+        _mm256_store_si256((__m256i*)complexVectorPtr, intValue1);
+        complexVectorPtr += 16;
  
-    iBufferPtr += 8;
-    qBufferPtr += 8;
-  }
+        iBufferPtr += 8;
+        qBufferPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  complexVectorPtr = (int16_t*)(&complexVector[number]);
-  for(; number < num_points; number++){
-    *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
-    *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
-  }
+    number = eighthPoints * 8;
+    complexVectorPtr = (int16_t*)(&complexVector[number]);
+    for (; number < num_points; number++) {
+        *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
+        *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
@@ -136,53 +138,55 @@ volk_32f_x2_s32f_interleave_16ic_a_avx2(lv_16sc_t* complexVector, const float* i
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void
-volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVector, const float* iBuffer,
-                                        const float* qBuffer, const float scalar, unsigned int num_points)
+static inline void volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVector,
+                                                           const float* iBuffer,
+                                                           const float* qBuffer,
+                                                           const float scalar,
+                                                           unsigned int num_points)
  {
-  unsigned int number = 0;
-  const float* iBufferPtr = iBuffer;
-  const float* qBufferPtr = qBuffer;
+    unsigned int number = 0;
+    const float* iBufferPtr = iBuffer;
+    const float* qBufferPtr = qBuffer;
  
-  __m128 vScalar = _mm_set_ps1(scalar);
+    __m128 vScalar = _mm_set_ps1(scalar);
  
-  const unsigned int quarterPoints = num_points / 4;
+    const unsigned int quarterPoints = num_points / 4;
  
-  __m128 iValue, qValue, cplxValue1, cplxValue2;
-  __m128i intValue1, intValue2;
+    __m128 iValue, qValue, cplxValue1, cplxValue2;
+    __m128i intValue1, intValue2;
  
-  int16_t* complexVectorPtr = (int16_t*)complexVector;
+    int16_t* complexVectorPtr = (int16_t*)complexVector;
  
-  for(;number < quarterPoints; number++){
-    iValue = _mm_load_ps(iBufferPtr);
-    qValue = _mm_load_ps(qBufferPtr);
+    for (; number < quarterPoints; number++) {
+        iValue = _mm_load_ps(iBufferPtr);
+        qValue = _mm_load_ps(qBufferPtr);
  
-    // Interleaves the lower two values in the i and q variables into one buffer
-    cplxValue1 = _mm_unpacklo_ps(iValue, qValue);
-    cplxValue1 = _mm_mul_ps(cplxValue1, vScalar);
+        // Interleaves the lower two values in the i and q variables into one buffer
+        cplxValue1 = _mm_unpacklo_ps(iValue, qValue);
+        cplxValue1 = _mm_mul_ps(cplxValue1, vScalar);
  
-    // Interleaves the upper two values in the i and q variables into one buffer
-    cplxValue2 = _mm_unpackhi_ps(iValue, qValue);
-    cplxValue2 = _mm_mul_ps(cplxValue2, vScalar);
+        // Interleaves the upper two values in the i and q variables into one buffer
+        cplxValue2 = _mm_unpackhi_ps(iValue, qValue);
+        cplxValue2 = _mm_mul_ps(cplxValue2, vScalar);
  
-    intValue1 = _mm_cvtps_epi32(cplxValue1);
-    intValue2 = _mm_cvtps_epi32(cplxValue2);
+        intValue1 = _mm_cvtps_epi32(cplxValue1);
+        intValue2 = _mm_cvtps_epi32(cplxValue2);
  
-    intValue1 = _mm_packs_epi32(intValue1, intValue2);
+        intValue1 = _mm_packs_epi32(intValue1, intValue2);
  
-    _mm_store_si128((__m128i*)complexVectorPtr, intValue1);
-    complexVectorPtr += 8;
+        _mm_store_si128((__m128i*)complexVectorPtr, intValue1);
+        complexVectorPtr += 8;
  
-    iBufferPtr += 4;
-    qBufferPtr += 4;
-  }
+        iBufferPtr += 4;
+        qBufferPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  complexVectorPtr = (int16_t*)(&complexVector[number]);
-  for(; number < num_points; number++){
-    *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
-    *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
-  }
+    number = quarterPoints * 4;
+    complexVectorPtr = (int16_t*)(&complexVector[number]);
+    for (; number < num_points; number++) {
+        *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
+        *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
+    }
  }
  #endif /* LV_HAVE_SSE2 */
  
@@ -190,79 +194,83 @@ volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVector, const float* i
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVector, const float* iBuffer,
-                                       const float* qBuffer, const float scalar, unsigned int num_points)
+static inline void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVector,
+                                                          const float* iBuffer,
+                                                          const float* qBuffer,
+                                                          const float scalar,
+                                                          unsigned int num_points)
  {
-  unsigned int number = 0;
-  const float* iBufferPtr = iBuffer;
-  const float* qBufferPtr = qBuffer;
+    unsigned int number = 0;
+    const float* iBufferPtr = iBuffer;
+    const float* qBufferPtr = qBuffer;
  
-  __m128 vScalar = _mm_set_ps1(scalar);
+    __m128 vScalar = _mm_set_ps1(scalar);
  
-  const unsigned int quarterPoints = num_points / 4;
+    const unsigned int quarterPoints = num_points / 4;
  
-  __m128 iValue, qValue, cplxValue;
+    __m128 iValue, qValue, cplxValue;
  
-  int16_t* complexVectorPtr = (int16_t*)complexVector;
+    int16_t* complexVectorPtr = (int16_t*)complexVector;
  
-  __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+    __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
  
-  for(;number < quarterPoints; number++){
-    iValue = _mm_load_ps(iBufferPtr);
-    qValue = _mm_load_ps(qBufferPtr);
+    for (; number < quarterPoints; number++) {
+        iValue = _mm_load_ps(iBufferPtr);
+        qValue = _mm_load_ps(qBufferPtr);
  
-    // Interleaves the lower two values in the i and q variables into one buffer
-    cplxValue = _mm_unpacklo_ps(iValue, qValue);
-    cplxValue = _mm_mul_ps(cplxValue, vScalar);
+        // Interleaves the lower two values in the i and q variables into one buffer
+        cplxValue = _mm_unpacklo_ps(iValue, qValue);
+        cplxValue = _mm_mul_ps(cplxValue, vScalar);
  
-    _mm_store_ps(floatBuffer, cplxValue);
+        _mm_store_ps(floatBuffer, cplxValue);
  
-    *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
-    *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
-    *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
-    *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
+        *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
+        *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
+        *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
+        *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
  
-    // Interleaves the upper two values in the i and q variables into one buffer
-    cplxValue = _mm_unpackhi_ps(iValue, qValue);
-    cplxValue = _mm_mul_ps(cplxValue, vScalar);
+        // Interleaves the upper two values in the i and q variables into one buffer
+        cplxValue = _mm_unpackhi_ps(iValue, qValue);
+        cplxValue = _mm_mul_ps(cplxValue, vScalar);
  
-    _mm_store_ps(floatBuffer, cplxValue);
+        _mm_store_ps(floatBuffer, cplxValue);
  
-    *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
-    *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
-    *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
-    *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
+        *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
+        *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
+        *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
+        *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
  
-    iBufferPtr += 4;
-    qBufferPtr += 4;
-  }
+        iBufferPtr += 4;
+        qBufferPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  complexVectorPtr = (int16_t*)(&complexVector[number]);
-  for(; number < num_points; number++){
-    *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
-    *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
-  }
+    number = quarterPoints * 4;
+    complexVectorPtr = (int16_t*)(&complexVector[number]);
+    for (; number < num_points; number++) {
+        *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
+        *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
+    }
  }
  #endif /* LV_HAVE_SSE */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector, const float* iBuffer,
-                                         const float* qBuffer, const float scalar, unsigned int num_points)
+static inline void volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector,
+                                                            const float* iBuffer,
+                                                            const float* qBuffer,
+                                                            const float scalar,
+                                                            unsigned int num_points)
  {
-  int16_t* complexVectorPtr = (int16_t*)complexVector;
-  const float* iBufferPtr = iBuffer;
-  const float* qBufferPtr = qBuffer;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
-    *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
-  }
+    int16_t* complexVectorPtr = (int16_t*)complexVector;
+    const float* iBufferPtr = iBuffer;
+    const float* qBufferPtr = qBuffer;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
+        *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -272,60 +280,62 @@ volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector, const float*
  #ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H
  #define INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H
  
-#include <volk/volk_common.h>
  #include <inttypes.h>
  #include <stdio.h>
+#include <volk/volk_common.h>
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_32f_x2_s32f_interleave_16ic_u_avx2(lv_16sc_t* complexVector, const float* iBuffer,
-                                        const float* qBuffer, const float scalar, unsigned int num_points)
+static inline void volk_32f_x2_s32f_interleave_16ic_u_avx2(lv_16sc_t* complexVector,
+                                                           const float* iBuffer,
+                                                           const float* qBuffer,
+                                                           const float scalar,
+                                                           unsigned int num_points)
  {
-  unsigned int number = 0;
-  const float* iBufferPtr = iBuffer;
-  const float* qBufferPtr = qBuffer;
+    unsigned int number = 0;
+    const float* iBufferPtr = iBuffer;
+    const float* qBufferPtr = qBuffer;
  
-  __m256 vScalar = _mm256_set1_ps(scalar);
+    __m256 vScalar = _mm256_set1_ps(scalar);
  
-  const unsigned int eighthPoints = num_points / 8;
+    const unsigned int eighthPoints = num_points / 8;
  
-  __m256 iValue, qValue, cplxValue1, cplxValue2;
-  __m256i intValue1, intValue2;
+    __m256 iValue, qValue, cplxValue1, cplxValue2;
+    __m256i intValue1, intValue2;
  
-  int16_t* complexVectorPtr = (int16_t*)complexVector;
+    int16_t* complexVectorPtr = (int16_t*)complexVector;
  
-  for(;number < eighthPoints; number++){
-    iValue = _mm256_loadu_ps(iBufferPtr);
-    qValue = _mm256_loadu_ps(qBufferPtr);
+    for (; number < eighthPoints; number++) {
+        iValue = _mm256_loadu_ps(iBufferPtr);
+        qValue = _mm256_loadu_ps(qBufferPtr);
  
-    // Interleaves the lower two values in the i and q variables into one buffer
-    cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
-    cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
+        // Interleaves the lower two values in the i and q variables into one buffer
+        cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
+        cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
  
-    // Interleaves the upper two values in the i and q variables into one buffer
-    cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
-    cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
+        // Interleaves the upper two values in the i and q variables into one buffer
+        cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
+        cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
  
-    intValue1 = _mm256_cvtps_epi32(cplxValue1);
-    intValue2 = _mm256_cvtps_epi32(cplxValue2);
+        intValue1 = _mm256_cvtps_epi32(cplxValue1);
+        intValue2 = _mm256_cvtps_epi32(cplxValue2);
  
-    intValue1 = _mm256_packs_epi32(intValue1, intValue2);
+        intValue1 = _mm256_packs_epi32(intValue1, intValue2);
  
-    _mm256_storeu_si256((__m256i*)complexVectorPtr, intValue1);
-    complexVectorPtr += 16;
+        _mm256_storeu_si256((__m256i*)complexVectorPtr, intValue1);
+        complexVectorPtr += 16;
  
-    iBufferPtr += 8;
-    qBufferPtr += 8;
-  }
+        iBufferPtr += 8;
+        qBufferPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  complexVectorPtr = (int16_t*)(&complexVector[number]);
-  for(; number < num_points; number++){
-    *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
-    *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
-  }
+    number = eighthPoints * 8;
+    complexVectorPtr = (int16_t*)(&complexVector[number]);
+    for (; number < num_points; number++) {
+        *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
+        *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
diff --git a/kernels/volk/volk_32f_x2_subtract_32f.h b/kernels/volk/volk_32f_x2_subtract_32f.h

index bdfa0a1e4bf77771093a4f46cf83bc540db2f92e..359974cfef54f755cfdd427e9c864d9ee1df02c5 100644 (file)
--- a/kernels/volk/volk_32f_x2_subtract_32f.h
+++ b/kernels/volk/volk_32f_x2_subtract_32f.h
@@ -31,8 +31,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32f_x2_subtract_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
- * \endcode
+ * void volk_32f_x2_subtract_32f(float* cVector, const float* aVector, const float*
+ * bVector, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li aVector: The initial vector.
@@ -77,126 +77,130 @@
  #ifdef LV_HAVE_AVX512F
  #include <immintrin.h>
  
-static inline void
-volk_32f_x2_subtract_32f_a_avx512f(float* cVector, const float* aVector,
-                               const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_subtract_32f_a_avx512f(float* cVector,
+                                                      const float* aVector,
+                                                      const float* bVector,
+                                                      unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr = bVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m512 aVal, bVal, cVal;
-  for(;number < sixteenthPoints; number++){
+    __m512 aVal, bVal, cVal;
+    for (; number < sixteenthPoints; number++) {
  
-    aVal = _mm512_load_ps(aPtr);
-    bVal = _mm512_load_ps(bPtr);
+        aVal = _mm512_load_ps(aPtr);
+        bVal = _mm512_load_ps(bPtr);
  
-    cVal = _mm512_sub_ps(aVal, bVal);
+        cVal = _mm512_sub_ps(aVal, bVal);
  
-    _mm512_store_ps(cPtr,cVal); // Store the results back into the C container
+        _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 16;
-    bPtr += 16;
-    cPtr += 16;
-  }
+        aPtr += 16;
+        bPtr += 16;
+        cPtr += 16;
+    }
  
-  number = sixteenthPoints *16;
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) - (*bPtr++);
-  }
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) - (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_AVX512F */
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32f_x2_subtract_32f_a_avx(float* cVector, const float* aVector,
-                               const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_subtract_32f_a_avx(float* cVector,
+                                                  const float* aVector,
+                                                  const float* bVector,
+                                                  unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr = bVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m256 aVal, bVal, cVal;
-  for(;number < eighthPoints; number++){
+    __m256 aVal, bVal, cVal;
+    for (; number < eighthPoints; number++) {
  
-    aVal = _mm256_load_ps(aPtr);
-    bVal = _mm256_load_ps(bPtr);
+        aVal = _mm256_load_ps(aPtr);
+        bVal = _mm256_load_ps(bPtr);
  
-    cVal = _mm256_sub_ps(aVal, bVal);
+        cVal = _mm256_sub_ps(aVal, bVal);
  
-    _mm256_store_ps(cPtr,cVal); // Store the results back into the C container
+        _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 8;
-    bPtr += 8;
-    cPtr += 8;
-  }
+        aPtr += 8;
+        bPtr += 8;
+        cPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) - (*bPtr++);
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) - (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_32f_x2_subtract_32f_a_sse(float* cVector, const float* aVector,
-                               const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_subtract_32f_a_sse(float* cVector,
+                                                  const float* aVector,
+                                                  const float* bVector,
+                                                  unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr = bVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m128 aVal, bVal, cVal;
-  for(;number < quarterPoints; number++){
+    __m128 aVal, bVal, cVal;
+    for (; number < quarterPoints; number++) {
  
-    aVal = _mm_load_ps(aPtr);
-    bVal = _mm_load_ps(bPtr);
+        aVal = _mm_load_ps(aPtr);
+        bVal = _mm_load_ps(bPtr);
  
-    cVal = _mm_sub_ps(aVal, bVal);
+        cVal = _mm_sub_ps(aVal, bVal);
  
-    _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+        _mm_store_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 4;
-    bPtr += 4;
-    cPtr += 4;
-  }
+        aPtr += 4;
+        bPtr += 4;
+        cPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) - (*bPtr++);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) - (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_SSE */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32f_x2_subtract_32f_generic(float* cVector, const float* aVector,
-                                 const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_subtract_32f_generic(float* cVector,
+                                                    const float* aVector,
+                                                    const float* bVector,
+                                                    unsigned int num_points)
  {
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr = bVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *cPtr++ = (*aPtr++) - (*bPtr++);
-  }
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) - (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -204,45 +208,48 @@ volk_32f_x2_subtract_32f_generic(float* cVector, const float* aVector,
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void
-volk_32f_x2_subtract_32f_neon(float* cVector, const float* aVector,
-                              const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_subtract_32f_neon(float* cVector,
+                                                 const float* aVector,
+                                                 const float* bVector,
+                                                 unsigned int num_points)
  {
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr = bVector;
-  unsigned int number = 0;
-  unsigned int quarter_points = num_points / 4;
-
-  float32x4_t a_vec, b_vec, c_vec;
-
-  for(number = 0; number < quarter_points; number++){
-    a_vec = vld1q_f32(aPtr);
-    b_vec = vld1q_f32(bPtr);
-    c_vec = vsubq_f32(a_vec, b_vec);
-    vst1q_f32(cPtr, c_vec);
-    aPtr += 4;
-    bPtr += 4;
-    cPtr += 4;
-  }
-
-  for(number = quarter_points * 4; number < num_points; number++){
-    *cPtr++ = (*aPtr++) - (*bPtr++);
-  }
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
+    unsigned int number = 0;
+    unsigned int quarter_points = num_points / 4;
+
+    float32x4_t a_vec, b_vec, c_vec;
+
+    for (number = 0; number < quarter_points; number++) {
+        a_vec = vld1q_f32(aPtr);
+        b_vec = vld1q_f32(bPtr);
+        c_vec = vsubq_f32(a_vec, b_vec);
+        vst1q_f32(cPtr, c_vec);
+        aPtr += 4;
+        bPtr += 4;
+        cPtr += 4;
+    }
+
+    for (number = quarter_points * 4; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) - (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_NEON */
  
  
  #ifdef LV_HAVE_ORC
-extern void
-volk_32f_x2_subtract_32f_a_orc_impl(float* cVector, const float* aVector,
-                                    const float* bVector, unsigned int num_points);
-
-static inline void
-volk_32f_x2_subtract_32f_u_orc(float* cVector, const float* aVector,
-                               const float* bVector, unsigned int num_points)
+extern void volk_32f_x2_subtract_32f_a_orc_impl(float* cVector,
+                                                const float* aVector,
+                                                const float* bVector,
+                                                unsigned int num_points);
+
+static inline void volk_32f_x2_subtract_32f_u_orc(float* cVector,
+                                                  const float* aVector,
+                                                  const float* bVector,
+                                                  unsigned int num_points)
  {
-  volk_32f_x2_subtract_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+    volk_32f_x2_subtract_32f_a_orc_impl(cVector, aVector, bVector, num_points);
  }
  #endif /* LV_HAVE_ORC */
  
@@ -259,36 +266,37 @@ volk_32f_x2_subtract_32f_u_orc(float* cVector, const float* aVector,
  #ifdef LV_HAVE_AVX512F
  #include <immintrin.h>
  
-static inline void
-volk_32f_x2_subtract_32f_u_avx512f(float* cVector, const float* aVector,
-                               const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_subtract_32f_u_avx512f(float* cVector,
+                                                      const float* aVector,
+                                                      const float* bVector,
+                                                      unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr = bVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m512 aVal, bVal, cVal;
-  for(;number < sixteenthPoints; number++){
+    __m512 aVal, bVal, cVal;
+    for (; number < sixteenthPoints; number++) {
  
-    aVal = _mm512_loadu_ps(aPtr);
-    bVal = _mm512_loadu_ps(bPtr);
+        aVal = _mm512_loadu_ps(aPtr);
+        bVal = _mm512_loadu_ps(bPtr);
  
-    cVal = _mm512_sub_ps(aVal, bVal);
+        cVal = _mm512_sub_ps(aVal, bVal);
  
-    _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container
+        _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 16;
-    bPtr += 16;
-    cPtr += 16;
-  }
+        aPtr += 16;
+        bPtr += 16;
+        cPtr += 16;
+    }
  
-  number = sixteenthPoints *16;
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) - (*bPtr++);
-  }
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) - (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_AVX512F */
  
@@ -296,36 +304,37 @@ volk_32f_x2_subtract_32f_u_avx512f(float* cVector, const float* aVector,
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32f_x2_subtract_32f_u_avx(float* cVector, const float* aVector,
-                               const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_subtract_32f_u_avx(float* cVector,
+                                                  const float* aVector,
+                                                  const float* bVector,
+                                                  unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  const float* bPtr = bVector;
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m256 aVal, bVal, cVal;
-  for(;number < eighthPoints; number++){
+    __m256 aVal, bVal, cVal;
+    for (; number < eighthPoints; number++) {
  
-    aVal = _mm256_loadu_ps(aPtr);
-    bVal = _mm256_loadu_ps(bPtr);
+        aVal = _mm256_loadu_ps(aPtr);
+        bVal = _mm256_loadu_ps(bPtr);
  
-    cVal = _mm256_sub_ps(aVal, bVal);
+        cVal = _mm256_sub_ps(aVal, bVal);
  
-    _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
+        _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 8;
-    bPtr += 8;
-    cPtr += 8;
-  }
+        aPtr += 8;
+        bPtr += 8;
+        cPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) - (*bPtr++);
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) - (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
diff --git a/kernels/volk/volk_32f_x3_sum_of_poly_32f.h b/kernels/volk/volk_32f_x3_sum_of_poly_32f.h

index e74a385d0e8902f38086d2c06b39cb741bf8e146..b0b14668bf05604f38ed1598b012c9dfe5435b3f 100644 (file)
--- a/kernels/volk/volk_32f_x3_sum_of_poly_32f.h
+++ b/kernels/volk/volk_32f_x3_sum_of_poly_32f.h
@@ -30,12 +30,13 @@
   * multiply by the rectangle/bin width.
   *
   * Expressed as a formula, this function calculates
- * \f$ \sum f(x) = \sum (c_0 + c_1 \cdot x + c_2 \cdot x^2 + c_3 \cdot x^3 + c_4 \cdot x^4)\f$
+ * \f$ \sum f(x) = \sum (c_0 + c_1 \cdot x + c_2 \cdot x^2 + c_3 \cdot x^3 + c_4 \cdot
+ * x^4)\f$
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32f_x3_sum_of_poly_32f(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points)
- * \endcode
+ * void volk_32f_x3_sum_of_poly_32f(float* target, float* src0, float* center_point_array,
+ * float* cutoff, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li src0: x values
@@ -53,9 +54,10 @@
   * \code
   *   int npoints = 4096;
   *   float* coefficients = (float*)volk_malloc(sizeof(float) * 5, volk_get_alignment());
- *   float* input        = (float*)volk_malloc(sizeof(float) * npoints, volk_get_alignment());
- *   float* result       = (float*)volk_malloc(sizeof(float), volk_get_alignment());
- *   float* cutoff       = (float*)volk_malloc(sizeof(float), volk_get_alignment());
+ *   float* input        = (float*)volk_malloc(sizeof(float) * npoints,
+ * volk_get_alignment()); float* result       = (float*)volk_malloc(sizeof(float),
+ * volk_get_alignment()); float* cutoff       = (float*)volk_malloc(sizeof(float),
+ * volk_get_alignment());
   *   // load precomputed Taylor series coefficients
   *   coefficients[0] = 4.48168907033806f;            // c1
   *   coefficients[1] = coefficients[0] * 0.5f;       // c2
@@ -82,288 +84,291 @@
  #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
  #define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
  
-#include<inttypes.h>
-#include<stdio.h>
-#include<volk/volk_complex.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
  
  #ifndef MAX
-#define MAX(X,Y) ((X) > (Y)?(X):(Y))
+#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
  #endif
  
  #ifdef LV_HAVE_SSE3
-#include<xmmintrin.h>
-#include<pmmintrin.h>
-
-static inline void
-volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array,
-                                   float* cutoff, unsigned int num_points)
+#include <pmmintrin.h>
+#include <xmmintrin.h>
+
+static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target,
+                                                      float* src0,
+                                                      float* center_point_array,
+                                                      float* cutoff,
+                                                      unsigned int num_points)
  {
-  float result = 0.0f;
-  float fst    = 0.0f;
-  float sq     = 0.0f;
-  float thrd   = 0.0f;
-  float frth   = 0.0f;
-
-  __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
-
-  xmm9  = _mm_setzero_ps();
-  xmm1  = _mm_setzero_ps();
-  xmm0  = _mm_load1_ps(&center_point_array[0]);
-  xmm6  = _mm_load1_ps(&center_point_array[1]);
-  xmm7  = _mm_load1_ps(&center_point_array[2]);
-  xmm8  = _mm_load1_ps(&center_point_array[3]);
-  xmm10 = _mm_load1_ps(cutoff);
-
-  int bound = num_points/8;
-  int leftovers = num_points - 8*bound;
-  int i = 0;
-  for(; i < bound; ++i) {
-    // 1st
-    xmm2 = _mm_load_ps(src0);
-    xmm2 = _mm_max_ps(xmm10, xmm2);
-    xmm3 = _mm_mul_ps(xmm2, xmm2);
-    xmm4 = _mm_mul_ps(xmm2, xmm3);
-    xmm5 = _mm_mul_ps(xmm3, xmm3);
-
-    xmm2 = _mm_mul_ps(xmm2, xmm0);
-    xmm3 = _mm_mul_ps(xmm3, xmm6);
-    xmm4 = _mm_mul_ps(xmm4, xmm7);
-    xmm5 = _mm_mul_ps(xmm5, xmm8);
-
-    xmm2 = _mm_add_ps(xmm2, xmm3);
-    xmm3 = _mm_add_ps(xmm4, xmm5);
-
-    src0 += 4;
-
-    xmm9 = _mm_add_ps(xmm2, xmm9);
-    xmm9 = _mm_add_ps(xmm3, xmm9);
-
-    // 2nd
-    xmm2 = _mm_load_ps(src0);
-    xmm2 = _mm_max_ps(xmm10, xmm2);
-    xmm3 = _mm_mul_ps(xmm2, xmm2);
-    xmm4 = _mm_mul_ps(xmm2, xmm3);
-    xmm5 = _mm_mul_ps(xmm3, xmm3);
-
-    xmm2 = _mm_mul_ps(xmm2, xmm0);
-    xmm3 = _mm_mul_ps(xmm3, xmm6);
-    xmm4 = _mm_mul_ps(xmm4, xmm7);
-    xmm5 = _mm_mul_ps(xmm5, xmm8);
-
-    xmm2 = _mm_add_ps(xmm2, xmm3);
-    xmm3 = _mm_add_ps(xmm4, xmm5);
-
-    src0 += 4;
-
-    xmm1 = _mm_add_ps(xmm2, xmm1);
-    xmm1 = _mm_add_ps(xmm3, xmm1);
-  }
-  xmm2 = _mm_hadd_ps(xmm9, xmm1);
-  xmm3 = _mm_hadd_ps(xmm2, xmm2);
-  xmm4 = _mm_hadd_ps(xmm3, xmm3);
-  _mm_store_ss(&result, xmm4);
-
-  for(i = 0; i < leftovers; ++i) {
-    fst  = *src0++;
-    fst  = MAX(fst, *cutoff);
-    sq   = fst * fst;
-    thrd = fst * sq;
-    frth = sq * sq;
-    result += (center_point_array[0] * fst +
-              center_point_array[1] * sq +
-              center_point_array[2] * thrd +
-              center_point_array[3] * frth);
-  }
-
-  result += (float)(num_points) * center_point_array[4];
-  *target = result;
+    float result = 0.0f;
+    float fst = 0.0f;
+    float sq = 0.0f;
+    float thrd = 0.0f;
+    float frth = 0.0f;
+
+    __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
+
+    xmm9 = _mm_setzero_ps();
+    xmm1 = _mm_setzero_ps();
+    xmm0 = _mm_load1_ps(&center_point_array[0]);
+    xmm6 = _mm_load1_ps(&center_point_array[1]);
+    xmm7 = _mm_load1_ps(&center_point_array[2]);
+    xmm8 = _mm_load1_ps(&center_point_array[3]);
+    xmm10 = _mm_load1_ps(cutoff);
+
+    int bound = num_points / 8;
+    int leftovers = num_points - 8 * bound;
+    int i = 0;
+    for (; i < bound; ++i) {
+        // 1st
+        xmm2 = _mm_load_ps(src0);
+        xmm2 = _mm_max_ps(xmm10, xmm2);
+        xmm3 = _mm_mul_ps(xmm2, xmm2);
+        xmm4 = _mm_mul_ps(xmm2, xmm3);
+        xmm5 = _mm_mul_ps(xmm3, xmm3);
+
+        xmm2 = _mm_mul_ps(xmm2, xmm0);
+        xmm3 = _mm_mul_ps(xmm3, xmm6);
+        xmm4 = _mm_mul_ps(xmm4, xmm7);
+        xmm5 = _mm_mul_ps(xmm5, xmm8);
+
+        xmm2 = _mm_add_ps(xmm2, xmm3);
+        xmm3 = _mm_add_ps(xmm4, xmm5);
+
+        src0 += 4;
+
+        xmm9 = _mm_add_ps(xmm2, xmm9);
+        xmm9 = _mm_add_ps(xmm3, xmm9);
+
+        // 2nd
+        xmm2 = _mm_load_ps(src0);
+        xmm2 = _mm_max_ps(xmm10, xmm2);
+        xmm3 = _mm_mul_ps(xmm2, xmm2);
+        xmm4 = _mm_mul_ps(xmm2, xmm3);
+        xmm5 = _mm_mul_ps(xmm3, xmm3);
+
+        xmm2 = _mm_mul_ps(xmm2, xmm0);
+        xmm3 = _mm_mul_ps(xmm3, xmm6);
+        xmm4 = _mm_mul_ps(xmm4, xmm7);
+        xmm5 = _mm_mul_ps(xmm5, xmm8);
+
+        xmm2 = _mm_add_ps(xmm2, xmm3);
+        xmm3 = _mm_add_ps(xmm4, xmm5);
+
+        src0 += 4;
+
+        xmm1 = _mm_add_ps(xmm2, xmm1);
+        xmm1 = _mm_add_ps(xmm3, xmm1);
+    }
+    xmm2 = _mm_hadd_ps(xmm9, xmm1);
+    xmm3 = _mm_hadd_ps(xmm2, xmm2);
+    xmm4 = _mm_hadd_ps(xmm3, xmm3);
+    _mm_store_ss(&result, xmm4);
+
+    for (i = 0; i < leftovers; ++i) {
+        fst = *src0++;
+        fst = MAX(fst, *cutoff);
+        sq = fst * fst;
+        thrd = fst * sq;
+        frth = sq * sq;
+        result += (center_point_array[0] * fst + center_point_array[1] * sq +
+                   center_point_array[2] * thrd + center_point_array[3] * frth);
+    }
+
+    result += (float)(num_points)*center_point_array[4];
+    *target = result;
  }
  
  
  #endif /*LV_HAVE_SSE3*/
  
  #if LV_HAVE_AVX && LV_HAVE_FMA
-#include<immintrin.h>
+#include <immintrin.h>
  
-static inline void
-volk_32f_x3_sum_of_poly_32f_a_avx2_fma(float* target, float* src0, float* center_point_array,
-                                  float* cutoff, unsigned int num_points)
+static inline void volk_32f_x3_sum_of_poly_32f_a_avx2_fma(float* target,
+                                                          float* src0,
+                                                          float* center_point_array,
+                                                          float* cutoff,
+                                                          unsigned int num_points)
  {
-  const unsigned int eighth_points = num_points / 8;
-  float fst = 0.0;
-  float sq = 0.0;
-  float thrd = 0.0;
-  float frth = 0.0;
-
-  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
-  __m256 target_vec;
-  __m256 x_to_1, x_to_2, x_to_3, x_to_4;
-
-  cpa0 = _mm256_set1_ps(center_point_array[0]);
-  cpa1 = _mm256_set1_ps(center_point_array[1]);
-  cpa2 = _mm256_set1_ps(center_point_array[2]);
-  cpa3 = _mm256_set1_ps(center_point_array[3]);
-  cutoff_vec = _mm256_set1_ps(*cutoff);
-  target_vec = _mm256_setzero_ps();
-
-  unsigned int i;
-
-  for(i = 0; i < eighth_points; ++i) {
-    x_to_1 = _mm256_load_ps(src0);
-    x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
-    x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
-    x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
-    // x^1 * x^3 is slightly faster than x^2 * x^2
-    x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
-
-    x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
-    x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
-
-    x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);
-    x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);
-    // this is slightly faster than result += (x_to_1 + x_to_3)
-    target_vec = _mm256_add_ps(x_to_1, target_vec);
-    target_vec = _mm256_add_ps(x_to_3, target_vec);
-
-    src0 += 8;
-  }
-
-  // the hadd for vector reduction has very very slight impact @ 50k iters
-  __VOLK_ATTR_ALIGNED(32) float temp_results[8];
-  target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
-  _mm256_store_ps(temp_results, target_vec);
-  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
-
-  for(i = eighth_points*8; i < num_points; ++i) {
-    fst  = *src0++;
-    fst  = MAX(fst, *cutoff);
-    sq   = fst * fst;
-    thrd = fst * sq;
-    frth = sq * sq;
-    *target += (center_point_array[0] * fst +
-                center_point_array[1] * sq +
-                center_point_array[2] * thrd +
-                center_point_array[3] * frth);
-  }
-  *target += (float)(num_points) * center_point_array[4];
+    const unsigned int eighth_points = num_points / 8;
+    float fst = 0.0;
+    float sq = 0.0;
+    float thrd = 0.0;
+    float frth = 0.0;
+
+    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
+    __m256 target_vec;
+    __m256 x_to_1, x_to_2, x_to_3, x_to_4;
+
+    cpa0 = _mm256_set1_ps(center_point_array[0]);
+    cpa1 = _mm256_set1_ps(center_point_array[1]);
+    cpa2 = _mm256_set1_ps(center_point_array[2]);
+    cpa3 = _mm256_set1_ps(center_point_array[3]);
+    cutoff_vec = _mm256_set1_ps(*cutoff);
+    target_vec = _mm256_setzero_ps();
+
+    unsigned int i;
+
+    for (i = 0; i < eighth_points; ++i) {
+        x_to_1 = _mm256_load_ps(src0);
+        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
+        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
+        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
+        // x^1 * x^3 is slightly faster than x^2 * x^2
+        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
+
+        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
+        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
+
+        x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);
+        x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);
+        // this is slightly faster than result += (x_to_1 + x_to_3)
+        target_vec = _mm256_add_ps(x_to_1, target_vec);
+        target_vec = _mm256_add_ps(x_to_3, target_vec);
+
+        src0 += 8;
+    }
+
+    // the hadd for vector reduction has very very slight impact @ 50k iters
+    __VOLK_ATTR_ALIGNED(32) float temp_results[8];
+    target_vec = _mm256_hadd_ps(
+        target_vec,
+        target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
+    _mm256_store_ps(temp_results, target_vec);
+    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
+
+    for (i = eighth_points * 8; i < num_points; ++i) {
+        fst = *src0++;
+        fst = MAX(fst, *cutoff);
+        sq = fst * fst;
+        thrd = fst * sq;
+        frth = sq * sq;
+        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
+                    center_point_array[2] * thrd + center_point_array[3] * frth);
+    }
+    *target += (float)(num_points)*center_point_array[4];
  }
  #endif // LV_HAVE_AVX && LV_HAVE_FMA
  
  #ifdef LV_HAVE_AVX
-#include<immintrin.h>
+#include <immintrin.h>
  
-static inline void
-volk_32f_x3_sum_of_poly_32f_a_avx(float* target, float* src0, float* center_point_array,
-                                  float* cutoff, unsigned int num_points)
+static inline void volk_32f_x3_sum_of_poly_32f_a_avx(float* target,
+                                                     float* src0,
+                                                     float* center_point_array,
+                                                     float* cutoff,
+                                                     unsigned int num_points)
  {
-  const unsigned int eighth_points = num_points / 8;
-  float fst = 0.0;
-  float sq = 0.0;
-  float thrd = 0.0;
-  float frth = 0.0;
-
-  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
-  __m256 target_vec;
-  __m256 x_to_1, x_to_2, x_to_3, x_to_4;
-
-  cpa0 = _mm256_set1_ps(center_point_array[0]);
-  cpa1 = _mm256_set1_ps(center_point_array[1]);
-  cpa2 = _mm256_set1_ps(center_point_array[2]);
-  cpa3 = _mm256_set1_ps(center_point_array[3]);
-  cutoff_vec = _mm256_set1_ps(*cutoff);
-  target_vec = _mm256_setzero_ps();
-
-  unsigned int i;
-
-  for(i = 0; i < eighth_points; ++i) {
-    x_to_1 = _mm256_load_ps(src0);
-    x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
-    x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
-    x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
-    // x^1 * x^3 is slightly faster than x^2 * x^2
-    x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
-
-    x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
-    x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
-    x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
-    x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
-
-    x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
-    x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
-    // this is slightly faster than result += (x_to_1 + x_to_3)
-    target_vec = _mm256_add_ps(x_to_1, target_vec);
-    target_vec = _mm256_add_ps(x_to_3, target_vec);
-
-    src0 += 8;
-  }
-
-  // the hadd for vector reduction has very very slight impact @ 50k iters
-  __VOLK_ATTR_ALIGNED(32) float temp_results[8];
-  target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
-  _mm256_store_ps(temp_results, target_vec);
-  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
-
-  for(i = eighth_points*8; i < num_points; ++i) {
-    fst  = *src0++;
-    fst  = MAX(fst, *cutoff);
-    sq   = fst * fst;
-    thrd = fst * sq;
-    frth = sq * sq;
-    *target += (center_point_array[0] * fst +
-                center_point_array[1] * sq +
-                center_point_array[2] * thrd +
-                center_point_array[3] * frth);
-  }
-  *target += (float)(num_points) * center_point_array[4];
+    const unsigned int eighth_points = num_points / 8;
+    float fst = 0.0;
+    float sq = 0.0;
+    float thrd = 0.0;
+    float frth = 0.0;
+
+    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
+    __m256 target_vec;
+    __m256 x_to_1, x_to_2, x_to_3, x_to_4;
+
+    cpa0 = _mm256_set1_ps(center_point_array[0]);
+    cpa1 = _mm256_set1_ps(center_point_array[1]);
+    cpa2 = _mm256_set1_ps(center_point_array[2]);
+    cpa3 = _mm256_set1_ps(center_point_array[3]);
+    cutoff_vec = _mm256_set1_ps(*cutoff);
+    target_vec = _mm256_setzero_ps();
+
+    unsigned int i;
+
+    for (i = 0; i < eighth_points; ++i) {
+        x_to_1 = _mm256_load_ps(src0);
+        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
+        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
+        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
+        // x^1 * x^3 is slightly faster than x^2 * x^2
+        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
+
+        x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
+        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
+        x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
+        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
+
+        x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
+        x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
+        // this is slightly faster than result += (x_to_1 + x_to_3)
+        target_vec = _mm256_add_ps(x_to_1, target_vec);
+        target_vec = _mm256_add_ps(x_to_3, target_vec);
+
+        src0 += 8;
+    }
+
+    // the hadd for vector reduction has very very slight impact @ 50k iters
+    __VOLK_ATTR_ALIGNED(32) float temp_results[8];
+    target_vec = _mm256_hadd_ps(
+        target_vec,
+        target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
+    _mm256_store_ps(temp_results, target_vec);
+    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
+
+    for (i = eighth_points * 8; i < num_points; ++i) {
+        fst = *src0++;
+        fst = MAX(fst, *cutoff);
+        sq = fst * fst;
+        thrd = fst * sq;
+        frth = sq * sq;
+        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
+                    center_point_array[2] * thrd + center_point_array[3] * frth);
+    }
+    *target += (float)(num_points)*center_point_array[4];
  }
  #endif // LV_HAVE_AVX
  
  
-
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32f_x3_sum_of_poly_32f_generic(float* target, float* src0, float* center_point_array,
-                                    float* cutoff, unsigned int num_points)
+static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target,
+                                                       float* src0,
+                                                       float* center_point_array,
+                                                       float* cutoff,
+                                                       unsigned int num_points)
  {
-  const unsigned int eighth_points = num_points / 8;
-
-  float result[8] = {0.0f,0.0f,0.0f,0.0f, 0.0f,0.0f,0.0f,0.0f};
-  float fst  = 0.0f;
-  float sq   = 0.0f;
-  float thrd = 0.0f;
-  float frth = 0.0f;
-
-  unsigned int i = 0;
-  unsigned int k = 0;
-  for(i = 0; i < eighth_points; ++i) {
-    for(k = 0; k < 8; ++k) {
-      fst  = *src0++;
-      fst  = MAX(fst, *cutoff);
-      sq   = fst * fst;
-      thrd = fst * sq;
-      frth = fst * thrd;
-      result[k] += center_point_array[0] * fst  + center_point_array[1] * sq;
-      result[k] += center_point_array[2] * thrd + center_point_array[3] * frth;
+    const unsigned int eighth_points = num_points / 8;
+
+    float result[8] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
+    float fst = 0.0f;
+    float sq = 0.0f;
+    float thrd = 0.0f;
+    float frth = 0.0f;
+
+    unsigned int i = 0;
+    unsigned int k = 0;
+    for (i = 0; i < eighth_points; ++i) {
+        for (k = 0; k < 8; ++k) {
+            fst = *src0++;
+            fst = MAX(fst, *cutoff);
+            sq = fst * fst;
+            thrd = fst * sq;
+            frth = fst * thrd;
+            result[k] += center_point_array[0] * fst + center_point_array[1] * sq;
+            result[k] += center_point_array[2] * thrd + center_point_array[3] * frth;
+        }
      }
-  }
-  for(k = 0; k < 8; k+=2)
-    result[k] = result[k]+result[k+1];
-
-  *target = result[0] + result[2] + result[4] + result[6];
-
-  for(i = eighth_points*8; i < num_points; ++i) {
-    fst  = *src0++;
-    fst  = MAX(fst, *cutoff);
-    sq   = fst * fst;
-    thrd = fst * sq;
-    frth = fst * thrd;
-    *target += (center_point_array[0] * fst +
-                center_point_array[1] * sq +
-                center_point_array[2] * thrd +
-                center_point_array[3] * frth);
-  }
-  *target += (float)(num_points) * center_point_array[4];
+    for (k = 0; k < 8; k += 2)
+        result[k] = result[k] + result[k + 1];
+
+    *target = result[0] + result[2] + result[4] + result[6];
+
+    for (i = eighth_points * 8; i < num_points; ++i) {
+        fst = *src0++;
+        fst = MAX(fst, *cutoff);
+        sq = fst * fst;
+        thrd = fst * sq;
+        frth = fst * thrd;
+        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
+                    center_point_array[2] * thrd + center_point_array[3] * frth);
+    }
+    *target += (float)(num_points)*center_point_array[4];
  }
  
  #endif /*LV_HAVE_GENERIC*/
@@ -372,51 +377,52 @@ volk_32f_x3_sum_of_poly_32f_generic(float* target, float* src0, float* center_po
  #include <arm_neon.h>
  
  static inline void
-volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target, float* __restrict src0,
+volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target,
+                                   float* __restrict src0,
                                     float* __restrict center_point_array,
-                                   float* __restrict cutoff, unsigned int num_points)
+                                   float* __restrict cutoff,
+                                   unsigned int num_points)
  {
-  unsigned int i;
-  float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
-
-  float32x2_t x_to_1, x_to_2, x_to_3, x_to_4;
-  float32x2_t cutoff_vector;
-  float32x2x2_t x_low, x_high;
-  float32x4_t x_qvector, c_qvector, cpa_qvector;
-  float accumulator;
-  float res_accumulators[4];
-
-  c_qvector = vld1q_f32( zero );
-  // load the cutoff in to a vector
-  cutoff_vector = vdup_n_f32( *cutoff );
-  // ... center point array
-  cpa_qvector = vld1q_f32( center_point_array );
-
-  for(i=0; i < num_points; ++i) {
-    // load x  (src0)
-    x_to_1 = vdup_n_f32( *src0++ );
-
-    // Get a vector of max(src0, cutoff)
-    x_to_1 = vmax_f32(x_to_1,  cutoff_vector ); // x^1
-    x_to_2 = vmul_f32(x_to_1, x_to_1); // x^2
-    x_to_3 = vmul_f32(x_to_2, x_to_1); // x^3
-    x_to_4 = vmul_f32(x_to_3, x_to_1); // x^4
-    // zip up doubles to interleave
-    x_low = vzip_f32(x_to_1, x_to_2); // [x^2 | x^1 || x^2 | x^1]
-    x_high = vzip_f32(x_to_3, x_to_4); // [x^4 | x^3 || x^4 | x^3]
-    // float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
-    x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]);
-    // now we finally have [x^4 | x^3 | x^2 | x] !
-
-    c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector);
-
-  }
-  // there should be better vector reduction techniques
-  vst1q_f32(res_accumulators, c_qvector );
-  accumulator = res_accumulators[0] + res_accumulators[1] +
-          res_accumulators[2] + res_accumulators[3];
-
-  *target = accumulator + (float)num_points * center_point_array[4];
+    unsigned int i;
+    float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
+
+    float32x2_t x_to_1, x_to_2, x_to_3, x_to_4;
+    float32x2_t cutoff_vector;
+    float32x2x2_t x_low, x_high;
+    float32x4_t x_qvector, c_qvector, cpa_qvector;
+    float accumulator;
+    float res_accumulators[4];
+
+    c_qvector = vld1q_f32(zero);
+    // load the cutoff in to a vector
+    cutoff_vector = vdup_n_f32(*cutoff);
+    // ... center point array
+    cpa_qvector = vld1q_f32(center_point_array);
+
+    for (i = 0; i < num_points; ++i) {
+        // load x  (src0)
+        x_to_1 = vdup_n_f32(*src0++);
+
+        // Get a vector of max(src0, cutoff)
+        x_to_1 = vmax_f32(x_to_1, cutoff_vector); // x^1
+        x_to_2 = vmul_f32(x_to_1, x_to_1);        // x^2
+        x_to_3 = vmul_f32(x_to_2, x_to_1);        // x^3
+        x_to_4 = vmul_f32(x_to_3, x_to_1);        // x^4
+        // zip up doubles to interleave
+        x_low = vzip_f32(x_to_1, x_to_2);  // [x^2 | x^1 || x^2 | x^1]
+        x_high = vzip_f32(x_to_3, x_to_4); // [x^4 | x^3 || x^4 | x^3]
+        // float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
+        x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]);
+        // now we finally have [x^4 | x^3 | x^2 | x] !
+
+        c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector);
+    }
+    // there should be better vector reduction techniques
+    vst1q_f32(res_accumulators, c_qvector);
+    accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] +
+                  res_accumulators[3];
+
+    *target = accumulator + (float)num_points * center_point_array[4];
  }
  
  #endif /* LV_HAVE_NEON */
@@ -425,82 +431,82 @@ volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target, float* __restrict s
  #ifdef LV_HAVE_NEON
  
  static inline void
-volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target, float* __restrict src0,
+volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target,
+                                     float* __restrict src0,
                                       float* __restrict center_point_array,
-                                     float* __restrict cutoff, unsigned int num_points)
+                                     float* __restrict cutoff,
+                                     unsigned int num_points)
  {
-  unsigned int i;
-  float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
-
-  float accumulator;
-
-  float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec;
-  accumulator1_vec = vld1q_f32(zero);
-  accumulator2_vec = vld1q_f32(zero);
-  accumulator3_vec = vld1q_f32(zero);
-  accumulator4_vec = vld1q_f32(zero);
-  float32x4_t x_to_1, x_to_2, x_to_3, x_to_4;
-  float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3;
-
-  // load the cutoff in to a vector
-  cutoff_vector = vdupq_n_f32( *cutoff );
-  // ... center point array
-  cpa_0 = vdupq_n_f32(center_point_array[0]);
-  cpa_1 = vdupq_n_f32(center_point_array[1]);
-  cpa_2 = vdupq_n_f32(center_point_array[2]);
-  cpa_3 = vdupq_n_f32(center_point_array[3]);
-
-  // nathan is not sure why this is slower *and* wrong compared to neonvertfma
-  for(i=0; i < num_points/4; ++i) {
-    // load x
-    x_to_1 = vld1q_f32( src0 );
-
-    // Get a vector of max(src0, cutoff)
-    x_to_1 = vmaxq_f32(x_to_1,  cutoff_vector ); // x^1
-    x_to_2 = vmulq_f32(x_to_1, x_to_1); // x^2
-    x_to_3 = vmulq_f32(x_to_2, x_to_1); // x^3
-    x_to_4 = vmulq_f32(x_to_3, x_to_1); // x^4
-    x_to_1 = vmulq_f32(x_to_1, cpa_0);
-    x_to_2 = vmulq_f32(x_to_2, cpa_1);
-    x_to_3 = vmulq_f32(x_to_3, cpa_2);
-    x_to_4 = vmulq_f32(x_to_4, cpa_3);
-    accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1);
-    accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2);
-    accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3);
-    accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4);
-
-    src0 += 4;
-  }
-  accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec);
-  accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec);
-  accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec);
-
-  __VOLK_ATTR_ALIGNED(32) float res_accumulators[4];
-  vst1q_f32(res_accumulators, accumulator1_vec );
-  accumulator = res_accumulators[0] + res_accumulators[1] +
-          res_accumulators[2] + res_accumulators[3];
-
-  float fst = 0.0;
-  float sq = 0.0;
-  float thrd = 0.0;
-  float frth = 0.0;
-
-  for(i = 4*num_points/4; i < num_points; ++i) {
-    fst = src0[i];
-    fst = MAX(fst, *cutoff);
-
-    sq = fst * fst;
-    thrd = fst * sq;
-    frth = sq * sq;
-    //fith = sq * thrd;
-
-    accumulator += (center_point_array[0] * fst +
-                    center_point_array[1] * sq +
-                    center_point_array[2] * thrd +
-                    center_point_array[3] * frth); //+
-  }
-
-  *target = accumulator + (float)num_points * center_point_array[4];
+    unsigned int i;
+    float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
+
+    float accumulator;
+
+    float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec;
+    accumulator1_vec = vld1q_f32(zero);
+    accumulator2_vec = vld1q_f32(zero);
+    accumulator3_vec = vld1q_f32(zero);
+    accumulator4_vec = vld1q_f32(zero);
+    float32x4_t x_to_1, x_to_2, x_to_3, x_to_4;
+    float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3;
+
+    // load the cutoff in to a vector
+    cutoff_vector = vdupq_n_f32(*cutoff);
+    // ... center point array
+    cpa_0 = vdupq_n_f32(center_point_array[0]);
+    cpa_1 = vdupq_n_f32(center_point_array[1]);
+    cpa_2 = vdupq_n_f32(center_point_array[2]);
+    cpa_3 = vdupq_n_f32(center_point_array[3]);
+
+    // nathan is not sure why this is slower *and* wrong compared to neonvertfma
+    for (i = 0; i < num_points / 4; ++i) {
+        // load x
+        x_to_1 = vld1q_f32(src0);
+
+        // Get a vector of max(src0, cutoff)
+        x_to_1 = vmaxq_f32(x_to_1, cutoff_vector); // x^1
+        x_to_2 = vmulq_f32(x_to_1, x_to_1);        // x^2
+        x_to_3 = vmulq_f32(x_to_2, x_to_1);        // x^3
+        x_to_4 = vmulq_f32(x_to_3, x_to_1);        // x^4
+        x_to_1 = vmulq_f32(x_to_1, cpa_0);
+        x_to_2 = vmulq_f32(x_to_2, cpa_1);
+        x_to_3 = vmulq_f32(x_to_3, cpa_2);
+        x_to_4 = vmulq_f32(x_to_4, cpa_3);
+        accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1);
+        accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2);
+        accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3);
+        accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4);
+
+        src0 += 4;
+    }
+    accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec);
+    accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec);
+    accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec);
+
+    __VOLK_ATTR_ALIGNED(32) float res_accumulators[4];
+    vst1q_f32(res_accumulators, accumulator1_vec);
+    accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] +
+                  res_accumulators[3];
+
+    float fst = 0.0;
+    float sq = 0.0;
+    float thrd = 0.0;
+    float frth = 0.0;
+
+    for (i = 4 * num_points / 4; i < num_points; ++i) {
+        fst = src0[i];
+        fst = MAX(fst, *cutoff);
+
+        sq = fst * fst;
+        thrd = fst * sq;
+        frth = sq * sq;
+        // fith = sq * thrd;
+
+        accumulator += (center_point_array[0] * fst + center_point_array[1] * sq +
+                        center_point_array[2] * thrd + center_point_array[3] * frth); //+
+    }
+
+    *target = accumulator + (float)num_points * center_point_array[4];
  }
  
  #endif /* LV_HAVE_NEON */
@@ -510,150 +516,154 @@ volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target, float* __restrict
  #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H
  #define INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H
  
-#include<inttypes.h>
-#include<stdio.h>
-#include<volk/volk_complex.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
  
  #ifndef MAX
-#define MAX(X,Y) ((X) > (Y)?(X):(Y))
+#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
  #endif
  
  #if LV_HAVE_AVX && LV_HAVE_FMA
-#include<immintrin.h>
+#include <immintrin.h>
  
-static inline void
-volk_32f_x3_sum_of_poly_32f_u_avx_fma(float* target, float* src0, float* center_point_array,
-                                      float* cutoff, unsigned int num_points)
+static inline void volk_32f_x3_sum_of_poly_32f_u_avx_fma(float* target,
+                                                         float* src0,
+                                                         float* center_point_array,
+                                                         float* cutoff,
+                                                         unsigned int num_points)
  {
-  const unsigned int eighth_points = num_points / 8;
-  float fst  = 0.0;
-  float sq   = 0.0;
-  float thrd = 0.0;
-  float frth = 0.0;
-
-  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
-  __m256 target_vec;
-  __m256 x_to_1, x_to_2, x_to_3, x_to_4;
-
-  cpa0 = _mm256_set1_ps(center_point_array[0]);
-  cpa1 = _mm256_set1_ps(center_point_array[1]);
-  cpa2 = _mm256_set1_ps(center_point_array[2]);
-  cpa3 = _mm256_set1_ps(center_point_array[3]);
-  cutoff_vec = _mm256_set1_ps(*cutoff);
-  target_vec = _mm256_setzero_ps();
-
-  unsigned int i;
-
-  for(i = 0; i < eighth_points; ++i) {
-    x_to_1 = _mm256_loadu_ps(src0);
-    x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
-    x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
-    x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
-    // x^1 * x^3 is slightly faster than x^2 * x^2
-    x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
-
-    x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
-    x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
-
-    x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);
-    x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);
-    // this is slightly faster than result += (x_to_1 + x_to_3)
-    target_vec = _mm256_add_ps(x_to_1, target_vec);
-    target_vec = _mm256_add_ps(x_to_3, target_vec);
-
-    src0 += 8;
-  }
-
-  // the hadd for vector reduction has very very slight impact @ 50k iters
-  __VOLK_ATTR_ALIGNED(32) float temp_results[8];
-  target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
-  _mm256_storeu_ps(temp_results, target_vec);
-  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
-
-  for(i = eighth_points*8; i < num_points; ++i) {
-    fst  = *src0++;
-    fst  = MAX(fst, *cutoff);
-    sq   = fst * fst;
-    thrd = fst * sq;
-    frth = sq * sq;
-    *target += (center_point_array[0] * fst +
-                center_point_array[1] * sq +
-                center_point_array[2] * thrd +
-                center_point_array[3] * frth);
-  }
-
-  *target += (float)(num_points) * center_point_array[4];
+    const unsigned int eighth_points = num_points / 8;
+    float fst = 0.0;
+    float sq = 0.0;
+    float thrd = 0.0;
+    float frth = 0.0;
+
+    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
+    __m256 target_vec;
+    __m256 x_to_1, x_to_2, x_to_3, x_to_4;
+
+    cpa0 = _mm256_set1_ps(center_point_array[0]);
+    cpa1 = _mm256_set1_ps(center_point_array[1]);
+    cpa2 = _mm256_set1_ps(center_point_array[2]);
+    cpa3 = _mm256_set1_ps(center_point_array[3]);
+    cutoff_vec = _mm256_set1_ps(*cutoff);
+    target_vec = _mm256_setzero_ps();
+
+    unsigned int i;
+
+    for (i = 0; i < eighth_points; ++i) {
+        x_to_1 = _mm256_loadu_ps(src0);
+        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
+        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
+        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
+        // x^1 * x^3 is slightly faster than x^2 * x^2
+        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
+
+        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
+        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
+
+        x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);
+        x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);
+        // this is slightly faster than result += (x_to_1 + x_to_3)
+        target_vec = _mm256_add_ps(x_to_1, target_vec);
+        target_vec = _mm256_add_ps(x_to_3, target_vec);
+
+        src0 += 8;
+    }
+
+    // the hadd for vector reduction has very very slight impact @ 50k iters
+    __VOLK_ATTR_ALIGNED(32) float temp_results[8];
+    target_vec = _mm256_hadd_ps(
+        target_vec,
+        target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
+    _mm256_storeu_ps(temp_results, target_vec);
+    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
+
+    for (i = eighth_points * 8; i < num_points; ++i) {
+        fst = *src0++;
+        fst = MAX(fst, *cutoff);
+        sq = fst * fst;
+        thrd = fst * sq;
+        frth = sq * sq;
+        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
+                    center_point_array[2] * thrd + center_point_array[3] * frth);
+    }
+
+    *target += (float)(num_points)*center_point_array[4];
  }
  #endif // LV_HAVE_AVX && LV_HAVE_FMA
  
  #ifdef LV_HAVE_AVX
-#include<immintrin.h>
+#include <immintrin.h>
  
-static inline void
-volk_32f_x3_sum_of_poly_32f_u_avx(float* target, float* src0, float* center_point_array,
-                                  float* cutoff, unsigned int num_points)
+static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target,
+                                                     float* src0,
+                                                     float* center_point_array,
+                                                     float* cutoff,
+                                                     unsigned int num_points)
  {
-  const unsigned int eighth_points = num_points / 8;
-  float fst  = 0.0;
-  float sq   = 0.0;
-  float thrd = 0.0;
-  float frth = 0.0;
-
-  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
-  __m256 target_vec;
-  __m256 x_to_1, x_to_2, x_to_3, x_to_4;
-
-  cpa0 = _mm256_set1_ps(center_point_array[0]);
-  cpa1 = _mm256_set1_ps(center_point_array[1]);
-  cpa2 = _mm256_set1_ps(center_point_array[2]);
-  cpa3 = _mm256_set1_ps(center_point_array[3]);
-  cutoff_vec = _mm256_set1_ps(*cutoff);
-  target_vec = _mm256_setzero_ps();
-
-  unsigned int i;
-
-  for(i = 0; i < eighth_points; ++i) {
-    x_to_1 = _mm256_loadu_ps(src0);
-    x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
-    x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
-    x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
-    // x^1 * x^3 is slightly faster than x^2 * x^2
-    x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
-
-    x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
-    x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
-    x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
-    x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
-
-    x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
-    x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
-    // this is slightly faster than result += (x_to_1 + x_to_3)
-    target_vec = _mm256_add_ps(x_to_1, target_vec);
-    target_vec = _mm256_add_ps(x_to_3, target_vec);
-
-    src0 += 8;
-  }
-
-  // the hadd for vector reduction has very very slight impact @ 50k iters
-  __VOLK_ATTR_ALIGNED(32) float temp_results[8];
-  target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
-  _mm256_storeu_ps(temp_results, target_vec);
-  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
-
-  for(i = eighth_points*8; i < num_points; ++i) {
-    fst  = *src0++;
-    fst  = MAX(fst, *cutoff);
-    sq   = fst * fst;
-    thrd = fst * sq;
-    frth = sq * sq;
-
-    *target += (center_point_array[0] * fst +
-                center_point_array[1] * sq +
-                center_point_array[2] * thrd +
-                center_point_array[3] * frth);
-  }
-
-  *target += (float)(num_points) * center_point_array[4];
+    const unsigned int eighth_points = num_points / 8;
+    float fst = 0.0;
+    float sq = 0.0;
+    float thrd = 0.0;
+    float frth = 0.0;
+
+    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
+    __m256 target_vec;
+    __m256 x_to_1, x_to_2, x_to_3, x_to_4;
+
+    cpa0 = _mm256_set1_ps(center_point_array[0]);
+    cpa1 = _mm256_set1_ps(center_point_array[1]);
+    cpa2 = _mm256_set1_ps(center_point_array[2]);
+    cpa3 = _mm256_set1_ps(center_point_array[3]);
+    cutoff_vec = _mm256_set1_ps(*cutoff);
+    target_vec = _mm256_setzero_ps();
+
+    unsigned int i;
+
+    for (i = 0; i < eighth_points; ++i) {
+        x_to_1 = _mm256_loadu_ps(src0);
+        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
+        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
+        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
+        // x^1 * x^3 is slightly faster than x^2 * x^2
+        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
+
+        x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
+        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
+        x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
+        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
+
+        x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
+        x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
+        // this is slightly faster than result += (x_to_1 + x_to_3)
+        target_vec = _mm256_add_ps(x_to_1, target_vec);
+        target_vec = _mm256_add_ps(x_to_3, target_vec);
+
+        src0 += 8;
+    }
+
+    // the hadd for vector reduction has very very slight impact @ 50k iters
+    __VOLK_ATTR_ALIGNED(32) float temp_results[8];
+    target_vec = _mm256_hadd_ps(
+        target_vec,
+        target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
+    _mm256_storeu_ps(temp_results, target_vec);
+    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
+
+    for (i = eighth_points * 8; i < num_points; ++i) {
+        fst = *src0++;
+        fst = MAX(fst, *cutoff);
+        sq = fst * fst;
+        thrd = fst * sq;
+        frth = sq * sq;
+
+        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
+                    center_point_array[2] * thrd + center_point_array[3] * frth);
+    }
+
+    *target += (float)(num_points)*center_point_array[4];
  }
  #endif // LV_HAVE_AVX
  
diff --git a/kernels/volk/volk_32fc_32f_add_32fc.h b/kernels/volk/volk_32fc_32f_add_32fc.h

index 86a3818d6a38ca13eb1a81ee8a8a91c86b24e5ed..b25ca6a012db06184d3fde5baf9c07bad9f6cb43 100644 (file)
--- a/kernels/volk/volk_32fc_32f_add_32fc.h
+++ b/kernels/volk/volk_32fc_32f_add_32fc.h
@@ -31,8 +31,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32fc_32f_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points)
- * \endcode
+ * void volk_32fc_32f_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float*
+ * bVector, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li aVector: First vector of input points.
@@ -44,7 +44,8 @@
   *
   * \b Example
   *
- * The follow example adds the increasing and decreasing vectors such that the result of every summation pair is 10
+ * The follow example adds the increasing and decreasing vectors such that the result of
+ * every summation pair is 10
   *
   * \code
   *   int N = 10;
@@ -75,18 +76,19 @@
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32fc_32f_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                            const float* bVector, unsigned int num_points)
+static inline void volk_32fc_32f_add_32fc_generic(lv_32fc_t* cVector,
+                                                  const lv_32fc_t* aVector,
+                                                  const float* bVector,
+                                                  unsigned int num_points)
  {
-  lv_32fc_t* cPtr = cVector;
-  const lv_32fc_t* aPtr = aVector;
-  const float* bPtr=  bVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *cPtr++ = (*aPtr++) + (*bPtr++);
-  }
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    const float* bPtr = bVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -94,143 +96,150 @@ volk_32fc_32f_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32fc_32f_add_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                          const float* bVector, unsigned int num_points)
+static inline void volk_32fc_32f_add_32fc_u_avx(lv_32fc_t* cVector,
+                                                const lv_32fc_t* aVector,
+                                                const float* bVector,
+                                                unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-
-  lv_32fc_t* cPtr = cVector;
-  const lv_32fc_t* aPtr = aVector;
-  const float* bPtr=  bVector;
-
-  __m256 aVal1, aVal2, bVal, cVal1, cVal2;
-  __m256 cpx_b1, cpx_b2;
-  __m256 zero;
-  zero = _mm256_setzero_ps();
-  __m256 tmp1, tmp2;
-  for(;number < eighthPoints; number++){
-
-    aVal1 = _mm256_loadu_ps((float *) aPtr);
-    aVal2 = _mm256_loadu_ps((float *) (aPtr+4));
-    bVal = _mm256_loadu_ps(bPtr);
-    cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0
-    cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0
-
-    tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0+(0x2<<4));
-    tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1+(0x3<<4));
-
-    cVal1 = _mm256_add_ps(aVal1, tmp1);
-    cVal2 = _mm256_add_ps(aVal2, tmp2);
-
-    _mm256_storeu_ps((float *) cPtr, cVal1); // Store the results back into the C container
-    _mm256_storeu_ps((float *) (cPtr+4), cVal2); // Store the results back into the C container
-
-    aPtr += 8;
-    bPtr += 8;
-    cPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) + (*bPtr++);
-  }
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    const float* bPtr = bVector;
+
+    __m256 aVal1, aVal2, bVal, cVal1, cVal2;
+    __m256 cpx_b1, cpx_b2;
+    __m256 zero;
+    zero = _mm256_setzero_ps();
+    __m256 tmp1, tmp2;
+    for (; number < eighthPoints; number++) {
+
+        aVal1 = _mm256_loadu_ps((float*)aPtr);
+        aVal2 = _mm256_loadu_ps((float*)(aPtr + 4));
+        bVal = _mm256_loadu_ps(bPtr);
+        cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0
+        cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0
+
+        tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0 + (0x2 << 4));
+        tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1 + (0x3 << 4));
+
+        cVal1 = _mm256_add_ps(aVal1, tmp1);
+        cVal2 = _mm256_add_ps(aVal2, tmp2);
+
+        _mm256_storeu_ps((float*)cPtr,
+                         cVal1); // Store the results back into the C container
+        _mm256_storeu_ps((float*)(cPtr + 4),
+                         cVal2); // Store the results back into the C container
+
+        aPtr += 8;
+        bPtr += 8;
+        cPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32fc_32f_add_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                          const float* bVector, unsigned int num_points)
+static inline void volk_32fc_32f_add_32fc_a_avx(lv_32fc_t* cVector,
+                                                const lv_32fc_t* aVector,
+                                                const float* bVector,
+                                                unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-
-  lv_32fc_t* cPtr = cVector;
-  const lv_32fc_t* aPtr = aVector;
-  const float* bPtr=  bVector;
-
-  __m256 aVal1, aVal2, bVal, cVal1, cVal2;
-  __m256 cpx_b1, cpx_b2;
-  __m256 zero;
-  zero = _mm256_setzero_ps();
-  __m256 tmp1, tmp2;
-  for(;number < eighthPoints; number++){
-
-    aVal1 = _mm256_load_ps((float *) aPtr);
-    aVal2 = _mm256_load_ps((float *) (aPtr+4));
-    bVal = _mm256_load_ps(bPtr);
-    cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0
-    cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0
-
-    tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0+(0x2<<4));
-    tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1+(0x3<<4));
-
-    cVal1 = _mm256_add_ps(aVal1, tmp1);
-    cVal2 = _mm256_add_ps(aVal2, tmp2);
-
-    _mm256_store_ps((float *) cPtr, cVal1); // Store the results back into the C container
-    _mm256_store_ps((float *) (cPtr+4), cVal2); // Store the results back into the C container
-
-    aPtr += 8;
-    bPtr += 8;
-    cPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) + (*bPtr++);
-  }
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    const float* bPtr = bVector;
+
+    __m256 aVal1, aVal2, bVal, cVal1, cVal2;
+    __m256 cpx_b1, cpx_b2;
+    __m256 zero;
+    zero = _mm256_setzero_ps();
+    __m256 tmp1, tmp2;
+    for (; number < eighthPoints; number++) {
+
+        aVal1 = _mm256_load_ps((float*)aPtr);
+        aVal2 = _mm256_load_ps((float*)(aPtr + 4));
+        bVal = _mm256_load_ps(bPtr);
+        cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0
+        cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0
+
+        tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0 + (0x2 << 4));
+        tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1 + (0x3 << 4));
+
+        cVal1 = _mm256_add_ps(aVal1, tmp1);
+        cVal2 = _mm256_add_ps(aVal2, tmp2);
+
+        _mm256_store_ps((float*)cPtr,
+                        cVal1); // Store the results back into the C container
+        _mm256_store_ps((float*)(cPtr + 4),
+                        cVal2); // Store the results back into the C container
+
+        aPtr += 8;
+        bPtr += 8;
+        cPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void
-volk_32fc_32f_add_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                           const float* bVector, unsigned int num_points)
+static inline void volk_32fc_32f_add_32fc_neon(lv_32fc_t* cVector,
+                                               const lv_32fc_t* aVector,
+                                               const float* bVector,
+                                               unsigned int num_points)
  {
-  lv_32fc_t* cPtr = cVector;
-  const lv_32fc_t* aPtr = aVector;
-  const float* bPtr = bVector;
-
-  float32x4x4_t aVal0, aVal1;
-  float32x4x2_t bVal0, bVal1;
-
-  const unsigned int sixteenthPoints = num_points / 16;
-  unsigned int number = 0;
-  for(; number < sixteenthPoints; number++){
-    aVal0 = vld4q_f32((const float*)aPtr);
-    aPtr += 8;
-    aVal1 = vld4q_f32((const float*)aPtr);
-    aPtr += 8;
-    __VOLK_PREFETCH(aPtr+16);
-
-    bVal0 = vld2q_f32((const float*)bPtr);
-    bPtr += 8;
-    bVal1 = vld2q_f32((const float*)bPtr);
-    bPtr += 8;
-    __VOLK_PREFETCH(bPtr+16);
-
-    aVal0.val[0] = vaddq_f32(aVal0.val[0], bVal0.val[0]);
-    aVal0.val[2] = vaddq_f32(aVal0.val[2], bVal0.val[1]);
-
-    aVal1.val[2] = vaddq_f32(aVal1.val[2], bVal1.val[1]);
-    aVal1.val[0] = vaddq_f32(aVal1.val[0], bVal1.val[0]);
-
-    vst4q_f32((float*)(cPtr), aVal0);
-    cPtr += 8;
-    vst4q_f32((float*)(cPtr), aVal1);
-    cPtr += 8;
-  }
-
-  for(number = sixteenthPoints * 16; number < num_points; number++){
-    *cPtr++ = (*aPtr++) + (*bPtr++);
-  }
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    const float* bPtr = bVector;
+
+    float32x4x4_t aVal0, aVal1;
+    float32x4x2_t bVal0, bVal1;
+
+    const unsigned int sixteenthPoints = num_points / 16;
+    unsigned int number = 0;
+    for (; number < sixteenthPoints; number++) {
+        aVal0 = vld4q_f32((const float*)aPtr);
+        aPtr += 8;
+        aVal1 = vld4q_f32((const float*)aPtr);
+        aPtr += 8;
+        __VOLK_PREFETCH(aPtr + 16);
+
+        bVal0 = vld2q_f32((const float*)bPtr);
+        bPtr += 8;
+        bVal1 = vld2q_f32((const float*)bPtr);
+        bPtr += 8;
+        __VOLK_PREFETCH(bPtr + 16);
+
+        aVal0.val[0] = vaddq_f32(aVal0.val[0], bVal0.val[0]);
+        aVal0.val[2] = vaddq_f32(aVal0.val[2], bVal0.val[1]);
+
+        aVal1.val[2] = vaddq_f32(aVal1.val[2], bVal1.val[1]);
+        aVal1.val[0] = vaddq_f32(aVal1.val[0], bVal1.val[0]);
+
+        vst4q_f32((float*)(cPtr), aVal0);
+        cPtr += 8;
+        vst4q_f32((float*)(cPtr), aVal1);
+        cPtr += 8;
+    }
+
+    for (number = sixteenthPoints * 16; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_NEON */
  
diff --git a/kernels/volk/volk_32fc_32f_dot_prod_32fc.h b/kernels/volk/volk_32fc_32f_dot_prod_32fc.h

index 35f70777c72d2a9dcf12bce476829c064ba00d85..d9058708682d2288e0c4e1e09840fdbc278ec887 100644 (file)
--- a/kernels/volk/volk_32fc_32f_dot_prod_32fc.h
+++ b/kernels/volk/volk_32fc_32f_dot_prod_32fc.h
@@ -33,8 +33,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32fc_32f_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points)
- * \endcode
+ * void volk_32fc_32f_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const float
+ * * taps, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li input: vector of complex samples
@@ -63,28 +63,32 @@
  #ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
  #define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
  
-#include <volk/volk_common.h>
  #include <stdio.h>
+#include <volk/volk_common.h>
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points) {
+static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result,
+                                                       const lv_32fc_t* input,
+                                                       const float* taps,
+                                                       unsigned int num_points)
+{
  
-  float res[2];
-  float *realpt = &res[0], *imagpt = &res[1];
-  const float* aPtr = (float*)input;
-  const float* bPtr=  taps;
-  unsigned int number = 0;
+    float res[2];
+    float *realpt = &res[0], *imagpt = &res[1];
+    const float* aPtr = (float*)input;
+    const float* bPtr = taps;
+    unsigned int number = 0;
  
-  *realpt = 0;
-  *imagpt = 0;
+    *realpt = 0;
+    *imagpt = 0;
  
-  for(number = 0; number < num_points; number++){
-    *realpt += ((*aPtr++) * (*bPtr));
-    *imagpt += ((*aPtr++) * (*bPtr++));
-  }
+    for (number = 0; number < num_points; number++) {
+        *realpt += ((*aPtr++) * (*bPtr));
+        *imagpt += ((*aPtr++) * (*bPtr++));
+    }
  
-  *result = *(lv_32fc_t*)(&res[0]);
+    *result = *(lv_32fc_t*)(&res[0]);
  }
  
  #endif /*LV_HAVE_GENERIC*/
@@ -93,78 +97,83 @@ static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result, const
  
  #include <immintrin.h>
  
-static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
-
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float res[2];
-  float *realpt = &res[0], *imagpt = &res[1];
-  const float* aPtr = (float*)input;
-  const float* bPtr = taps;
-
-  __m256 a0Val, a1Val, a2Val, a3Val;
-  __m256 b0Val, b1Val, b2Val, b3Val;
-  __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
-
-  __m256 dotProdVal0 = _mm256_setzero_ps();
-  __m256 dotProdVal1 = _mm256_setzero_ps();
-  __m256 dotProdVal2 = _mm256_setzero_ps();
-  __m256 dotProdVal3 = _mm256_setzero_ps();
-
-  for(;number < sixteenthPoints; number++){
-
-    a0Val = _mm256_load_ps(aPtr);
-    a1Val = _mm256_load_ps(aPtr+8);
-    a2Val = _mm256_load_ps(aPtr+16);
-    a3Val = _mm256_load_ps(aPtr+24);
-
-    x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
-    x1Val = _mm256_load_ps(bPtr+8);
-    x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
-    x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
-    x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
-    x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
-
-    // TODO: it may be possible to rearrange swizzling to better pipeline data
-    b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
-    b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
-    b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
-    b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
-
-    dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
-    dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
-    dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
-    dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
-
-    aPtr += 32;
-    bPtr += 16;
-  }
-
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
-
-  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
-
-  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
-  *realpt = dotProductVector[0];
-  *imagpt = dotProductVector[1];
-  *realpt += dotProductVector[2];
-  *imagpt += dotProductVector[3];
-  *realpt += dotProductVector[4];
-  *imagpt += dotProductVector[5];
-  *realpt += dotProductVector[6];
-  *imagpt += dotProductVector[7];
-
-  number = sixteenthPoints*16;
-  for(;number < num_points; number++){
-    *realpt += ((*aPtr++) * (*bPtr));
-    *imagpt += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = *(lv_32fc_t*)(&res[0]);
+static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
+                                                          const lv_32fc_t* input,
+                                                          const float* taps,
+                                                          unsigned int num_points)
+{
+
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    float res[2];
+    float *realpt = &res[0], *imagpt = &res[1];
+    const float* aPtr = (float*)input;
+    const float* bPtr = taps;
+
+    __m256 a0Val, a1Val, a2Val, a3Val;
+    __m256 b0Val, b1Val, b2Val, b3Val;
+    __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
+
+    __m256 dotProdVal0 = _mm256_setzero_ps();
+    __m256 dotProdVal1 = _mm256_setzero_ps();
+    __m256 dotProdVal2 = _mm256_setzero_ps();
+    __m256 dotProdVal3 = _mm256_setzero_ps();
+
+    for (; number < sixteenthPoints; number++) {
+
+        a0Val = _mm256_load_ps(aPtr);
+        a1Val = _mm256_load_ps(aPtr + 8);
+        a2Val = _mm256_load_ps(aPtr + 16);
+        a3Val = _mm256_load_ps(aPtr + 24);
+
+        x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
+        x1Val = _mm256_load_ps(bPtr + 8);
+        x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
+        x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
+        x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
+        x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
+
+        // TODO: it may be possible to rearrange swizzling to better pipeline data
+        b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
+        b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
+        b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
+        b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
+
+        dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
+        dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
+        dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
+        dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
+
+        aPtr += 32;
+        bPtr += 16;
+    }
+
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+    _mm256_store_ps(dotProductVector,
+                    dotProdVal0); // Store the results back into the dot product vector
+
+    *realpt = dotProductVector[0];
+    *imagpt = dotProductVector[1];
+    *realpt += dotProductVector[2];
+    *imagpt += dotProductVector[3];
+    *realpt += dotProductVector[4];
+    *imagpt += dotProductVector[5];
+    *realpt += dotProductVector[6];
+    *imagpt += dotProductVector[7];
+
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        *realpt += ((*aPtr++) * (*bPtr));
+        *imagpt += ((*aPtr++) * (*bPtr++));
+    }
+
+    *result = *(lv_32fc_t*)(&res[0]);
  }
  
  #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
@@ -173,164 +182,172 @@ static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma( lv_32fc_t* result, co
  
  #include <immintrin.h>
  
-static inline void volk_32fc_32f_dot_prod_32fc_a_avx( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
-
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float res[2];
-  float *realpt = &res[0], *imagpt = &res[1];
-  const float* aPtr = (float*)input;
-  const float* bPtr = taps;
-
-  __m256 a0Val, a1Val, a2Val, a3Val;
-  __m256 b0Val, b1Val, b2Val, b3Val;
-  __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
-  __m256 c0Val, c1Val, c2Val, c3Val;
-
-  __m256 dotProdVal0 = _mm256_setzero_ps();
-  __m256 dotProdVal1 = _mm256_setzero_ps();
-  __m256 dotProdVal2 = _mm256_setzero_ps();
-  __m256 dotProdVal3 = _mm256_setzero_ps();
-
-  for(;number < sixteenthPoints; number++){
-
-    a0Val = _mm256_load_ps(aPtr);
-    a1Val = _mm256_load_ps(aPtr+8);
-    a2Val = _mm256_load_ps(aPtr+16);
-    a3Val = _mm256_load_ps(aPtr+24);
-
-    x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
-    x1Val = _mm256_load_ps(bPtr+8);
-    x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
-    x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
-    x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
-    x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
-
-    // TODO: it may be possible to rearrange swizzling to better pipeline data
-    b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
-    b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
-    b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
-    b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
-
-    c0Val = _mm256_mul_ps(a0Val, b0Val);
-    c1Val = _mm256_mul_ps(a1Val, b1Val);
-    c2Val = _mm256_mul_ps(a2Val, b2Val);
-    c3Val = _mm256_mul_ps(a3Val, b3Val);
-
-    dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
-    dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
-    dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
-    dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
-
-    aPtr += 32;
-    bPtr += 16;
-  }
-
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
-
-  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
-
-  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
-  *realpt = dotProductVector[0];
-  *imagpt = dotProductVector[1];
-  *realpt += dotProductVector[2];
-  *imagpt += dotProductVector[3];
-  *realpt += dotProductVector[4];
-  *imagpt += dotProductVector[5];
-  *realpt += dotProductVector[6];
-  *imagpt += dotProductVector[7];
-
-  number = sixteenthPoints*16;
-  for(;number < num_points; number++){
-    *realpt += ((*aPtr++) * (*bPtr));
-    *imagpt += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = *(lv_32fc_t*)(&res[0]);
+static inline void volk_32fc_32f_dot_prod_32fc_a_avx(lv_32fc_t* result,
+                                                     const lv_32fc_t* input,
+                                                     const float* taps,
+                                                     unsigned int num_points)
+{
+
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    float res[2];
+    float *realpt = &res[0], *imagpt = &res[1];
+    const float* aPtr = (float*)input;
+    const float* bPtr = taps;
+
+    __m256 a0Val, a1Val, a2Val, a3Val;
+    __m256 b0Val, b1Val, b2Val, b3Val;
+    __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
+    __m256 c0Val, c1Val, c2Val, c3Val;
+
+    __m256 dotProdVal0 = _mm256_setzero_ps();
+    __m256 dotProdVal1 = _mm256_setzero_ps();
+    __m256 dotProdVal2 = _mm256_setzero_ps();
+    __m256 dotProdVal3 = _mm256_setzero_ps();
+
+    for (; number < sixteenthPoints; number++) {
+
+        a0Val = _mm256_load_ps(aPtr);
+        a1Val = _mm256_load_ps(aPtr + 8);
+        a2Val = _mm256_load_ps(aPtr + 16);
+        a3Val = _mm256_load_ps(aPtr + 24);
+
+        x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
+        x1Val = _mm256_load_ps(bPtr + 8);
+        x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
+        x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
+        x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
+        x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
+
+        // TODO: it may be possible to rearrange swizzling to better pipeline data
+        b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
+        b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
+        b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
+        b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
+
+        c0Val = _mm256_mul_ps(a0Val, b0Val);
+        c1Val = _mm256_mul_ps(a1Val, b1Val);
+        c2Val = _mm256_mul_ps(a2Val, b2Val);
+        c3Val = _mm256_mul_ps(a3Val, b3Val);
+
+        dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+        dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+        dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
+        dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
+
+        aPtr += 32;
+        bPtr += 16;
+    }
+
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+    _mm256_store_ps(dotProductVector,
+                    dotProdVal0); // Store the results back into the dot product vector
+
+    *realpt = dotProductVector[0];
+    *imagpt = dotProductVector[1];
+    *realpt += dotProductVector[2];
+    *imagpt += dotProductVector[3];
+    *realpt += dotProductVector[4];
+    *imagpt += dotProductVector[5];
+    *realpt += dotProductVector[6];
+    *imagpt += dotProductVector[7];
+
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        *realpt += ((*aPtr++) * (*bPtr));
+        *imagpt += ((*aPtr++) * (*bPtr++));
+    }
+
+    *result = *(lv_32fc_t*)(&res[0]);
  }
  
  #endif /*LV_HAVE_AVX*/
  
  
-
-
  #ifdef LV_HAVE_SSE
  
  
-static inline void volk_32fc_32f_dot_prod_32fc_a_sse( lv_32fc_t* result, const  lv_32fc_t* input, const  float* taps, unsigned int num_points) {
-
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 8;
-
-  float res[2];
-  float *realpt = &res[0], *imagpt = &res[1];
-  const float* aPtr = (float*)input;
-  const float* bPtr = taps;
-
-  __m128 a0Val, a1Val, a2Val, a3Val;
-  __m128 b0Val, b1Val, b2Val, b3Val;
-  __m128 x0Val, x1Val, x2Val, x3Val;
-  __m128 c0Val, c1Val, c2Val, c3Val;
-
-  __m128 dotProdVal0 = _mm_setzero_ps();
-  __m128 dotProdVal1 = _mm_setzero_ps();
-  __m128 dotProdVal2 = _mm_setzero_ps();
-  __m128 dotProdVal3 = _mm_setzero_ps();
-
-  for(;number < sixteenthPoints; number++){
-
-    a0Val = _mm_load_ps(aPtr);
-    a1Val = _mm_load_ps(aPtr+4);
-    a2Val = _mm_load_ps(aPtr+8);
-    a3Val = _mm_load_ps(aPtr+12);
-
-    x0Val = _mm_load_ps(bPtr);
-    x1Val = _mm_load_ps(bPtr);
-    x2Val = _mm_load_ps(bPtr+4);
-    x3Val = _mm_load_ps(bPtr+4);
-    b0Val = _mm_unpacklo_ps(x0Val, x1Val);
-    b1Val = _mm_unpackhi_ps(x0Val, x1Val);
-    b2Val = _mm_unpacklo_ps(x2Val, x3Val);
-    b3Val = _mm_unpackhi_ps(x2Val, x3Val);
-
-    c0Val = _mm_mul_ps(a0Val, b0Val);
-    c1Val = _mm_mul_ps(a1Val, b1Val);
-    c2Val = _mm_mul_ps(a2Val, b2Val);
-    c3Val = _mm_mul_ps(a3Val, b3Val);
-
-    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
-    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
-    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
-    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
-
-    aPtr += 16;
-    bPtr += 8;
-  }
-
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
-
-  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
-
-  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
-  *realpt = dotProductVector[0];
-  *imagpt = dotProductVector[1];
-  *realpt += dotProductVector[2];
-  *imagpt += dotProductVector[3];
-
-  number = sixteenthPoints*8;
-  for(;number < num_points; number++){
-    *realpt += ((*aPtr++) * (*bPtr));
-    *imagpt += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = *(lv_32fc_t*)(&res[0]);
+static inline void volk_32fc_32f_dot_prod_32fc_a_sse(lv_32fc_t* result,
+                                                     const lv_32fc_t* input,
+                                                     const float* taps,
+                                                     unsigned int num_points)
+{
+
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 8;
+
+    float res[2];
+    float *realpt = &res[0], *imagpt = &res[1];
+    const float* aPtr = (float*)input;
+    const float* bPtr = taps;
+
+    __m128 a0Val, a1Val, a2Val, a3Val;
+    __m128 b0Val, b1Val, b2Val, b3Val;
+    __m128 x0Val, x1Val, x2Val, x3Val;
+    __m128 c0Val, c1Val, c2Val, c3Val;
+
+    __m128 dotProdVal0 = _mm_setzero_ps();
+    __m128 dotProdVal1 = _mm_setzero_ps();
+    __m128 dotProdVal2 = _mm_setzero_ps();
+    __m128 dotProdVal3 = _mm_setzero_ps();
+
+    for (; number < sixteenthPoints; number++) {
+
+        a0Val = _mm_load_ps(aPtr);
+        a1Val = _mm_load_ps(aPtr + 4);
+        a2Val = _mm_load_ps(aPtr + 8);
+        a3Val = _mm_load_ps(aPtr + 12);
+
+        x0Val = _mm_load_ps(bPtr);
+        x1Val = _mm_load_ps(bPtr);
+        x2Val = _mm_load_ps(bPtr + 4);
+        x3Val = _mm_load_ps(bPtr + 4);
+        b0Val = _mm_unpacklo_ps(x0Val, x1Val);
+        b1Val = _mm_unpackhi_ps(x0Val, x1Val);
+        b2Val = _mm_unpacklo_ps(x2Val, x3Val);
+        b3Val = _mm_unpackhi_ps(x2Val, x3Val);
+
+        c0Val = _mm_mul_ps(a0Val, b0Val);
+        c1Val = _mm_mul_ps(a1Val, b1Val);
+        c2Val = _mm_mul_ps(a2Val, b2Val);
+        c3Val = _mm_mul_ps(a3Val, b3Val);
+
+        dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+        dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+        dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+        dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+        aPtr += 16;
+        bPtr += 8;
+    }
+
+    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+    _mm_store_ps(dotProductVector,
+                 dotProdVal0); // Store the results back into the dot product vector
+
+    *realpt = dotProductVector[0];
+    *imagpt = dotProductVector[1];
+    *realpt += dotProductVector[2];
+    *imagpt += dotProductVector[3];
+
+    number = sixteenthPoints * 8;
+    for (; number < num_points; number++) {
+        *realpt += ((*aPtr++) * (*bPtr));
+        *imagpt += ((*aPtr++) * (*bPtr++));
+    }
+
+    *result = *(lv_32fc_t*)(&res[0]);
  }
  
  #endif /*LV_HAVE_SSE*/
@@ -339,78 +356,83 @@ static inline void volk_32fc_32f_dot_prod_32fc_a_sse( lv_32fc_t* result, const
  
  #include <immintrin.h>
  
-static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
-
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float res[2];
-  float *realpt = &res[0], *imagpt = &res[1];
-  const float* aPtr = (float*)input;
-  const float* bPtr = taps;
-
-  __m256 a0Val, a1Val, a2Val, a3Val;
-  __m256 b0Val, b1Val, b2Val, b3Val;
-  __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
-
-  __m256 dotProdVal0 = _mm256_setzero_ps();
-  __m256 dotProdVal1 = _mm256_setzero_ps();
-  __m256 dotProdVal2 = _mm256_setzero_ps();
-  __m256 dotProdVal3 = _mm256_setzero_ps();
-
-  for(;number < sixteenthPoints; number++){
-
-    a0Val = _mm256_loadu_ps(aPtr);
-    a1Val = _mm256_loadu_ps(aPtr+8);
-    a2Val = _mm256_loadu_ps(aPtr+16);
-    a3Val = _mm256_loadu_ps(aPtr+24);
-
-    x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
-    x1Val = _mm256_load_ps(bPtr+8);
-    x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
-    x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
-    x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
-    x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
-
-    // TODO: it may be possible to rearrange swizzling to better pipeline data
-    b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
-    b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
-    b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
-    b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
-
-    dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
-    dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
-    dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
-    dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
-
-    aPtr += 32;
-    bPtr += 16;
-  }
-
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
-
-  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
-
-  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
-  *realpt = dotProductVector[0];
-  *imagpt = dotProductVector[1];
-  *realpt += dotProductVector[2];
-  *imagpt += dotProductVector[3];
-  *realpt += dotProductVector[4];
-  *imagpt += dotProductVector[5];
-  *realpt += dotProductVector[6];
-  *imagpt += dotProductVector[7];
-
-  number = sixteenthPoints*16;
-  for(;number < num_points; number++){
-    *realpt += ((*aPtr++) * (*bPtr));
-    *imagpt += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = *(lv_32fc_t*)(&res[0]);
+static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result,
+                                                          const lv_32fc_t* input,
+                                                          const float* taps,
+                                                          unsigned int num_points)
+{
+
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    float res[2];
+    float *realpt = &res[0], *imagpt = &res[1];
+    const float* aPtr = (float*)input;
+    const float* bPtr = taps;
+
+    __m256 a0Val, a1Val, a2Val, a3Val;
+    __m256 b0Val, b1Val, b2Val, b3Val;
+    __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
+
+    __m256 dotProdVal0 = _mm256_setzero_ps();
+    __m256 dotProdVal1 = _mm256_setzero_ps();
+    __m256 dotProdVal2 = _mm256_setzero_ps();
+    __m256 dotProdVal3 = _mm256_setzero_ps();
+
+    for (; number < sixteenthPoints; number++) {
+
+        a0Val = _mm256_loadu_ps(aPtr);
+        a1Val = _mm256_loadu_ps(aPtr + 8);
+        a2Val = _mm256_loadu_ps(aPtr + 16);
+        a3Val = _mm256_loadu_ps(aPtr + 24);
+
+        x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
+        x1Val = _mm256_load_ps(bPtr + 8);
+        x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
+        x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
+        x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
+        x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
+
+        // TODO: it may be possible to rearrange swizzling to better pipeline data
+        b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
+        b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
+        b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
+        b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
+
+        dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
+        dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
+        dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
+        dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
+
+        aPtr += 32;
+        bPtr += 16;
+    }
+
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+    _mm256_store_ps(dotProductVector,
+                    dotProdVal0); // Store the results back into the dot product vector
+
+    *realpt = dotProductVector[0];
+    *imagpt = dotProductVector[1];
+    *realpt += dotProductVector[2];
+    *imagpt += dotProductVector[3];
+    *realpt += dotProductVector[4];
+    *imagpt += dotProductVector[5];
+    *realpt += dotProductVector[6];
+    *imagpt += dotProductVector[7];
+
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        *realpt += ((*aPtr++) * (*bPtr));
+        *imagpt += ((*aPtr++) * (*bPtr++));
+    }
+
+    *result = *(lv_32fc_t*)(&res[0]);
  }
  
  #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
@@ -419,162 +441,172 @@ static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma( lv_32fc_t* result, co
  
  #include <immintrin.h>
  
-static inline void volk_32fc_32f_dot_prod_32fc_u_avx( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
-
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float res[2];
-  float *realpt = &res[0], *imagpt = &res[1];
-  const float* aPtr = (float*)input;
-  const float* bPtr = taps;
-
-  __m256 a0Val, a1Val, a2Val, a3Val;
-  __m256 b0Val, b1Val, b2Val, b3Val;
-  __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
-  __m256 c0Val, c1Val, c2Val, c3Val;
-
-  __m256 dotProdVal0 = _mm256_setzero_ps();
-  __m256 dotProdVal1 = _mm256_setzero_ps();
-  __m256 dotProdVal2 = _mm256_setzero_ps();
-  __m256 dotProdVal3 = _mm256_setzero_ps();
-
-  for(;number < sixteenthPoints; number++){
-
-    a0Val = _mm256_loadu_ps(aPtr);
-    a1Val = _mm256_loadu_ps(aPtr+8);
-    a2Val = _mm256_loadu_ps(aPtr+16);
-    a3Val = _mm256_loadu_ps(aPtr+24);
-
-    x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
-    x1Val = _mm256_loadu_ps(bPtr+8);
-    x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
-    x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
-    x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
-    x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
-
-    // TODO: it may be possible to rearrange swizzling to better pipeline data
-    b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
-    b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
-    b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
-    b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
-
-    c0Val = _mm256_mul_ps(a0Val, b0Val);
-    c1Val = _mm256_mul_ps(a1Val, b1Val);
-    c2Val = _mm256_mul_ps(a2Val, b2Val);
-    c3Val = _mm256_mul_ps(a3Val, b3Val);
-
-    dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
-    dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
-    dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
-    dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
-
-    aPtr += 32;
-    bPtr += 16;
-  }
-
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
-
-  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
-
-  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
-  *realpt = dotProductVector[0];
-  *imagpt = dotProductVector[1];
-  *realpt += dotProductVector[2];
-  *imagpt += dotProductVector[3];
-  *realpt += dotProductVector[4];
-  *imagpt += dotProductVector[5];
-  *realpt += dotProductVector[6];
-  *imagpt += dotProductVector[7];
-
-  number = sixteenthPoints*16;
-  for(;number < num_points; number++){
-    *realpt += ((*aPtr++) * (*bPtr));
-    *imagpt += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = *(lv_32fc_t*)(&res[0]);
+static inline void volk_32fc_32f_dot_prod_32fc_u_avx(lv_32fc_t* result,
+                                                     const lv_32fc_t* input,
+                                                     const float* taps,
+                                                     unsigned int num_points)
+{
+
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    float res[2];
+    float *realpt = &res[0], *imagpt = &res[1];
+    const float* aPtr = (float*)input;
+    const float* bPtr = taps;
+
+    __m256 a0Val, a1Val, a2Val, a3Val;
+    __m256 b0Val, b1Val, b2Val, b3Val;
+    __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
+    __m256 c0Val, c1Val, c2Val, c3Val;
+
+    __m256 dotProdVal0 = _mm256_setzero_ps();
+    __m256 dotProdVal1 = _mm256_setzero_ps();
+    __m256 dotProdVal2 = _mm256_setzero_ps();
+    __m256 dotProdVal3 = _mm256_setzero_ps();
+
+    for (; number < sixteenthPoints; number++) {
+
+        a0Val = _mm256_loadu_ps(aPtr);
+        a1Val = _mm256_loadu_ps(aPtr + 8);
+        a2Val = _mm256_loadu_ps(aPtr + 16);
+        a3Val = _mm256_loadu_ps(aPtr + 24);
+
+        x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
+        x1Val = _mm256_loadu_ps(bPtr + 8);
+        x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
+        x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
+        x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
+        x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
+
+        // TODO: it may be possible to rearrange swizzling to better pipeline data
+        b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
+        b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
+        b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
+        b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
+
+        c0Val = _mm256_mul_ps(a0Val, b0Val);
+        c1Val = _mm256_mul_ps(a1Val, b1Val);
+        c2Val = _mm256_mul_ps(a2Val, b2Val);
+        c3Val = _mm256_mul_ps(a3Val, b3Val);
+
+        dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+        dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+        dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
+        dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
+
+        aPtr += 32;
+        bPtr += 16;
+    }
+
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+    _mm256_store_ps(dotProductVector,
+                    dotProdVal0); // Store the results back into the dot product vector
+
+    *realpt = dotProductVector[0];
+    *imagpt = dotProductVector[1];
+    *realpt += dotProductVector[2];
+    *imagpt += dotProductVector[3];
+    *realpt += dotProductVector[4];
+    *imagpt += dotProductVector[5];
+    *realpt += dotProductVector[6];
+    *imagpt += dotProductVector[7];
+
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        *realpt += ((*aPtr++) * (*bPtr));
+        *imagpt += ((*aPtr++) * (*bPtr++));
+    }
+
+    *result = *(lv_32fc_t*)(&res[0]);
  }
  #endif /*LV_HAVE_AVX*/
  
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void volk_32fc_32f_dot_prod_32fc_neon_unroll ( lv_32fc_t* __restrict result, const  lv_32fc_t* __restrict input, const  float* __restrict taps, unsigned int num_points) {
-
-   unsigned int number;
-   const unsigned int quarterPoints = num_points / 8;
-
-   float res[2];
-   float *realpt = &res[0], *imagpt = &res[1];
-   const float* inputPtr = (float*)input;
-   const float* tapsPtr = taps;
-   float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
-   float accVector_real[4];
-   float accVector_imag[4];
-
-   float32x4x2_t  inputVector0, inputVector1;
-   float32x4_t  tapsVector0, tapsVector1;
-   float32x4_t  tmp_real0, tmp_imag0;
-   float32x4_t  tmp_real1, tmp_imag1;
-   float32x4_t real_accumulator0, imag_accumulator0;
-   float32x4_t real_accumulator1, imag_accumulator1;
-
-   // zero out accumulators
-   // take a *float, return float32x4_t
-   real_accumulator0 = vld1q_f32( zero );
-   imag_accumulator0 = vld1q_f32( zero );
-   real_accumulator1 = vld1q_f32( zero );
-   imag_accumulator1 = vld1q_f32( zero );
-
-   for(number=0 ;number < quarterPoints; number++){
-      // load doublewords and duplicate in to second lane
-      tapsVector0 = vld1q_f32(tapsPtr );
-      tapsVector1 = vld1q_f32(tapsPtr+4 );
-
-      // load quadword of complex numbers in to 2 lanes. 1st lane is real, 2dn imag
-      inputVector0 = vld2q_f32(inputPtr );
-      inputVector1 = vld2q_f32(inputPtr+8 );
-      // inputVector is now a struct of two vectors, 0th is real, 1st is imag
-
-      tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]);
-      tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]);
-
-      tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]);
-      tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]);
-
-      real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0);
-      imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0);
-
-      real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1);
-      imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1);
-
-      tapsPtr += 8;
-      inputPtr += 16;
-   }
-
-   real_accumulator0 = vaddq_f32( real_accumulator0, real_accumulator1);
-   imag_accumulator0 = vaddq_f32( imag_accumulator0, imag_accumulator1);
-   // void vst1q_f32( float32_t * ptr, float32x4_t val);
-   // store results back to a complex (array of 2 floats)
-   vst1q_f32(accVector_real, real_accumulator0);
-   vst1q_f32(accVector_imag, imag_accumulator0);
-   *realpt = accVector_real[0] + accVector_real[1] +
-             accVector_real[2] + accVector_real[3] ;
-
-   *imagpt = accVector_imag[0] + accVector_imag[1] +
-             accVector_imag[2] + accVector_imag[3] ;
-
-  // clean up the remainder
-  for(number=quarterPoints*8; number < num_points; number++){
-    *realpt += ((*inputPtr++) * (*tapsPtr));
-    *imagpt += ((*inputPtr++) * (*tapsPtr++));
-  }
-
-  *result = *(lv_32fc_t*)(&res[0]);
+static inline void
+volk_32fc_32f_dot_prod_32fc_neon_unroll(lv_32fc_t* __restrict result,
+                                        const lv_32fc_t* __restrict input,
+                                        const float* __restrict taps,
+                                        unsigned int num_points)
+{
+
+    unsigned int number;
+    const unsigned int quarterPoints = num_points / 8;
+
+    float res[2];
+    float *realpt = &res[0], *imagpt = &res[1];
+    const float* inputPtr = (float*)input;
+    const float* tapsPtr = taps;
+    float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
+    float accVector_real[4];
+    float accVector_imag[4];
+
+    float32x4x2_t inputVector0, inputVector1;
+    float32x4_t tapsVector0, tapsVector1;
+    float32x4_t tmp_real0, tmp_imag0;
+    float32x4_t tmp_real1, tmp_imag1;
+    float32x4_t real_accumulator0, imag_accumulator0;
+    float32x4_t real_accumulator1, imag_accumulator1;
+
+    // zero out accumulators
+    // take a *float, return float32x4_t
+    real_accumulator0 = vld1q_f32(zero);
+    imag_accumulator0 = vld1q_f32(zero);
+    real_accumulator1 = vld1q_f32(zero);
+    imag_accumulator1 = vld1q_f32(zero);
+
+    for (number = 0; number < quarterPoints; number++) {
+        // load doublewords and duplicate in to second lane
+        tapsVector0 = vld1q_f32(tapsPtr);
+        tapsVector1 = vld1q_f32(tapsPtr + 4);
+
+        // load quadword of complex numbers in to 2 lanes. 1st lane is real, 2dn imag
+        inputVector0 = vld2q_f32(inputPtr);
+        inputVector1 = vld2q_f32(inputPtr + 8);
+        // inputVector is now a struct of two vectors, 0th is real, 1st is imag
+
+        tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]);
+        tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]);
+
+        tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]);
+        tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]);
+
+        real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0);
+        imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0);
+
+        real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1);
+        imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1);
+
+        tapsPtr += 8;
+        inputPtr += 16;
+    }
+
+    real_accumulator0 = vaddq_f32(real_accumulator0, real_accumulator1);
+    imag_accumulator0 = vaddq_f32(imag_accumulator0, imag_accumulator1);
+    // void vst1q_f32( float32_t * ptr, float32x4_t val);
+    // store results back to a complex (array of 2 floats)
+    vst1q_f32(accVector_real, real_accumulator0);
+    vst1q_f32(accVector_imag, imag_accumulator0);
+    *realpt =
+        accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3];
+
+    *imagpt =
+        accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3];
+
+    // clean up the remainder
+    for (number = quarterPoints * 8; number < num_points; number++) {
+        *realpt += ((*inputPtr++) * (*tapsPtr));
+        *imagpt += ((*inputPtr++) * (*tapsPtr++));
+    }
+
+    *result = *(lv_32fc_t*)(&res[0]);
  }
  
  #endif /*LV_HAVE_NEON*/
@@ -582,154 +614,171 @@ static inline void volk_32fc_32f_dot_prod_32fc_neon_unroll ( lv_32fc_t* __restri
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void volk_32fc_32f_dot_prod_32fc_a_neon ( lv_32fc_t* __restrict result, const  lv_32fc_t* __restrict input, const  float* __restrict taps, unsigned int num_points) {
-
-   unsigned int number;
-   const unsigned int quarterPoints = num_points / 4;
+static inline void volk_32fc_32f_dot_prod_32fc_a_neon(lv_32fc_t* __restrict result,
+                                                      const lv_32fc_t* __restrict input,
+                                                      const float* __restrict taps,
+                                                      unsigned int num_points)
+{
  
-   float res[2];
-   float *realpt = &res[0], *imagpt = &res[1];
-   const float* inputPtr = (float*)input;
-   const float* tapsPtr = taps;
-   float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
-   float accVector_real[4];
-   float accVector_imag[4];
+    unsigned int number;
+    const unsigned int quarterPoints = num_points / 4;
  
-   float32x4x2_t  inputVector;
-   float32x4_t  tapsVector;
-   float32x4_t tmp_real, tmp_imag;
-   float32x4_t real_accumulator, imag_accumulator;
+    float res[2];
+    float *realpt = &res[0], *imagpt = &res[1];
+    const float* inputPtr = (float*)input;
+    const float* tapsPtr = taps;
+    float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
+    float accVector_real[4];
+    float accVector_imag[4];
  
+    float32x4x2_t inputVector;
+    float32x4_t tapsVector;
+    float32x4_t tmp_real, tmp_imag;
+    float32x4_t real_accumulator, imag_accumulator;
  
-   // zero out accumulators
-   // take a *float, return float32x4_t
-   real_accumulator = vld1q_f32( zero );
-   imag_accumulator = vld1q_f32( zero );
  
-   for(number=0 ;number < quarterPoints; number++){
-      // load taps ( float32x2x2_t = vld1q_f32( float32_t const * ptr) )
-      // load doublewords and duplicate in to second lane
-      tapsVector = vld1q_f32(tapsPtr );
+    // zero out accumulators
+    // take a *float, return float32x4_t
+    real_accumulator = vld1q_f32(zero);
+    imag_accumulator = vld1q_f32(zero);
  
-      // load quadword of complex numbers in to 2 lanes. 1st lane is real, 2dn imag
-      inputVector = vld2q_f32(inputPtr );
+    for (number = 0; number < quarterPoints; number++) {
+        // load taps ( float32x2x2_t = vld1q_f32( float32_t const * ptr) )
+        // load doublewords and duplicate in to second lane
+        tapsVector = vld1q_f32(tapsPtr);
  
-      tmp_real = vmulq_f32(tapsVector, inputVector.val[0]);
-      tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]);
+        // load quadword of complex numbers in to 2 lanes. 1st lane is real, 2dn imag
+        inputVector = vld2q_f32(inputPtr);
  
-      real_accumulator = vaddq_f32(real_accumulator, tmp_real);
-      imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag);
+        tmp_real = vmulq_f32(tapsVector, inputVector.val[0]);
+        tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]);
  
+        real_accumulator = vaddq_f32(real_accumulator, tmp_real);
+        imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag);
  
-      tapsPtr += 4;
-      inputPtr += 8;
  
-   }
+        tapsPtr += 4;
+        inputPtr += 8;
+    }
  
-   // store results back to a complex (array of 2 floats)
-   vst1q_f32(accVector_real, real_accumulator);
-   vst1q_f32(accVector_imag, imag_accumulator);
-   *realpt = accVector_real[0] + accVector_real[1] +
-             accVector_real[2] + accVector_real[3] ;
+    // store results back to a complex (array of 2 floats)
+    vst1q_f32(accVector_real, real_accumulator);
+    vst1q_f32(accVector_imag, imag_accumulator);
+    *realpt =
+        accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3];
  
-   *imagpt = accVector_imag[0] + accVector_imag[1] +
-             accVector_imag[2] + accVector_imag[3] ;
+    *imagpt =
+        accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3];
  
-  // clean up the remainder
-  for(number=quarterPoints*4; number < num_points; number++){
-    *realpt += ((*inputPtr++) * (*tapsPtr));
-    *imagpt += ((*inputPtr++) * (*tapsPtr++));
-  }
+    // clean up the remainder
+    for (number = quarterPoints * 4; number < num_points; number++) {
+        *realpt += ((*inputPtr++) * (*tapsPtr));
+        *imagpt += ((*inputPtr++) * (*tapsPtr++));
+    }
  
-  *result = *(lv_32fc_t*)(&res[0]);
+    *result = *(lv_32fc_t*)(&res[0]);
  }
  
  #endif /*LV_HAVE_NEON*/
  
  #ifdef LV_HAVE_NEONV7
-extern void volk_32fc_32f_dot_prod_32fc_a_neonasm ( lv_32fc_t* result, const  lv_32fc_t* input, const  float* taps, unsigned int num_points);
+extern void volk_32fc_32f_dot_prod_32fc_a_neonasm(lv_32fc_t* result,
+                                                  const lv_32fc_t* input,
+                                                  const float* taps,
+                                                  unsigned int num_points);
  #endif /*LV_HAVE_NEONV7*/
  
  #ifdef LV_HAVE_NEONV7
-extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla ( lv_32fc_t* result, const  lv_32fc_t* input, const  float* taps, unsigned int num_points);
+extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla(lv_32fc_t* result,
+                                                      const lv_32fc_t* input,
+                                                      const float* taps,
+                                                      unsigned int num_points);
  #endif /*LV_HAVE_NEONV7*/
  
  #ifdef LV_HAVE_NEONV7
-extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline ( lv_32fc_t* result, const  lv_32fc_t* input, const  float* taps, unsigned int num_points);
+extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline(lv_32fc_t* result,
+                                                       const lv_32fc_t* input,
+                                                       const float* taps,
+                                                       unsigned int num_points);
  #endif /*LV_HAVE_NEONV7*/
  
  #ifdef LV_HAVE_SSE
  
-static inline void volk_32fc_32f_dot_prod_32fc_u_sse( lv_32fc_t* result, const  lv_32fc_t* input, const  float* taps, unsigned int num_points) {
-
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 8;
-
-  float res[2];
-  float *realpt = &res[0], *imagpt = &res[1];
-  const float* aPtr = (float*)input;
-  const float* bPtr = taps;
-
-  __m128 a0Val, a1Val, a2Val, a3Val;
-  __m128 b0Val, b1Val, b2Val, b3Val;
-  __m128 x0Val, x1Val, x2Val, x3Val;
-  __m128 c0Val, c1Val, c2Val, c3Val;
-
-  __m128 dotProdVal0 = _mm_setzero_ps();
-  __m128 dotProdVal1 = _mm_setzero_ps();
-  __m128 dotProdVal2 = _mm_setzero_ps();
-  __m128 dotProdVal3 = _mm_setzero_ps();
-
-  for(;number < sixteenthPoints; number++){
-
-    a0Val = _mm_loadu_ps(aPtr);
-    a1Val = _mm_loadu_ps(aPtr+4);
-    a2Val = _mm_loadu_ps(aPtr+8);
-    a3Val = _mm_loadu_ps(aPtr+12);
-
-    x0Val = _mm_loadu_ps(bPtr);
-    x1Val = _mm_loadu_ps(bPtr);
-    x2Val = _mm_loadu_ps(bPtr+4);
-    x3Val = _mm_loadu_ps(bPtr+4);
-    b0Val = _mm_unpacklo_ps(x0Val, x1Val);
-    b1Val = _mm_unpackhi_ps(x0Val, x1Val);
-    b2Val = _mm_unpacklo_ps(x2Val, x3Val);
-    b3Val = _mm_unpackhi_ps(x2Val, x3Val);
-
-    c0Val = _mm_mul_ps(a0Val, b0Val);
-    c1Val = _mm_mul_ps(a1Val, b1Val);
-    c2Val = _mm_mul_ps(a2Val, b2Val);
-    c3Val = _mm_mul_ps(a3Val, b3Val);
-
-    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
-    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
-    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
-    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
-
-    aPtr += 16;
-    bPtr += 8;
-  }
-
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
-
-  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
-
-  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
-  *realpt = dotProductVector[0];
-  *imagpt = dotProductVector[1];
-  *realpt += dotProductVector[2];
-  *imagpt += dotProductVector[3];
-
-  number = sixteenthPoints*8;
-  for(;number < num_points; number++){
-    *realpt += ((*aPtr++) * (*bPtr));
-    *imagpt += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = *(lv_32fc_t*)(&res[0]);
+static inline void volk_32fc_32f_dot_prod_32fc_u_sse(lv_32fc_t* result,
+                                                     const lv_32fc_t* input,
+                                                     const float* taps,
+                                                     unsigned int num_points)
+{
+
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 8;
+
+    float res[2];
+    float *realpt = &res[0], *imagpt = &res[1];
+    const float* aPtr = (float*)input;
+    const float* bPtr = taps;
+
+    __m128 a0Val, a1Val, a2Val, a3Val;
+    __m128 b0Val, b1Val, b2Val, b3Val;
+    __m128 x0Val, x1Val, x2Val, x3Val;
+    __m128 c0Val, c1Val, c2Val, c3Val;
+
+    __m128 dotProdVal0 = _mm_setzero_ps();
+    __m128 dotProdVal1 = _mm_setzero_ps();
+    __m128 dotProdVal2 = _mm_setzero_ps();
+    __m128 dotProdVal3 = _mm_setzero_ps();
+
+    for (; number < sixteenthPoints; number++) {
+
+        a0Val = _mm_loadu_ps(aPtr);
+        a1Val = _mm_loadu_ps(aPtr + 4);
+        a2Val = _mm_loadu_ps(aPtr + 8);
+        a3Val = _mm_loadu_ps(aPtr + 12);
+
+        x0Val = _mm_loadu_ps(bPtr);
+        x1Val = _mm_loadu_ps(bPtr);
+        x2Val = _mm_loadu_ps(bPtr + 4);
+        x3Val = _mm_loadu_ps(bPtr + 4);
+        b0Val = _mm_unpacklo_ps(x0Val, x1Val);
+        b1Val = _mm_unpackhi_ps(x0Val, x1Val);
+        b2Val = _mm_unpacklo_ps(x2Val, x3Val);
+        b3Val = _mm_unpackhi_ps(x2Val, x3Val);
+
+        c0Val = _mm_mul_ps(a0Val, b0Val);
+        c1Val = _mm_mul_ps(a1Val, b1Val);
+        c2Val = _mm_mul_ps(a2Val, b2Val);
+        c3Val = _mm_mul_ps(a3Val, b3Val);
+
+        dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+        dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+        dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+        dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+        aPtr += 16;
+        bPtr += 8;
+    }
+
+    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+    _mm_store_ps(dotProductVector,
+                 dotProdVal0); // Store the results back into the dot product vector
+
+    *realpt = dotProductVector[0];
+    *imagpt = dotProductVector[1];
+    *realpt += dotProductVector[2];
+    *imagpt += dotProductVector[3];
+
+    number = sixteenthPoints * 8;
+    for (; number < num_points; number++) {
+        *realpt += ((*aPtr++) * (*bPtr));
+        *imagpt += ((*aPtr++) * (*bPtr++));
+    }
+
+    *result = *(lv_32fc_t*)(&res[0]);
  }
  
  #endif /*LV_HAVE_SSE*/
diff --git a/kernels/volk/volk_32fc_32f_multiply_32fc.h b/kernels/volk/volk_32fc_32f_multiply_32fc.h

index b47883f8c0cab09cb882c6cb03dbcbcd6ac2aebd..196ba9ae8cfef2a054a649e9a4664d7fe3e03238 100644 (file)
--- a/kernels/volk/volk_32fc_32f_multiply_32fc.h
+++ b/kernels/volk/volk_32fc_32f_multiply_32fc.h
@@ -30,8 +30,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32fc_32f_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points);
- * \endcode
+ * void volk_32fc_32f_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
+ * float* bVector, unsigned int num_points); \endcode
   *
   * \b Inputs
   * \li aVector: The input vector of complex floats.
@@ -61,52 +61,55 @@
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                                  const float* bVector, unsigned int num_points)
+static inline void volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t* cVector,
+                                                     const lv_32fc_t* aVector,
+                                                     const float* bVector,
+                                                     unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  lv_32fc_t* cPtr = cVector;
-  const lv_32fc_t* aPtr = aVector;
-  const float* bPtr=  bVector;
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m256 aVal1, aVal2, bVal, bVal1, bVal2, cVal1, cVal2;
+    __m256 aVal1, aVal2, bVal, bVal1, bVal2, cVal1, cVal2;
  
-  __m256i permute_mask = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
+    __m256i permute_mask = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
  
-  for(;number < eighthPoints; number++){
+    for (; number < eighthPoints; number++) {
  
-    aVal1 = _mm256_load_ps((float *)aPtr);
-    aPtr += 4;
+        aVal1 = _mm256_load_ps((float*)aPtr);
+        aPtr += 4;
  
-    aVal2 = _mm256_load_ps((float *)aPtr);
-    aPtr += 4;
+        aVal2 = _mm256_load_ps((float*)aPtr);
+        aPtr += 4;
  
-    bVal = _mm256_load_ps(bPtr); // b0|b1|b2|b3|b4|b5|b6|b7
-    bPtr += 8;
+        bVal = _mm256_load_ps(bPtr); // b0|b1|b2|b3|b4|b5|b6|b7
+        bPtr += 8;
  
-    bVal1 = _mm256_permute2f128_ps(bVal, bVal, 0x00); // b0|b1|b2|b3|b0|b1|b2|b3
-    bVal2 = _mm256_permute2f128_ps(bVal, bVal, 0x11); // b4|b5|b6|b7|b4|b5|b6|b7
+        bVal1 = _mm256_permute2f128_ps(bVal, bVal, 0x00); // b0|b1|b2|b3|b0|b1|b2|b3
+        bVal2 = _mm256_permute2f128_ps(bVal, bVal, 0x11); // b4|b5|b6|b7|b4|b5|b6|b7
  
-    bVal1 = _mm256_permutevar_ps(bVal1, permute_mask); // b0|b0|b1|b1|b2|b2|b3|b3
-    bVal2 = _mm256_permutevar_ps(bVal2, permute_mask); // b4|b4|b5|b5|b6|b6|b7|b7
+        bVal1 = _mm256_permutevar_ps(bVal1, permute_mask); // b0|b0|b1|b1|b2|b2|b3|b3
+        bVal2 = _mm256_permutevar_ps(bVal2, permute_mask); // b4|b4|b5|b5|b6|b6|b7|b7
  
-    cVal1 = _mm256_mul_ps(aVal1, bVal1);
-    cVal2 = _mm256_mul_ps(aVal2, bVal2);
+        cVal1 = _mm256_mul_ps(aVal1, bVal1);
+        cVal2 = _mm256_mul_ps(aVal2, bVal2);
  
-    _mm256_store_ps((float*)cPtr,cVal1); // Store the results back into the C container
-    cPtr += 4;
+        _mm256_store_ps((float*)cPtr,
+                        cVal1); // Store the results back into the C container
+        cPtr += 4;
  
-    _mm256_store_ps((float*)cPtr,cVal2); // Store the results back into the C container
-    cPtr += 4;
-  }
+        _mm256_store_ps((float*)cPtr,
+                        cVal2); // Store the results back into the C container
+        cPtr += 4;
+    }
  
-  number = eighthPoints * 8;
-  for(;number < num_points; ++number){
-    *cPtr++ = (*aPtr++) * (*bPtr++);
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; ++number) {
+        *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
@@ -114,67 +117,69 @@ volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                                  const float* bVector, unsigned int num_points)
+static inline void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector,
+                                                     const lv_32fc_t* aVector,
+                                                     const float* bVector,
+                                                     unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  lv_32fc_t* cPtr = cVector;
-  const lv_32fc_t* aPtr = aVector;
-  const float* bPtr=  bVector;
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    const float* bPtr = bVector;
  
-  __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal;
-  for(;number < quarterPoints; number++){
+    __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal;
+    for (; number < quarterPoints; number++) {
  
-    aVal1 = _mm_load_ps((const float*)aPtr);
-    aPtr += 2;
+        aVal1 = _mm_load_ps((const float*)aPtr);
+        aPtr += 2;
  
-    aVal2 = _mm_load_ps((const float*)aPtr);
-    aPtr += 2;
+        aVal2 = _mm_load_ps((const float*)aPtr);
+        aPtr += 2;
  
-    bVal = _mm_load_ps(bPtr);
-    bPtr += 4;
+        bVal = _mm_load_ps(bPtr);
+        bPtr += 4;
  
-    bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1,1,0,0));
-    bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3,3,2,2));
+        bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1, 1, 0, 0));
+        bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3, 3, 2, 2));
  
-    cVal = _mm_mul_ps(aVal1, bVal1);
+        cVal = _mm_mul_ps(aVal1, bVal1);
  
-    _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container
-    cPtr += 2;
+        _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container
+        cPtr += 2;
  
-    cVal = _mm_mul_ps(aVal2, bVal2);
+        cVal = _mm_mul_ps(aVal2, bVal2);
  
-    _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container
+        _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container
  
-    cPtr += 2;
-  }
+        cPtr += 2;
+    }
  
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) * (*bPtr);
-    bPtr++;
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) * (*bPtr);
+        bPtr++;
+    }
  }
  #endif /* LV_HAVE_SSE */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                                    const float* bVector, unsigned int num_points)
+static inline void volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector,
+                                                       const lv_32fc_t* aVector,
+                                                       const float* bVector,
+                                                       unsigned int num_points)
  {
-  lv_32fc_t* cPtr = cVector;
-  const lv_32fc_t* aPtr = aVector;
-  const float* bPtr=  bVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *cPtr++ = (*aPtr++) * (*bPtr++);
-  }
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    const float* bPtr = bVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -182,49 +187,52 @@ volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void
-volk_32fc_32f_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                                 const float* bVector, unsigned int num_points)
+static inline void volk_32fc_32f_multiply_32fc_neon(lv_32fc_t* cVector,
+                                                    const lv_32fc_t* aVector,
+                                                    const float* bVector,
+                                                    unsigned int num_points)
  {
-  lv_32fc_t* cPtr = cVector;
-  const lv_32fc_t* aPtr = aVector;
-  const float* bPtr=  bVector;
-  unsigned int number = 0;
-  unsigned int quarter_points = num_points / 4;
-
-  float32x4x2_t inputVector, outputVector;
-  float32x4_t tapsVector;
-  for(number = 0; number < quarter_points; number++){
-    inputVector = vld2q_f32((float*)aPtr);
-    tapsVector = vld1q_f32(bPtr);
-
-    outputVector.val[0] = vmulq_f32(inputVector.val[0], tapsVector);
-    outputVector.val[1] = vmulq_f32(inputVector.val[1], tapsVector);
-
-    vst2q_f32((float*)cPtr, outputVector);
-    aPtr += 4;
-    bPtr += 4;
-    cPtr += 4;
-  }
-
-  for(number = quarter_points * 4; number < num_points; number++){
-    *cPtr++ = (*aPtr++) * (*bPtr++);
-  }
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    const float* bPtr = bVector;
+    unsigned int number = 0;
+    unsigned int quarter_points = num_points / 4;
+
+    float32x4x2_t inputVector, outputVector;
+    float32x4_t tapsVector;
+    for (number = 0; number < quarter_points; number++) {
+        inputVector = vld2q_f32((float*)aPtr);
+        tapsVector = vld1q_f32(bPtr);
+
+        outputVector.val[0] = vmulq_f32(inputVector.val[0], tapsVector);
+        outputVector.val[1] = vmulq_f32(inputVector.val[1], tapsVector);
+
+        vst2q_f32((float*)cPtr, outputVector);
+        aPtr += 4;
+        bPtr += 4;
+        cPtr += 4;
+    }
+
+    for (number = quarter_points * 4; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_NEON */
  
  
  #ifdef LV_HAVE_ORC
  
-extern void
-volk_32fc_32f_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                                       const float* bVector, unsigned int num_points);
+extern void volk_32fc_32f_multiply_32fc_a_orc_impl(lv_32fc_t* cVector,
+                                                   const lv_32fc_t* aVector,
+                                                   const float* bVector,
+                                                   unsigned int num_points);
  
-static inline void
-volk_32fc_32f_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                                  const float* bVector, unsigned int num_points)
+static inline void volk_32fc_32f_multiply_32fc_u_orc(lv_32fc_t* cVector,
+                                                     const lv_32fc_t* aVector,
+                                                     const float* bVector,
+                                                     unsigned int num_points)
  {
-  volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
+    volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
  }
  
  #endif /* LV_HAVE_GENERIC */
diff --git a/kernels/volk/volk_32fc_conjugate_32fc.h b/kernels/volk/volk_32fc_conjugate_32fc.h

index 6994d0ef5183b84056077f064122a05c9b4eb8fe..9195e3a216e28848220ef7a6f182bfb1a6f4337d 100644 (file)
--- a/kernels/volk/volk_32fc_conjugate_32fc.h
+++ b/kernels/volk/volk_32fc_conjugate_32fc.h
@@ -29,8 +29,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32fc_conjugate_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
- * \endcode
+ * void volk_32fc_conjugate_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned
+ * int num_points) \endcode
   *
   * \b Inputs
   * \li aVector: The input vector of complex floats.
@@ -68,91 +68,94 @@
  #ifndef INCLUDED_volk_32fc_conjugate_32fc_u_H
  #define INCLUDED_volk_32fc_conjugate_32fc_u_H
  
+#include <float.h>
  #include <inttypes.h>
  #include <stdio.h>
  #include <volk/volk_complex.h>
-#include <float.h>
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32fc_conjugate_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
+static inline void volk_32fc_conjugate_32fc_u_avx(lv_32fc_t* cVector,
+                                                  const lv_32fc_t* aVector,
+                                                  unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  __m256 x;
-  lv_32fc_t* c = cVector;
-  const lv_32fc_t* a = aVector;
+    __m256 x;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
  
-  __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
+    __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
  
-  for(;number < quarterPoints; number++){
+    for (; number < quarterPoints; number++) {
  
-    x = _mm256_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
+        x = _mm256_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
  
-    x = _mm256_xor_ps(x, conjugator); // conjugate register
+        x = _mm256_xor_ps(x, conjugator); // conjugate register
  
-    _mm256_storeu_ps((float*)c,x); // Store the results back into the C container
+        _mm256_storeu_ps((float*)c, x); // Store the results back into the C container
  
-    a += 4;
-    c += 4;
-  }
+        a += 4;
+        c += 4;
+    }
  
-  number = quarterPoints * 4;
+    number = quarterPoints * 4;
  
-  for(;number < num_points; number++) {
-    *c++ = lv_conj(*a++);
-  }
+    for (; number < num_points; number++) {
+        *c++ = lv_conj(*a++);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
  #ifdef LV_HAVE_SSE3
  #include <pmmintrin.h>
  
-static inline void
-volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
+static inline void volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector,
+                                                   const lv_32fc_t* aVector,
+                                                   unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int halfPoints = num_points / 2;
+    unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 2;
  
-  __m128 x;
-  lv_32fc_t* c = cVector;
-  const lv_32fc_t* a = aVector;
+    __m128 x;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
  
-  __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
+    __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
  
-  for(;number < halfPoints; number++){
+    for (; number < halfPoints; number++) {
  
-    x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
+        x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
  
-    x = _mm_xor_ps(x, conjugator); // conjugate register
+        x = _mm_xor_ps(x, conjugator); // conjugate register
  
-    _mm_storeu_ps((float*)c,x); // Store the results back into the C container
+        _mm_storeu_ps((float*)c, x); // Store the results back into the C container
  
-    a += 2;
-    c += 2;
-  }
+        a += 2;
+        c += 2;
+    }
  
-  if((num_points % 2) != 0) {
-    *c = lv_conj(*a);
-  }
+    if ((num_points % 2) != 0) {
+        *c = lv_conj(*a);
+    }
  }
  #endif /* LV_HAVE_SSE3 */
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
+static inline void volk_32fc_conjugate_32fc_generic(lv_32fc_t* cVector,
+                                                    const lv_32fc_t* aVector,
+                                                    unsigned int num_points)
  {
-  lv_32fc_t* cPtr = cVector;
-  const lv_32fc_t* aPtr = aVector;
-  unsigned int number = 0;
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    unsigned int number = 0;
  
-  for(number = 0; number < num_points; number++){
-    *cPtr++ = lv_conj(*aPtr++);
-  }
+    for (number = 0; number < num_points; number++) {
+        *cPtr++ = lv_conj(*aPtr++);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -161,124 +164,128 @@ volk_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, u
  #ifndef INCLUDED_volk_32fc_conjugate_32fc_a_H
  #define INCLUDED_volk_32fc_conjugate_32fc_a_H
  
+#include <float.h>
  #include <inttypes.h>
  #include <stdio.h>
  #include <volk/volk_complex.h>
-#include <float.h>
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32fc_conjugate_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
+static inline void volk_32fc_conjugate_32fc_a_avx(lv_32fc_t* cVector,
+                                                  const lv_32fc_t* aVector,
+                                                  unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  __m256 x;
-  lv_32fc_t* c = cVector;
-  const lv_32fc_t* a = aVector;
+    __m256 x;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
  
-  __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
+    __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
  
-  for(;number < quarterPoints; number++){
+    for (; number < quarterPoints; number++) {
  
-    x = _mm256_load_ps((float*)a); // Load the complex data as ar,ai,br,bi
+        x = _mm256_load_ps((float*)a); // Load the complex data as ar,ai,br,bi
  
-    x = _mm256_xor_ps(x, conjugator); // conjugate register
+        x = _mm256_xor_ps(x, conjugator); // conjugate register
  
-    _mm256_store_ps((float*)c,x); // Store the results back into the C container
+        _mm256_store_ps((float*)c, x); // Store the results back into the C container
  
-    a += 4;
-    c += 4;
-  }
+        a += 4;
+        c += 4;
+    }
  
-  number = quarterPoints * 4;
+    number = quarterPoints * 4;
  
-  for(;number < num_points; number++) {
-    *c++ = lv_conj(*a++);
-  }
+    for (; number < num_points; number++) {
+        *c++ = lv_conj(*a++);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
  #ifdef LV_HAVE_SSE3
  #include <pmmintrin.h>
  
-static inline void
-volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
+static inline void volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector,
+                                                   const lv_32fc_t* aVector,
+                                                   unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int halfPoints = num_points / 2;
+    unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 2;
  
-  __m128 x;
-  lv_32fc_t* c = cVector;
-  const lv_32fc_t* a = aVector;
+    __m128 x;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
  
-  __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
+    __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
  
-  for(;number < halfPoints; number++){
+    for (; number < halfPoints; number++) {
  
-    x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi
+        x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi
  
-    x = _mm_xor_ps(x, conjugator); // conjugate register
+        x = _mm_xor_ps(x, conjugator); // conjugate register
  
-    _mm_store_ps((float*)c,x); // Store the results back into the C container
+        _mm_store_ps((float*)c, x); // Store the results back into the C container
  
-    a += 2;
-    c += 2;
-  }
+        a += 2;
+        c += 2;
+    }
  
-  if((num_points % 2) != 0) {
-    *c = lv_conj(*a);
-  }
+    if ((num_points % 2) != 0) {
+        *c = lv_conj(*a);
+    }
  }
  #endif /* LV_HAVE_SSE3 */
  
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void
-volk_32fc_conjugate_32fc_a_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
+static inline void volk_32fc_conjugate_32fc_a_neon(lv_32fc_t* cVector,
+                                                   const lv_32fc_t* aVector,
+                                                   unsigned int num_points)
  {
-  unsigned int number;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number;
+    const unsigned int quarterPoints = num_points / 4;
  
-  float32x4x2_t x;
-  lv_32fc_t* c = cVector;
-  const lv_32fc_t* a = aVector;
+    float32x4x2_t x;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
  
-  for(number=0; number < quarterPoints; number++){
-    __VOLK_PREFETCH(a+4);
-    x = vld2q_f32((float*)a); // Load the complex data as ar,br,cr,dr; ai,bi,ci,di
+    for (number = 0; number < quarterPoints; number++) {
+        __VOLK_PREFETCH(a + 4);
+        x = vld2q_f32((float*)a); // Load the complex data as ar,br,cr,dr; ai,bi,ci,di
  
-    // xor the imaginary lane
-    x.val[1] = vnegq_f32( x.val[1]);
+        // xor the imaginary lane
+        x.val[1] = vnegq_f32(x.val[1]);
  
-    vst2q_f32((float*)c,x); // Store the results back into the C container
+        vst2q_f32((float*)c, x); // Store the results back into the C container
  
-    a += 4;
-    c += 4;
-  }
+        a += 4;
+        c += 4;
+    }
  
-  for(number=quarterPoints*4; number < num_points; number++){
-    *c++ = lv_conj(*a++);
-  }
+    for (number = quarterPoints * 4; number < num_points; number++) {
+        *c++ = lv_conj(*a++);
+    }
  }
  #endif /* LV_HAVE_NEON */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
+static inline void volk_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector,
+                                                      const lv_32fc_t* aVector,
+                                                      unsigned int num_points)
  {
-  lv_32fc_t* cPtr = cVector;
-  const lv_32fc_t* aPtr = aVector;
-  unsigned int number = 0;
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    unsigned int number = 0;
  
-  for(number = 0; number < num_points; number++){
-    *cPtr++ = lv_conj(*aPtr++);
-  }
+    for (number = 0; number < num_points; number++) {
+        *cPtr++ = lv_conj(*aPtr++);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
diff --git a/kernels/volk/volk_32fc_convert_16ic.h b/kernels/volk/volk_32fc_convert_16ic.h

index 0ba238386c9168af139b327603f3207bc62c013c..57881585af95e8885fe89dc500259160de3d5ec4 100644 (file)
--- a/kernels/volk/volk_32fc_convert_16ic.h
+++ b/kernels/volk/volk_32fc_convert_16ic.h
@@ -31,8 +31,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32fc_convert_16ic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points);
- * \endcode
+ * void volk_32fc_convert_16ic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector,
+ * unsigned int num_points); \endcode
   *
   * \b Inputs
   * \li inputVector:  The complex 32-bit float input data buffer.
@@ -46,14 +46,16 @@
  #ifndef INCLUDED_volk_32fc_convert_16ic_a_H
  #define INCLUDED_volk_32fc_convert_16ic_a_H
  
+#include "volk/volk_complex.h"
  #include <limits.h>
  #include <math.h>
-#include "volk/volk_complex.h"
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void volk_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
+static inline void volk_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector,
+                                                 const lv_32fc_t* inputVector,
+                                                 unsigned int num_points)
  {
      const unsigned int avx_iters = num_points / 8;
  
@@ -71,44 +73,44 @@ static inline void volk_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector, const
      const __m256 vmax_val = _mm256_set1_ps(max_val);
      unsigned int i;
  
-    for(i = 0; i < avx_iters; i++)
-        {
-            inputVal1 = _mm256_load_ps((float*)inputVectorPtr);
-            inputVectorPtr += 8;
-            inputVal2 = _mm256_load_ps((float*)inputVectorPtr);
-            inputVectorPtr += 8;
-            __VOLK_PREFETCH(inputVectorPtr + 16);
-
-            // Clip
-            ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
-            ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
-
-            intInputVal1 = _mm256_cvtps_epi32(ret1);
-            intInputVal2 = _mm256_cvtps_epi32(ret2);
-
-            intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
-            intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
-
-            _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
-            outputVectorPtr += 16;
-        }
-
-    for(i = avx_iters * 16; i < num_points * 2; i++)
-        {
-            aux = *inputVectorPtr++;
-            if(aux > max_val)
-                aux = max_val;
-            else if(aux < min_val)
-                aux = min_val;
-            *outputVectorPtr++ = (int16_t)rintf(aux);
-        }
+    for (i = 0; i < avx_iters; i++) {
+        inputVal1 = _mm256_load_ps((float*)inputVectorPtr);
+        inputVectorPtr += 8;
+        inputVal2 = _mm256_load_ps((float*)inputVectorPtr);
+        inputVectorPtr += 8;
+        __VOLK_PREFETCH(inputVectorPtr + 16);
+
+        // Clip
+        ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
+        ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
+
+        intInputVal1 = _mm256_cvtps_epi32(ret1);
+        intInputVal2 = _mm256_cvtps_epi32(ret2);
+
+        intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
+        intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
+
+        _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
+        outputVectorPtr += 16;
+    }
+
+    for (i = avx_iters * 16; i < num_points * 2; i++) {
+        aux = *inputVectorPtr++;
+        if (aux > max_val)
+            aux = max_val;
+        else if (aux < min_val)
+            aux = min_val;
+        *outputVectorPtr++ = (int16_t)rintf(aux);
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
+static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector,
+                                                 const lv_32fc_t* inputVector,
+                                                 unsigned int num_points)
  {
      const unsigned int sse_iters = num_points / 4;
  
@@ -126,34 +128,34 @@ static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const
      const __m128 vmax_val = _mm_set_ps1(max_val);
      unsigned int i;
  
-    for(i = 0; i < sse_iters; i++)
-        {
-            inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
-            inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
-            __VOLK_PREFETCH(inputVectorPtr + 8);
-
-            // Clip
-            ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
-            ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
-
-            intInputVal1 = _mm_cvtps_epi32(ret1);
-            intInputVal2 = _mm_cvtps_epi32(ret2);
-
-            intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
-
-            _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
-            outputVectorPtr += 8;
-        }
-
-    for(i = sse_iters * 8; i < num_points * 2; i++)
-        {
-            aux = *inputVectorPtr++;
-            if(aux > max_val)
-                aux = max_val;
-            else if(aux < min_val)
-                aux = min_val;
-            *outputVectorPtr++ = (int16_t)rintf(aux);
-        }
+    for (i = 0; i < sse_iters; i++) {
+        inputVal1 = _mm_load_ps((float*)inputVectorPtr);
+        inputVectorPtr += 4;
+        inputVal2 = _mm_load_ps((float*)inputVectorPtr);
+        inputVectorPtr += 4;
+        __VOLK_PREFETCH(inputVectorPtr + 8);
+
+        // Clip
+        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
+        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
+
+        intInputVal1 = _mm_cvtps_epi32(ret1);
+        intInputVal2 = _mm_cvtps_epi32(ret2);
+
+        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+        _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+        outputVectorPtr += 8;
+    }
+
+    for (i = sse_iters * 8; i < num_points * 2; i++) {
+        aux = *inputVectorPtr++;
+        if (aux > max_val)
+            aux = max_val;
+        else if (aux < min_val)
+            aux = min_val;
+        *outputVectorPtr++ = (int16_t)rintf(aux);
+    }
  }
  #endif /* LV_HAVE_SSE2 */
  
@@ -161,13 +163,24 @@ static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const
  #if LV_HAVE_NEONV7
  #include <arm_neon.h>
  
-#define VCVTRQ_S32_F32(res,val)                                         \
-  __VOLK_ASM ("VCVTR.S32.F32 %[r0], %[v0]\n\t" : [r0]"=w"(res[0]) : [v0]"w"(val[0]) : ); \
-  __VOLK_ASM ("VCVTR.S32.F32 %[r1], %[v1]\n\t" : [r1]"=w"(res[1]) : [v1]"w"(val[1]) : ); \
-  __VOLK_ASM ("VCVTR.S32.F32 %[r2], %[v2]\n\t" : [r2]"=w"(res[2]) : [v2]"w"(val[2]) : ); \
-  __VOLK_ASM ("VCVTR.S32.F32 %[r3], %[v3]\n\t" : [r3]"=w"(res[3]) : [v3]"w"(val[3]) : );
-
-static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
+#define VCVTRQ_S32_F32(res, val)                \
+    __VOLK_ASM("VCVTR.S32.F32 %[r0], %[v0]\n\t" \
+               : [r0] "=w"(res[0])              \
+               : [v0] "w"(val[0])               \
+               :);                              \
+    __VOLK_ASM("VCVTR.S32.F32 %[r1], %[v1]\n\t" \
+               : [r1] "=w"(res[1])              \
+               : [v1] "w"(val[1])               \
+               :);                              \
+    __VOLK_ASM("VCVTR.S32.F32 %[r2], %[v2]\n\t" \
+               : [r2] "=w"(res[2])              \
+               : [v2] "w"(val[2])               \
+               :);                              \
+    __VOLK_ASM("VCVTR.S32.F32 %[r3], %[v3]\n\t" : [r3] "=w"(res[3]) : [v3] "w"(val[3]) :);
+
+static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector,
+                                               const lv_32fc_t* inputVector,
+                                               unsigned int num_points)
  {
  
      const unsigned int neon_iters = num_points / 4;
@@ -184,43 +197,41 @@ static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector, const lv
      const float32x4_t max_val = vmovq_n_f32(max_val_f);
      float32x4_t ret1, ret2, a, b;
  
-    int32x4_t toint_a={0,0,0,0};
-    int32x4_t toint_b={0,0,0,0};
+    int32x4_t toint_a = { 0, 0, 0, 0 };
+    int32x4_t toint_b = { 0, 0, 0, 0 };
      int16x4_t intInputVal1, intInputVal2;
      int16x8_t res;
  
-    for(i = 0; i < neon_iters; i++)
-        {
-            a = vld1q_f32((const float32_t*)(inputVectorPtr));
-            inputVectorPtr += 4;
-            b = vld1q_f32((const float32_t*)(inputVectorPtr));
-            inputVectorPtr += 4;
-            __VOLK_PREFETCH(inputVectorPtr + 8);
-
-            ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
-            ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
-
-            // vcvtr takes into account the current rounding mode (as does rintf)
-            VCVTRQ_S32_F32(toint_a, ret1);
-            VCVTRQ_S32_F32(toint_b, ret2);
-
-            intInputVal1 = vqmovn_s32(toint_a);
-            intInputVal2 = vqmovn_s32(toint_b);
-
-            res = vcombine_s16(intInputVal1, intInputVal2);
-            vst1q_s16((int16_t*)outputVectorPtr, res);
-            outputVectorPtr += 8;
-        }
-
-    for(i = neon_iters * 8; i < num_points * 2; i++)
-        {
-            aux = *inputVectorPtr++;
-            if(aux > max_val_f)
-                aux = max_val_f;
-            else if(aux < min_val_f)
-                aux = min_val_f;
-            *outputVectorPtr++ = (int16_t)rintf(aux);
-        }
+    for (i = 0; i < neon_iters; i++) {
+        a = vld1q_f32((const float32_t*)(inputVectorPtr));
+        inputVectorPtr += 4;
+        b = vld1q_f32((const float32_t*)(inputVectorPtr));
+        inputVectorPtr += 4;
+        __VOLK_PREFETCH(inputVectorPtr + 8);
+
+        ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
+        ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
+
+        // vcvtr takes into account the current rounding mode (as does rintf)
+        VCVTRQ_S32_F32(toint_a, ret1);
+        VCVTRQ_S32_F32(toint_b, ret2);
+
+        intInputVal1 = vqmovn_s32(toint_a);
+        intInputVal2 = vqmovn_s32(toint_b);
+
+        res = vcombine_s16(intInputVal1, intInputVal2);
+        vst1q_s16((int16_t*)outputVectorPtr, res);
+        outputVectorPtr += 8;
+    }
+
+    for (i = neon_iters * 8; i < num_points * 2; i++) {
+        aux = *inputVectorPtr++;
+        if (aux > max_val_f)
+            aux = max_val_f;
+        else if (aux < min_val_f)
+            aux = min_val_f;
+        *outputVectorPtr++ = (int16_t)rintf(aux);
+    }
  }
  
  #undef VCVTRQ_S32_F32
@@ -229,7 +240,9 @@ static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector, const lv
  #if LV_HAVE_NEONV8
  #include <arm_neon.h>
  
-static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
+static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector,
+                                                 const lv_32fc_t* inputVector,
+                                                 unsigned int num_points)
  {
      const unsigned int neon_iters = num_points / 4;
  
@@ -245,50 +258,49 @@ static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector, const
      const float32x4_t max_val = vmovq_n_f32(max_val_f);
      float32x4_t ret1, ret2, a, b;
  
-    int32x4_t toint_a={0,0,0,0}, toint_b={0,0,0,0};
+    int32x4_t toint_a = { 0, 0, 0, 0 }, toint_b = { 0, 0, 0, 0 };
      int16x4_t intInputVal1, intInputVal2;
      int16x8_t res;
  
-    for(i = 0; i < neon_iters; i++)
-        {
-            a = vld1q_f32((const float32_t*)(inputVectorPtr));
-            inputVectorPtr += 4;
-            b = vld1q_f32((const float32_t*)(inputVectorPtr));
-            inputVectorPtr += 4;
-            __VOLK_PREFETCH(inputVectorPtr + 8);
-
-            ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
-            ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
-
-            // vrndiq takes into account the current rounding mode (as does rintf)
-            toint_a = vcvtq_s32_f32(vrndiq_f32(ret1));
-            toint_b = vcvtq_s32_f32(vrndiq_f32(ret2));
-
-            intInputVal1 = vqmovn_s32(toint_a);
-            intInputVal2 = vqmovn_s32(toint_b);
-
-            res = vcombine_s16(intInputVal1, intInputVal2);
-            vst1q_s16((int16_t*)outputVectorPtr, res);
-            outputVectorPtr += 8;
-        }
-
-    for(i = neon_iters * 8; i < num_points * 2; i++)
-        {
-            aux = *inputVectorPtr++;
-            if(aux > max_val_f)
-                aux = max_val_f;
-            else if(aux < min_val_f)
-                aux = min_val_f;
-            *outputVectorPtr++ = (int16_t)rintf(aux);
-        }
+    for (i = 0; i < neon_iters; i++) {
+        a = vld1q_f32((const float32_t*)(inputVectorPtr));
+        inputVectorPtr += 4;
+        b = vld1q_f32((const float32_t*)(inputVectorPtr));
+        inputVectorPtr += 4;
+        __VOLK_PREFETCH(inputVectorPtr + 8);
+
+        ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
+        ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
+
+        // vrndiq takes into account the current rounding mode (as does rintf)
+        toint_a = vcvtq_s32_f32(vrndiq_f32(ret1));
+        toint_b = vcvtq_s32_f32(vrndiq_f32(ret2));
+
+        intInputVal1 = vqmovn_s32(toint_a);
+        intInputVal2 = vqmovn_s32(toint_b);
+
+        res = vcombine_s16(intInputVal1, intInputVal2);
+        vst1q_s16((int16_t*)outputVectorPtr, res);
+        outputVectorPtr += 8;
+    }
+
+    for (i = neon_iters * 8; i < num_points * 2; i++) {
+        aux = *inputVectorPtr++;
+        if (aux > max_val_f)
+            aux = max_val_f;
+        else if (aux < min_val_f)
+            aux = min_val_f;
+        *outputVectorPtr++ = (int16_t)rintf(aux);
+    }
  }
  #endif /* LV_HAVE_NEONV8 */
  
  
-
  #ifdef LV_HAVE_GENERIC
  
-static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
+static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector,
+                                                  const lv_32fc_t* inputVector,
+                                                  unsigned int num_points)
  {
      float* inputVectorPtr = (float*)inputVector;
      int16_t* outputVectorPtr = (int16_t*)outputVector;
@@ -296,15 +308,14 @@ static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const
      const float max_val = (float)SHRT_MAX;
      float aux;
      unsigned int i;
-    for(i = 0; i < num_points * 2; i++)
-        {
-            aux = *inputVectorPtr++;
-            if(aux > max_val)
-                aux = max_val;
-            else if(aux < min_val)
-                aux = min_val;
-           *outputVectorPtr++ = (int16_t)rintf(aux);
-        }
+    for (i = 0; i < num_points * 2; i++) {
+        aux = *inputVectorPtr++;
+        if (aux > max_val)
+            aux = max_val;
+        else if (aux < min_val)
+            aux = min_val;
+        *outputVectorPtr++ = (int16_t)rintf(aux);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -313,15 +324,17 @@ static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const
  #ifndef INCLUDED_volk_32fc_convert_16ic_u_H
  #define INCLUDED_volk_32fc_convert_16ic_u_H
  
+#include "volk/volk_complex.h"
  #include <limits.h>
  #include <math.h>
-#include "volk/volk_complex.h"
  
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
+static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector,
+                                                 const lv_32fc_t* inputVector,
+                                                 unsigned int num_points)
  {
      const unsigned int avx_iters = num_points / 8;
  
@@ -339,37 +352,35 @@ static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector, const
      const __m256 vmax_val = _mm256_set1_ps(max_val);
      unsigned int i;
  
-    for(i = 0; i < avx_iters; i++)
-        {
-            inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr);
-            inputVectorPtr += 8;
-            inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr);
-            inputVectorPtr += 8;
-            __VOLK_PREFETCH(inputVectorPtr + 16);
-
-            // Clip
-            ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
-            ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
-
-            intInputVal1 = _mm256_cvtps_epi32(ret1);
-            intInputVal2 = _mm256_cvtps_epi32(ret2);
-
-            intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
-            intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
-
-            _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
-            outputVectorPtr += 16;
-        }
-
-    for(i = avx_iters * 16; i < num_points * 2; i++)
-        {
-            aux = *inputVectorPtr++;
-            if(aux > max_val)
-                aux = max_val;
-            else if(aux < min_val)
-                aux = min_val;
-            *outputVectorPtr++ = (int16_t)rintf(aux);
-        }
+    for (i = 0; i < avx_iters; i++) {
+        inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr);
+        inputVectorPtr += 8;
+        inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr);
+        inputVectorPtr += 8;
+        __VOLK_PREFETCH(inputVectorPtr + 16);
+
+        // Clip
+        ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
+        ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
+
+        intInputVal1 = _mm256_cvtps_epi32(ret1);
+        intInputVal2 = _mm256_cvtps_epi32(ret2);
+
+        intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
+        intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
+
+        _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
+        outputVectorPtr += 16;
+    }
+
+    for (i = avx_iters * 16; i < num_points * 2; i++) {
+        aux = *inputVectorPtr++;
+        if (aux > max_val)
+            aux = max_val;
+        else if (aux < min_val)
+            aux = min_val;
+        *outputVectorPtr++ = (int16_t)rintf(aux);
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
@@ -377,7 +388,9 @@ static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector, const
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
+static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector,
+                                                 const lv_32fc_t* inputVector,
+                                                 unsigned int num_points)
  {
      const unsigned int sse_iters = num_points / 4;
  
@@ -395,36 +408,34 @@ static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const
      const __m128 vmax_val = _mm_set_ps1(max_val);
  
      unsigned int i;
-    for(i = 0; i < sse_iters; i++)
-        {
-            inputVal1 = _mm_loadu_ps((float*)inputVectorPtr);
-            inputVectorPtr += 4;
-            inputVal2 = _mm_loadu_ps((float*)inputVectorPtr);
-            inputVectorPtr += 4;
-            __VOLK_PREFETCH(inputVectorPtr + 8);
-
-            // Clip
-            ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
-            ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
-
-            intInputVal1 = _mm_cvtps_epi32(ret1);
-            intInputVal2 = _mm_cvtps_epi32(ret2);
-
-            intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
-
-            _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
-            outputVectorPtr += 8;
-        }
-
-    for(i = sse_iters * 8; i < num_points * 2; i++)
-        {
-            aux = *inputVectorPtr++;
-            if(aux > max_val)
-                aux = max_val;
-            else if(aux < min_val)
-                aux = min_val;
-            *outputVectorPtr++ = (int16_t)rintf(aux);
-        }
+    for (i = 0; i < sse_iters; i++) {
+        inputVal1 = _mm_loadu_ps((float*)inputVectorPtr);
+        inputVectorPtr += 4;
+        inputVal2 = _mm_loadu_ps((float*)inputVectorPtr);
+        inputVectorPtr += 4;
+        __VOLK_PREFETCH(inputVectorPtr + 8);
+
+        // Clip
+        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
+        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
+
+        intInputVal1 = _mm_cvtps_epi32(ret1);
+        intInputVal2 = _mm_cvtps_epi32(ret2);
+
+        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+        _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+        outputVectorPtr += 8;
+    }
+
+    for (i = sse_iters * 8; i < num_points * 2; i++) {
+        aux = *inputVectorPtr++;
+        if (aux > max_val)
+            aux = max_val;
+        else if (aux < min_val)
+            aux = min_val;
+        *outputVectorPtr++ = (int16_t)rintf(aux);
+    }
  }
  #endif /* LV_HAVE_SSE2 */
  #endif /* INCLUDED_volk_32fc_convert_16ic_u_H */
diff --git a/kernels/volk/volk_32fc_deinterleave_32f_x2.h b/kernels/volk/volk_32fc_deinterleave_32f_x2.h

index 40cd664df26740240f3491df5815058455704c79..1a06c48e454e954c9ebee95f3033f3d24fc0e1c8 100644 (file)
--- a/kernels/volk/volk_32fc_deinterleave_32f_x2.h
+++ b/kernels/volk/volk_32fc_deinterleave_32f_x2.h
@@ -30,8 +30,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32fc_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points)
- * \endcode
+ * void volk_32fc_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_32fc_t*
+ * complexVector, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li complexVector: The complex input vector.
@@ -78,86 +78,88 @@
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
-static inline void
-volk_32fc_deinterleave_32f_x2_a_avx(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector,
-                                    unsigned int num_points)
+static inline void volk_32fc_deinterleave_32f_x2_a_avx(float* iBuffer,
+                                                       float* qBuffer,
+                                                       const lv_32fc_t* complexVector,
+                                                       unsigned int num_points)
  {
-  const float* complexVectorPtr = (float*)complexVector;
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
-
-  unsigned int number = 0;
-  // Mask for real and imaginary parts
-  const unsigned int eighthPoints = num_points / 8;
-  __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue;
-  for(;number < eighthPoints; number++){
-    cplxValue1 = _mm256_load_ps(complexVectorPtr);
-    complexVectorPtr += 8;
-
-    cplxValue2 = _mm256_load_ps(complexVectorPtr);
-    complexVectorPtr += 8;
-
-    complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
-    complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
-
-    // Arrange in i1i2i3i4 format
-    iValue = _mm256_shuffle_ps(complex1, complex2, 0x88);
-    // Arrange in q1q2q3q4 format
-    qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
-
-    _mm256_store_ps(iBufferPtr, iValue);
-    _mm256_store_ps(qBufferPtr, qValue);
-
-    iBufferPtr += 8;
-    qBufferPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    *qBufferPtr++ = *complexVectorPtr++;
-  }
+    const float* complexVectorPtr = (float*)complexVector;
+    float* iBufferPtr = iBuffer;
+    float* qBufferPtr = qBuffer;
+
+    unsigned int number = 0;
+    // Mask for real and imaginary parts
+    const unsigned int eighthPoints = num_points / 8;
+    __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue;
+    for (; number < eighthPoints; number++) {
+        cplxValue1 = _mm256_load_ps(complexVectorPtr);
+        complexVectorPtr += 8;
+
+        cplxValue2 = _mm256_load_ps(complexVectorPtr);
+        complexVectorPtr += 8;
+
+        complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+        complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+
+        // Arrange in i1i2i3i4 format
+        iValue = _mm256_shuffle_ps(complex1, complex2, 0x88);
+        // Arrange in q1q2q3q4 format
+        qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
+
+        _mm256_store_ps(iBufferPtr, iValue);
+        _mm256_store_ps(qBufferPtr, qValue);
+
+        iBufferPtr += 8;
+        qBufferPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = *complexVectorPtr++;
+        *qBufferPtr++ = *complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX */
  
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector,
-                                    unsigned int num_points)
+static inline void volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer,
+                                                       float* qBuffer,
+                                                       const lv_32fc_t* complexVector,
+                                                       unsigned int num_points)
  {
-  const float* complexVectorPtr = (float*)complexVector;
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
-
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-  __m128 cplxValue1, cplxValue2, iValue, qValue;
-  for(;number < quarterPoints; number++){
-    cplxValue1 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    cplxValue2 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    // Arrange in i1i2i3i4 format
-    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-    // Arrange in q1q2q3q4 format
-    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-
-    _mm_store_ps(iBufferPtr, iValue);
-    _mm_store_ps(qBufferPtr, qValue);
-
-    iBufferPtr += 4;
-    qBufferPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    *qBufferPtr++ = *complexVectorPtr++;
-  }
+    const float* complexVectorPtr = (float*)complexVector;
+    float* iBufferPtr = iBuffer;
+    float* qBufferPtr = qBuffer;
+
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+    __m128 cplxValue1, cplxValue2, iValue, qValue;
+    for (; number < quarterPoints; number++) {
+        cplxValue1 = _mm_load_ps(complexVectorPtr);
+        complexVectorPtr += 4;
+
+        cplxValue2 = _mm_load_ps(complexVectorPtr);
+        complexVectorPtr += 4;
+
+        // Arrange in i1i2i3i4 format
+        iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+        // Arrange in q1q2q3q4 format
+        qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
+
+        _mm_store_ps(iBufferPtr, iValue);
+        _mm_store_ps(qBufferPtr, qValue);
+
+        iBufferPtr += 4;
+        qBufferPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = *complexVectorPtr++;
+        *qBufferPtr++ = *complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_SSE */
  
@@ -165,48 +167,50 @@ volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_32f
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void
-volk_32fc_deinterleave_32f_x2_neon(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector,
-                                   unsigned int num_points)
+static inline void volk_32fc_deinterleave_32f_x2_neon(float* iBuffer,
+                                                      float* qBuffer,
+                                                      const lv_32fc_t* complexVector,
+                                                      unsigned int num_points)
  {
-  unsigned int number = 0;
-  unsigned int quarter_points = num_points / 4;
-  const float* complexVectorPtr = (float*)complexVector;
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
-  float32x4x2_t complexInput;
-
-  for(number = 0; number < quarter_points; number++){
-    complexInput = vld2q_f32(complexVectorPtr);
-    vst1q_f32( iBufferPtr, complexInput.val[0] );
-    vst1q_f32( qBufferPtr, complexInput.val[1] );
-    complexVectorPtr += 8;
-    iBufferPtr += 4;
-    qBufferPtr += 4;
-  }
-
-  for(number = quarter_points*4; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    *qBufferPtr++ = *complexVectorPtr++;
-  }
+    unsigned int number = 0;
+    unsigned int quarter_points = num_points / 4;
+    const float* complexVectorPtr = (float*)complexVector;
+    float* iBufferPtr = iBuffer;
+    float* qBufferPtr = qBuffer;
+    float32x4x2_t complexInput;
+
+    for (number = 0; number < quarter_points; number++) {
+        complexInput = vld2q_f32(complexVectorPtr);
+        vst1q_f32(iBufferPtr, complexInput.val[0]);
+        vst1q_f32(qBufferPtr, complexInput.val[1]);
+        complexVectorPtr += 8;
+        iBufferPtr += 4;
+        qBufferPtr += 4;
+    }
+
+    for (number = quarter_points * 4; number < num_points; number++) {
+        *iBufferPtr++ = *complexVectorPtr++;
+        *qBufferPtr++ = *complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_NEON */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32fc_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector,
-                                      unsigned int num_points)
+static inline void volk_32fc_deinterleave_32f_x2_generic(float* iBuffer,
+                                                         float* qBuffer,
+                                                         const lv_32fc_t* complexVector,
+                                                         unsigned int num_points)
  {
-  const float* complexVectorPtr = (float*)complexVector;
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
-  unsigned int number;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    *qBufferPtr++ = *complexVectorPtr++;
-  }
+    const float* complexVectorPtr = (float*)complexVector;
+    float* iBufferPtr = iBuffer;
+    float* qBufferPtr = qBuffer;
+    unsigned int number;
+    for (number = 0; number < num_points; number++) {
+        *iBufferPtr++ = *complexVectorPtr++;
+        *qBufferPtr++ = *complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -221,45 +225,46 @@ volk_32fc_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_3
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
-static inline void
-volk_32fc_deinterleave_32f_x2_u_avx(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector,
-                                    unsigned int num_points)
+static inline void volk_32fc_deinterleave_32f_x2_u_avx(float* iBuffer,
+                                                       float* qBuffer,
+                                                       const lv_32fc_t* complexVector,
+                                                       unsigned int num_points)
  {
-  const float* complexVectorPtr = (float*)complexVector;
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
-
-  unsigned int number = 0;
-  // Mask for real and imaginary parts
-  const unsigned int eighthPoints = num_points / 8;
-  __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue;
-  for(;number < eighthPoints; number++){
-    cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
-    complexVectorPtr += 8;
-
-    cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
-    complexVectorPtr += 8;
-
-    complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
-    complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
-
-    // Arrange in i1i2i3i4 format
-    iValue = _mm256_shuffle_ps(complex1, complex2, 0x88);
-    // Arrange in q1q2q3q4 format
-    qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
-
-    _mm256_storeu_ps(iBufferPtr, iValue);
-    _mm256_storeu_ps(qBufferPtr, qValue);
-
-    iBufferPtr += 8;
-    qBufferPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    *qBufferPtr++ = *complexVectorPtr++;
-  }
+    const float* complexVectorPtr = (float*)complexVector;
+    float* iBufferPtr = iBuffer;
+    float* qBufferPtr = qBuffer;
+
+    unsigned int number = 0;
+    // Mask for real and imaginary parts
+    const unsigned int eighthPoints = num_points / 8;
+    __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue;
+    for (; number < eighthPoints; number++) {
+        cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+        complexVectorPtr += 8;
+
+        cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
+        complexVectorPtr += 8;
+
+        complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+        complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+
+        // Arrange in i1i2i3i4 format
+        iValue = _mm256_shuffle_ps(complex1, complex2, 0x88);
+        // Arrange in q1q2q3q4 format
+        qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
+
+        _mm256_storeu_ps(iBufferPtr, iValue);
+        _mm256_storeu_ps(qBufferPtr, qValue);
+
+        iBufferPtr += 8;
+        qBufferPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = *complexVectorPtr++;
+        *qBufferPtr++ = *complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX */
  #endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_u_H */
diff --git a/kernels/volk/volk_32fc_deinterleave_64f_x2.h b/kernels/volk/volk_32fc_deinterleave_64f_x2.h

index 3e799cb95813a95d3c3e44b13b5f57dfcbb3fe23..3b69c3cc510b8a74a30a5a3ac1ed8666ebdaafcc 100644 (file)
--- a/kernels/volk/volk_32fc_deinterleave_64f_x2.h
+++ b/kernels/volk/volk_32fc_deinterleave_64f_x2.h
@@ -79,110 +79,113 @@
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32fc_deinterleave_64f_x2_u_avx(double *iBuffer, double *qBuffer,
-                                    const lv_32fc_t *complexVector,
-                                    unsigned int num_points) {
-  unsigned int number = 0;
-
-  const float *complexVectorPtr = (float *)complexVector;
-  double *iBufferPtr = iBuffer;
-  double *qBufferPtr = qBuffer;
-
-  const unsigned int quarterPoints = num_points / 4;
-  __m256 cplxValue;
-  __m128 complexH, complexL, fVal;
-  __m256d dVal;
-
-  for (; number < quarterPoints; number++) {
-
-    cplxValue = _mm256_loadu_ps(complexVectorPtr);
-    complexVectorPtr += 8;
-
-    complexH = _mm256_extractf128_ps(cplxValue, 1);
-    complexL = _mm256_extractf128_ps(cplxValue, 0);
-
-    // Arrange in i1i2i1i2 format
-    fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2, 0, 2, 0));
-    dVal = _mm256_cvtps_pd(fVal);
-    _mm256_storeu_pd(iBufferPtr, dVal);
-
-    // Arrange in q1q2q1q2 format
-    fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3, 1, 3, 1));
-    dVal = _mm256_cvtps_pd(fVal);
-    _mm256_storeu_pd(qBufferPtr, dVal);
-
-    iBufferPtr += 4;
-    qBufferPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for (; number < num_points; number++) {
-    *iBufferPtr++ = *complexVectorPtr++;
-    *qBufferPtr++ = *complexVectorPtr++;
-  }
+static inline void volk_32fc_deinterleave_64f_x2_u_avx(double* iBuffer,
+                                                       double* qBuffer,
+                                                       const lv_32fc_t* complexVector,
+                                                       unsigned int num_points)
+{
+    unsigned int number = 0;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    double* iBufferPtr = iBuffer;
+    double* qBufferPtr = qBuffer;
+
+    const unsigned int quarterPoints = num_points / 4;
+    __m256 cplxValue;
+    __m128 complexH, complexL, fVal;
+    __m256d dVal;
+
+    for (; number < quarterPoints; number++) {
+
+        cplxValue = _mm256_loadu_ps(complexVectorPtr);
+        complexVectorPtr += 8;
+
+        complexH = _mm256_extractf128_ps(cplxValue, 1);
+        complexL = _mm256_extractf128_ps(cplxValue, 0);
+
+        // Arrange in i1i2i1i2 format
+        fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2, 0, 2, 0));
+        dVal = _mm256_cvtps_pd(fVal);
+        _mm256_storeu_pd(iBufferPtr, dVal);
+
+        // Arrange in q1q2q1q2 format
+        fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3, 1, 3, 1));
+        dVal = _mm256_cvtps_pd(fVal);
+        _mm256_storeu_pd(qBufferPtr, dVal);
+
+        iBufferPtr += 4;
+        qBufferPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = *complexVectorPtr++;
+        *qBufferPtr++ = *complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX */
  
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void
-volk_32fc_deinterleave_64f_x2_u_sse2(double *iBuffer, double *qBuffer,
-                                     const lv_32fc_t *complexVector,
-                                     unsigned int num_points) {
-  unsigned int number = 0;
-
-  const float *complexVectorPtr = (float *)complexVector;
-  double *iBufferPtr = iBuffer;
-  double *qBufferPtr = qBuffer;
-
-  const unsigned int halfPoints = num_points / 2;
-  __m128 cplxValue, fVal;
-  __m128d dVal;
-
-  for (; number < halfPoints; number++) {
-
-    cplxValue = _mm_loadu_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    // Arrange in i1i2i1i2 format
-    fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0));
-    dVal = _mm_cvtps_pd(fVal);
-    _mm_storeu_pd(iBufferPtr, dVal);
-
-    // Arrange in q1q2q1q2 format
-    fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3, 1, 3, 1));
-    dVal = _mm_cvtps_pd(fVal);
-    _mm_storeu_pd(qBufferPtr, dVal);
-
-    iBufferPtr += 2;
-    qBufferPtr += 2;
-  }
-
-  number = halfPoints * 2;
-  for (; number < num_points; number++) {
-    *iBufferPtr++ = *complexVectorPtr++;
-    *qBufferPtr++ = *complexVectorPtr++;
-  }
+static inline void volk_32fc_deinterleave_64f_x2_u_sse2(double* iBuffer,
+                                                        double* qBuffer,
+                                                        const lv_32fc_t* complexVector,
+                                                        unsigned int num_points)
+{
+    unsigned int number = 0;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    double* iBufferPtr = iBuffer;
+    double* qBufferPtr = qBuffer;
+
+    const unsigned int halfPoints = num_points / 2;
+    __m128 cplxValue, fVal;
+    __m128d dVal;
+
+    for (; number < halfPoints; number++) {
+
+        cplxValue = _mm_loadu_ps(complexVectorPtr);
+        complexVectorPtr += 4;
+
+        // Arrange in i1i2i1i2 format
+        fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0));
+        dVal = _mm_cvtps_pd(fVal);
+        _mm_storeu_pd(iBufferPtr, dVal);
+
+        // Arrange in q1q2q1q2 format
+        fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3, 1, 3, 1));
+        dVal = _mm_cvtps_pd(fVal);
+        _mm_storeu_pd(qBufferPtr, dVal);
+
+        iBufferPtr += 2;
+        qBufferPtr += 2;
+    }
+
+    number = halfPoints * 2;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = *complexVectorPtr++;
+        *qBufferPtr++ = *complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_SSE */
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32fc_deinterleave_64f_x2_generic(double *iBuffer, double *qBuffer,
-                                      const lv_32fc_t *complexVector,
-                                      unsigned int num_points) {
-  unsigned int number = 0;
-  const float *complexVectorPtr = (float *)complexVector;
-  double *iBufferPtr = iBuffer;
-  double *qBufferPtr = qBuffer;
-
-  for (number = 0; number < num_points; number++) {
-    *iBufferPtr++ = (double)*complexVectorPtr++;
-    *qBufferPtr++ = (double)*complexVectorPtr++;
-  }
+static inline void volk_32fc_deinterleave_64f_x2_generic(double* iBuffer,
+                                                         double* qBuffer,
+                                                         const lv_32fc_t* complexVector,
+                                                         unsigned int num_points)
+{
+    unsigned int number = 0;
+    const float* complexVectorPtr = (float*)complexVector;
+    double* iBufferPtr = iBuffer;
+    double* qBufferPtr = qBuffer;
+
+    for (number = 0; number < num_points; number++) {
+        *iBufferPtr++ = (double)*complexVectorPtr++;
+        *qBufferPtr++ = (double)*complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -196,146 +199,150 @@ volk_32fc_deinterleave_64f_x2_generic(double *iBuffer, double *qBuffer,
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32fc_deinterleave_64f_x2_a_avx(double *iBuffer, double *qBuffer,
-                                    const lv_32fc_t *complexVector,
-                                    unsigned int num_points) {
-  unsigned int number = 0;
-
-  const float *complexVectorPtr = (float *)complexVector;
-  double *iBufferPtr = iBuffer;
-  double *qBufferPtr = qBuffer;
-
-  const unsigned int quarterPoints = num_points / 4;
-  __m256 cplxValue;
-  __m128 complexH, complexL, fVal;
-  __m256d dVal;
-
-  for (; number < quarterPoints; number++) {
-
-    cplxValue = _mm256_load_ps(complexVectorPtr);
-    complexVectorPtr += 8;
-
-    complexH = _mm256_extractf128_ps(cplxValue, 1);
-    complexL = _mm256_extractf128_ps(cplxValue, 0);
-
-    // Arrange in i1i2i1i2 format
-    fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2, 0, 2, 0));
-    dVal = _mm256_cvtps_pd(fVal);
-    _mm256_store_pd(iBufferPtr, dVal);
-
-    // Arrange in q1q2q1q2 format
-    fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3, 1, 3, 1));
-    dVal = _mm256_cvtps_pd(fVal);
-    _mm256_store_pd(qBufferPtr, dVal);
-
-    iBufferPtr += 4;
-    qBufferPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for (; number < num_points; number++) {
-    *iBufferPtr++ = *complexVectorPtr++;
-    *qBufferPtr++ = *complexVectorPtr++;
-  }
+static inline void volk_32fc_deinterleave_64f_x2_a_avx(double* iBuffer,
+                                                       double* qBuffer,
+                                                       const lv_32fc_t* complexVector,
+                                                       unsigned int num_points)
+{
+    unsigned int number = 0;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    double* iBufferPtr = iBuffer;
+    double* qBufferPtr = qBuffer;
+
+    const unsigned int quarterPoints = num_points / 4;
+    __m256 cplxValue;
+    __m128 complexH, complexL, fVal;
+    __m256d dVal;
+
+    for (; number < quarterPoints; number++) {
+
+        cplxValue = _mm256_load_ps(complexVectorPtr);
+        complexVectorPtr += 8;
+
+        complexH = _mm256_extractf128_ps(cplxValue, 1);
+        complexL = _mm256_extractf128_ps(cplxValue, 0);
+
+        // Arrange in i1i2i1i2 format
+        fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2, 0, 2, 0));
+        dVal = _mm256_cvtps_pd(fVal);
+        _mm256_store_pd(iBufferPtr, dVal);
+
+        // Arrange in q1q2q1q2 format
+        fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3, 1, 3, 1));
+        dVal = _mm256_cvtps_pd(fVal);
+        _mm256_store_pd(qBufferPtr, dVal);
+
+        iBufferPtr += 4;
+        qBufferPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = *complexVectorPtr++;
+        *qBufferPtr++ = *complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX */
  
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void
-volk_32fc_deinterleave_64f_x2_a_sse2(double *iBuffer, double *qBuffer,
-                                     const lv_32fc_t *complexVector,
-                                     unsigned int num_points) {
-  unsigned int number = 0;
-
-  const float *complexVectorPtr = (float *)complexVector;
-  double *iBufferPtr = iBuffer;
-  double *qBufferPtr = qBuffer;
-
-  const unsigned int halfPoints = num_points / 2;
-  __m128 cplxValue, fVal;
-  __m128d dVal;
-
-  for (; number < halfPoints; number++) {
-
-    cplxValue = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    // Arrange in i1i2i1i2 format
-    fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0));
-    dVal = _mm_cvtps_pd(fVal);
-    _mm_store_pd(iBufferPtr, dVal);
-
-    // Arrange in q1q2q1q2 format
-    fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3, 1, 3, 1));
-    dVal = _mm_cvtps_pd(fVal);
-    _mm_store_pd(qBufferPtr, dVal);
-
-    iBufferPtr += 2;
-    qBufferPtr += 2;
-  }
-
-  number = halfPoints * 2;
-  for (; number < num_points; number++) {
-    *iBufferPtr++ = *complexVectorPtr++;
-    *qBufferPtr++ = *complexVectorPtr++;
-  }
+static inline void volk_32fc_deinterleave_64f_x2_a_sse2(double* iBuffer,
+                                                        double* qBuffer,
+                                                        const lv_32fc_t* complexVector,
+                                                        unsigned int num_points)
+{
+    unsigned int number = 0;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    double* iBufferPtr = iBuffer;
+    double* qBufferPtr = qBuffer;
+
+    const unsigned int halfPoints = num_points / 2;
+    __m128 cplxValue, fVal;
+    __m128d dVal;
+
+    for (; number < halfPoints; number++) {
+
+        cplxValue = _mm_load_ps(complexVectorPtr);
+        complexVectorPtr += 4;
+
+        // Arrange in i1i2i1i2 format
+        fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0));
+        dVal = _mm_cvtps_pd(fVal);
+        _mm_store_pd(iBufferPtr, dVal);
+
+        // Arrange in q1q2q1q2 format
+        fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3, 1, 3, 1));
+        dVal = _mm_cvtps_pd(fVal);
+        _mm_store_pd(qBufferPtr, dVal);
+
+        iBufferPtr += 2;
+        qBufferPtr += 2;
+    }
+
+    number = halfPoints * 2;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = *complexVectorPtr++;
+        *qBufferPtr++ = *complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_SSE */
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32fc_deinterleave_64f_x2_a_generic(double *iBuffer, double *qBuffer,
-                                        const lv_32fc_t *complexVector,
-                                        unsigned int num_points) {
-  unsigned int number = 0;
-  const float *complexVectorPtr = (float *)complexVector;
-  double *iBufferPtr = iBuffer;
-  double *qBufferPtr = qBuffer;
-
-  for (number = 0; number < num_points; number++) {
-    *iBufferPtr++ = (double)*complexVectorPtr++;
-    *qBufferPtr++ = (double)*complexVectorPtr++;
-  }
+static inline void volk_32fc_deinterleave_64f_x2_a_generic(double* iBuffer,
+                                                           double* qBuffer,
+                                                           const lv_32fc_t* complexVector,
+                                                           unsigned int num_points)
+{
+    unsigned int number = 0;
+    const float* complexVectorPtr = (float*)complexVector;
+    double* iBufferPtr = iBuffer;
+    double* qBufferPtr = qBuffer;
+
+    for (number = 0; number < num_points; number++) {
+        *iBufferPtr++ = (double)*complexVectorPtr++;
+        *qBufferPtr++ = (double)*complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  #ifdef LV_HAVE_NEONV8
  #include <arm_neon.h>
  
-static inline void
-volk_32fc_deinterleave_64f_x2_neon(double *iBuffer, double *qBuffer,
-                                   const lv_32fc_t *complexVector,
-                                   unsigned int num_points) {
-  unsigned int number = 0;
-  unsigned int half_points = num_points / 2;
-  const float *complexVectorPtr = (float *)complexVector;
-  double *iBufferPtr = iBuffer;
-  double *qBufferPtr = qBuffer;
-  float32x2x2_t complexInput;
-  float64x2_t iVal, qVal;
-
-  for (number = 0; number < half_points; number++) {
-    complexInput = vld2_f32(complexVectorPtr);
-
-    iVal = vcvt_f64_f32(complexInput.val[0]);
-    qVal = vcvt_f64_f32(complexInput.val[1]);
-
-    vst1q_f64(iBufferPtr, iVal);
-    vst1q_f64(qBufferPtr, qVal);
-
-    complexVectorPtr += 4;
-    iBufferPtr += 2;
-    qBufferPtr += 2;
-  }
-
-  for (number = half_points * 2; number < num_points; number++) {
-    *iBufferPtr++ = (double)*complexVectorPtr++;
-    *qBufferPtr++ = (double)*complexVectorPtr++;
-  }
+static inline void volk_32fc_deinterleave_64f_x2_neon(double* iBuffer,
+                                                      double* qBuffer,
+                                                      const lv_32fc_t* complexVector,
+                                                      unsigned int num_points)
+{
+    unsigned int number = 0;
+    unsigned int half_points = num_points / 2;
+    const float* complexVectorPtr = (float*)complexVector;
+    double* iBufferPtr = iBuffer;
+    double* qBufferPtr = qBuffer;
+    float32x2x2_t complexInput;
+    float64x2_t iVal, qVal;
+
+    for (number = 0; number < half_points; number++) {
+        complexInput = vld2_f32(complexVectorPtr);
+
+        iVal = vcvt_f64_f32(complexInput.val[0]);
+        qVal = vcvt_f64_f32(complexInput.val[1]);
+
+        vst1q_f64(iBufferPtr, iVal);
+        vst1q_f64(qBufferPtr, qVal);
+
+        complexVectorPtr += 4;
+        iBufferPtr += 2;
+        qBufferPtr += 2;
+    }
+
+    for (number = half_points * 2; number < num_points; number++) {
+        *iBufferPtr++ = (double)*complexVectorPtr++;
+        *qBufferPtr++ = (double)*complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_NEONV8 */
  
diff --git a/kernels/volk/volk_32fc_deinterleave_imag_32f.h b/kernels/volk/volk_32fc_deinterleave_imag_32f.h

index 13f97644d16f39a7c0d770e154c9375ce8adcac9..e3dfa124f05be0bcbb33cdf8d1a8797ab860e77b 100644 (file)
--- a/kernels/volk/volk_32fc_deinterleave_imag_32f.h
+++ b/kernels/volk/volk_32fc_deinterleave_imag_32f.h
@@ -30,8 +30,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32fc_deinterleave_image_32f(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points)
- * \endcode
+ * void volk_32fc_deinterleave_image_32f(float* qBuffer, const lv_32fc_t* complexVector,
+ * unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li complexVector: The complex input vector.
@@ -76,121 +76,121 @@
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32fc_deinterleave_imag_32f_a_avx(float* qBuffer, const lv_32fc_t* complexVector,
-                                      unsigned int num_points)
+static inline void volk_32fc_deinterleave_imag_32f_a_avx(float* qBuffer,
+                                                         const lv_32fc_t* complexVector,
+                                                         unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-  const float* complexVectorPtr = (const float*)complexVector;
-  float* qBufferPtr = qBuffer;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+    const float* complexVectorPtr = (const float*)complexVector;
+    float* qBufferPtr = qBuffer;
  
-  __m256 cplxValue1, cplxValue2, complex1, complex2, qValue;
-  for(;number < eighthPoints; number++){
+    __m256 cplxValue1, cplxValue2, complex1, complex2, qValue;
+    for (; number < eighthPoints; number++) {
  
-    cplxValue1 = _mm256_load_ps(complexVectorPtr);
-    complexVectorPtr += 8;
+        cplxValue1 = _mm256_load_ps(complexVectorPtr);
+        complexVectorPtr += 8;
  
-    cplxValue2 = _mm256_load_ps(complexVectorPtr);
-    complexVectorPtr += 8;
+        cplxValue2 = _mm256_load_ps(complexVectorPtr);
+        complexVectorPtr += 8;
  
-    complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
-    complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+        complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+        complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
  
-    // Arrange in q1q2q3q4 format
-    qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
+        // Arrange in q1q2q3q4 format
+        qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
  
-    _mm256_store_ps(qBufferPtr, qValue);
+        _mm256_store_ps(qBufferPtr, qValue);
  
-    qBufferPtr += 8;
-  }
+        qBufferPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    complexVectorPtr++;
-    *qBufferPtr++ = *complexVectorPtr++;
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        complexVectorPtr++;
+        *qBufferPtr++ = *complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX */
  
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer, const lv_32fc_t* complexVector,
-                                      unsigned int num_points)
+static inline void volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer,
+                                                         const lv_32fc_t* complexVector,
+                                                         unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  const float* complexVectorPtr = (const float*)complexVector;
-  float* qBufferPtr = qBuffer;
+    const float* complexVectorPtr = (const float*)complexVector;
+    float* qBufferPtr = qBuffer;
  
-  __m128 cplxValue1, cplxValue2, iValue;
-  for(;number < quarterPoints; number++){
+    __m128 cplxValue1, cplxValue2, iValue;
+    for (; number < quarterPoints; number++) {
  
-    cplxValue1 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
+        cplxValue1 = _mm_load_ps(complexVectorPtr);
+        complexVectorPtr += 4;
  
-    cplxValue2 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
+        cplxValue2 = _mm_load_ps(complexVectorPtr);
+        complexVectorPtr += 4;
  
-    // Arrange in q1q2q3q4 format
-    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+        // Arrange in q1q2q3q4 format
+        iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
  
-    _mm_store_ps(qBufferPtr, iValue);
+        _mm_store_ps(qBufferPtr, iValue);
  
-    qBufferPtr += 4;
-  }
+        qBufferPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    complexVectorPtr++;
-    *qBufferPtr++ = *complexVectorPtr++;
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        complexVectorPtr++;
+        *qBufferPtr++ = *complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_SSE */
  
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void
-volk_32fc_deinterleave_imag_32f_neon(float* qBuffer, const lv_32fc_t* complexVector,
-                                     unsigned int num_points)
+static inline void volk_32fc_deinterleave_imag_32f_neon(float* qBuffer,
+                                                        const lv_32fc_t* complexVector,
+                                                        unsigned int num_points)
  {
-  unsigned int number = 0;
-  unsigned int quarter_points = num_points / 4;
-  const float* complexVectorPtr = (float*)complexVector;
-  float* qBufferPtr = qBuffer;
-  float32x4x2_t complexInput;
-
-  for(number = 0; number < quarter_points; number++){
-    complexInput = vld2q_f32(complexVectorPtr);
-    vst1q_f32( qBufferPtr, complexInput.val[1] );
-    complexVectorPtr += 8;
-    qBufferPtr += 4;
-  }
-
-  for(number = quarter_points*4; number < num_points; number++){
-    complexVectorPtr++;
-    *qBufferPtr++ = *complexVectorPtr++;
-  }
+    unsigned int number = 0;
+    unsigned int quarter_points = num_points / 4;
+    const float* complexVectorPtr = (float*)complexVector;
+    float* qBufferPtr = qBuffer;
+    float32x4x2_t complexInput;
+
+    for (number = 0; number < quarter_points; number++) {
+        complexInput = vld2q_f32(complexVectorPtr);
+        vst1q_f32(qBufferPtr, complexInput.val[1]);
+        complexVectorPtr += 8;
+        qBufferPtr += 4;
+    }
+
+    for (number = quarter_points * 4; number < num_points; number++) {
+        complexVectorPtr++;
+        *qBufferPtr++ = *complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_NEON */
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32fc_deinterleave_imag_32f_generic(float* qBuffer, const lv_32fc_t* complexVector,
-                                        unsigned int num_points)
+static inline void volk_32fc_deinterleave_imag_32f_generic(float* qBuffer,
+                                                           const lv_32fc_t* complexVector,
+                                                           unsigned int num_points)
  {
-  unsigned int number = 0;
-  const float* complexVectorPtr = (float*)complexVector;
-  float* qBufferPtr = qBuffer;
-  for(number = 0; number < num_points; number++){
-    complexVectorPtr++;
-    *qBufferPtr++ = *complexVectorPtr++;
-  }
+    unsigned int number = 0;
+    const float* complexVectorPtr = (float*)complexVector;
+    float* qBufferPtr = qBuffer;
+    for (number = 0; number < num_points; number++) {
+        complexVectorPtr++;
+        *qBufferPtr++ = *complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -206,40 +206,40 @@ volk_32fc_deinterleave_imag_32f_generic(float* qBuffer, const lv_32fc_t* complex
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32fc_deinterleave_imag_32f_u_avx(float* qBuffer, const lv_32fc_t* complexVector,
-                                      unsigned int num_points)
+static inline void volk_32fc_deinterleave_imag_32f_u_avx(float* qBuffer,
+                                                         const lv_32fc_t* complexVector,
+                                                         unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-  const float* complexVectorPtr = (const float*)complexVector;
-  float* qBufferPtr = qBuffer;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+    const float* complexVectorPtr = (const float*)complexVector;
+    float* qBufferPtr = qBuffer;
  
-  __m256 cplxValue1, cplxValue2, complex1, complex2, qValue;
-  for(;number < eighthPoints; number++){
+    __m256 cplxValue1, cplxValue2, complex1, complex2, qValue;
+    for (; number < eighthPoints; number++) {
  
-    cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
-    complexVectorPtr += 8;
+        cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+        complexVectorPtr += 8;
  
-    cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
-    complexVectorPtr += 8;
+        cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
+        complexVectorPtr += 8;
  
-    complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
-    complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+        complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+        complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
  
-    // Arrange in q1q2q3q4 format
-    qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
+        // Arrange in q1q2q3q4 format
+        qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
  
-    _mm256_storeu_ps(qBufferPtr, qValue);
+        _mm256_storeu_ps(qBufferPtr, qValue);
  
-    qBufferPtr += 8;
-  }
+        qBufferPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    complexVectorPtr++;
-    *qBufferPtr++ = *complexVectorPtr++;
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        complexVectorPtr++;
+        *qBufferPtr++ = *complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX */
  #endif /* INCLUDED_volk_32fc_deinterleave_imag_32f_u_H */
diff --git a/kernels/volk/volk_32fc_deinterleave_real_32f.h b/kernels/volk/volk_32fc_deinterleave_real_32f.h

index 92a94d36cab88211d24b700f73648db05ee59de5..2526a1684417143fbd02e7450b68598db0a5d6d4 100644 (file)
--- a/kernels/volk/volk_32fc_deinterleave_real_32f.h
+++ b/kernels/volk/volk_32fc_deinterleave_real_32f.h
@@ -30,8 +30,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32fc_deinterleave_real_32f(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points)
- * \endcode
+ * void volk_32fc_deinterleave_real_32f(float* iBuffer, const lv_32fc_t* complexVector,
+ * unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li complexVector: The complex input vector.
@@ -76,96 +76,96 @@
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_32fc_deinterleave_real_32f_a_avx2(float* iBuffer, const lv_32fc_t* complexVector,
-                                      unsigned int num_points)
+static inline void volk_32fc_deinterleave_real_32f_a_avx2(float* iBuffer,
+                                                          const lv_32fc_t* complexVector,
+                                                          unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  const float* complexVectorPtr = (const float*)complexVector;
-  float* iBufferPtr = iBuffer;
+    const float* complexVectorPtr = (const float*)complexVector;
+    float* iBufferPtr = iBuffer;
  
-  __m256 cplxValue1, cplxValue2;
-  __m256 iValue;
-  __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
-  for(;number < eighthPoints; number++){
+    __m256 cplxValue1, cplxValue2;
+    __m256 iValue;
+    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+    for (; number < eighthPoints; number++) {
  
-    cplxValue1 = _mm256_load_ps(complexVectorPtr);
-    complexVectorPtr += 8;
+        cplxValue1 = _mm256_load_ps(complexVectorPtr);
+        complexVectorPtr += 8;
  
-    cplxValue2 = _mm256_load_ps(complexVectorPtr);
-    complexVectorPtr += 8;
+        cplxValue2 = _mm256_load_ps(complexVectorPtr);
+        complexVectorPtr += 8;
  
-    // Arrange in i1i2i3i4 format
-    iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-    iValue = _mm256_permutevar8x32_ps(iValue,idx);
+        // Arrange in i1i2i3i4 format
+        iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+        iValue = _mm256_permutevar8x32_ps(iValue, idx);
  
-    _mm256_store_ps(iBufferPtr, iValue);
+        _mm256_store_ps(iBufferPtr, iValue);
  
-    iBufferPtr += 8;
-  }
+        iBufferPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++;
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = *complexVectorPtr++;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_32fc_deinterleave_real_32f_a_sse(float* iBuffer, const lv_32fc_t* complexVector,
-                                      unsigned int num_points)
+static inline void volk_32fc_deinterleave_real_32f_a_sse(float* iBuffer,
+                                                         const lv_32fc_t* complexVector,
+                                                         unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  const float* complexVectorPtr = (const float*)complexVector;
-  float* iBufferPtr = iBuffer;
+    const float* complexVectorPtr = (const float*)complexVector;
+    float* iBufferPtr = iBuffer;
  
-  __m128 cplxValue1, cplxValue2, iValue;
-  for(;number < quarterPoints; number++){
+    __m128 cplxValue1, cplxValue2, iValue;
+    for (; number < quarterPoints; number++) {
  
-    cplxValue1 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
+        cplxValue1 = _mm_load_ps(complexVectorPtr);
+        complexVectorPtr += 4;
  
-    cplxValue2 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
+        cplxValue2 = _mm_load_ps(complexVectorPtr);
+        complexVectorPtr += 4;
  
-    // Arrange in i1i2i3i4 format
-    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+        // Arrange in i1i2i3i4 format
+        iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
  
-    _mm_store_ps(iBufferPtr, iValue);
+        _mm_store_ps(iBufferPtr, iValue);
  
-    iBufferPtr += 4;
-  }
+        iBufferPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++;
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = *complexVectorPtr++;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_SSE */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32fc_deinterleave_real_32f_generic(float* iBuffer, const lv_32fc_t* complexVector,
-                                        unsigned int num_points)
+static inline void volk_32fc_deinterleave_real_32f_generic(float* iBuffer,
+                                                           const lv_32fc_t* complexVector,
+                                                           unsigned int num_points)
  {
-  unsigned int number = 0;
-  const float* complexVectorPtr = (float*)complexVector;
-  float* iBufferPtr = iBuffer;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++;
-  }
+    unsigned int number = 0;
+    const float* complexVectorPtr = (float*)complexVector;
+    float* iBufferPtr = iBuffer;
+    for (number = 0; number < num_points; number++) {
+        *iBufferPtr++ = *complexVectorPtr++;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -173,27 +173,27 @@ volk_32fc_deinterleave_real_32f_generic(float* iBuffer, const lv_32fc_t* complex
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void
-volk_32fc_deinterleave_real_32f_neon(float* iBuffer, const lv_32fc_t* complexVector,
-                                     unsigned int num_points)
+static inline void volk_32fc_deinterleave_real_32f_neon(float* iBuffer,
+                                                        const lv_32fc_t* complexVector,
+                                                        unsigned int num_points)
  {
-  unsigned int number = 0;
-  unsigned int quarter_points = num_points / 4;
-  const float* complexVectorPtr = (float*)complexVector;
-  float* iBufferPtr = iBuffer;
-  float32x4x2_t complexInput;
-
-  for(number = 0; number < quarter_points; number++){
-    complexInput = vld2q_f32(complexVectorPtr);
-    vst1q_f32( iBufferPtr, complexInput.val[0] );
-    complexVectorPtr += 8;
-    iBufferPtr += 4;
-  }
-
-  for(number = quarter_points*4; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++;
-  }
+    unsigned int number = 0;
+    unsigned int quarter_points = num_points / 4;
+    const float* complexVectorPtr = (float*)complexVector;
+    float* iBufferPtr = iBuffer;
+    float32x4x2_t complexInput;
+
+    for (number = 0; number < quarter_points; number++) {
+        complexInput = vld2q_f32(complexVectorPtr);
+        vst1q_f32(iBufferPtr, complexInput.val[0]);
+        complexVectorPtr += 8;
+        iBufferPtr += 4;
+    }
+
+    for (number = quarter_points * 4; number < num_points; number++) {
+        *iBufferPtr++ = *complexVectorPtr++;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_NEON */
  
@@ -209,41 +209,41 @@ volk_32fc_deinterleave_real_32f_neon(float* iBuffer, const lv_32fc_t* complexVec
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_32fc_deinterleave_real_32f_u_avx2(float* iBuffer, const lv_32fc_t* complexVector,
-                                      unsigned int num_points)
+static inline void volk_32fc_deinterleave_real_32f_u_avx2(float* iBuffer,
+                                                          const lv_32fc_t* complexVector,
+                                                          unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  const float* complexVectorPtr = (const float*)complexVector;
-  float* iBufferPtr = iBuffer;
+    const float* complexVectorPtr = (const float*)complexVector;
+    float* iBufferPtr = iBuffer;
  
-  __m256 cplxValue1, cplxValue2;
-  __m256 iValue;
-  __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
-  for(;number < eighthPoints; number++){
+    __m256 cplxValue1, cplxValue2;
+    __m256 iValue;
+    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+    for (; number < eighthPoints; number++) {
  
-    cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
-    complexVectorPtr += 8;
+        cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+        complexVectorPtr += 8;
  
-    cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
-    complexVectorPtr += 8;
+        cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
+        complexVectorPtr += 8;
  
-    // Arrange in i1i2i3i4 format
-    iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-    iValue = _mm256_permutevar8x32_ps(iValue,idx);
+        // Arrange in i1i2i3i4 format
+        iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+        iValue = _mm256_permutevar8x32_ps(iValue, idx);
  
-    _mm256_storeu_ps(iBufferPtr, iValue);
+        _mm256_storeu_ps(iBufferPtr, iValue);
  
-    iBufferPtr += 8;
-  }
+        iBufferPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++;
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = *complexVectorPtr++;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
diff --git a/kernels/volk/volk_32fc_deinterleave_real_64f.h b/kernels/volk/volk_32fc_deinterleave_real_64f.h

index 3d6e90190c39b9a7a6c74c74f9f875cffb13e7f7..9ec7769f4a5feb5030490c486de0402d59ea4771 100644 (file)
--- a/kernels/volk/volk_32fc_deinterleave_real_64f.h
+++ b/kernels/volk/volk_32fc_deinterleave_real_64f.h
@@ -77,124 +77,132 @@
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void volk_32fc_deinterleave_real_64f_a_avx2(
-    double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points) {
-  unsigned int number = 0;
-
-  const float *complexVectorPtr = (float *)complexVector;
-  double *iBufferPtr = iBuffer;
-
-  const unsigned int quarterPoints = num_points / 4;
-  __m256 cplxValue;
-  __m128 fVal;
-  __m256d dVal;
-  __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
-  for (; number < quarterPoints; number++) {
-
-    cplxValue = _mm256_load_ps(complexVectorPtr);
-    complexVectorPtr += 8;
-
-    // Arrange in i1i2i1i2 format
-    cplxValue = _mm256_permutevar8x32_ps(cplxValue, idx);
-    fVal = _mm256_extractf128_ps(cplxValue, 0);
-    dVal = _mm256_cvtps_pd(fVal);
-    _mm256_store_pd(iBufferPtr, dVal);
-
-    iBufferPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for (; number < num_points; number++) {
-    *iBufferPtr++ = (double)*complexVectorPtr++;
-    complexVectorPtr++;
-  }
+static inline void volk_32fc_deinterleave_real_64f_a_avx2(double* iBuffer,
+                                                          const lv_32fc_t* complexVector,
+                                                          unsigned int num_points)
+{
+    unsigned int number = 0;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    double* iBufferPtr = iBuffer;
+
+    const unsigned int quarterPoints = num_points / 4;
+    __m256 cplxValue;
+    __m128 fVal;
+    __m256d dVal;
+    __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
+    for (; number < quarterPoints; number++) {
+
+        cplxValue = _mm256_load_ps(complexVectorPtr);
+        complexVectorPtr += 8;
+
+        // Arrange in i1i2i1i2 format
+        cplxValue = _mm256_permutevar8x32_ps(cplxValue, idx);
+        fVal = _mm256_extractf128_ps(cplxValue, 0);
+        dVal = _mm256_cvtps_pd(fVal);
+        _mm256_store_pd(iBufferPtr, dVal);
+
+        iBufferPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = (double)*complexVectorPtr++;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void volk_32fc_deinterleave_real_64f_a_sse2(
-    double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points) {
-  unsigned int number = 0;
+static inline void volk_32fc_deinterleave_real_64f_a_sse2(double* iBuffer,
+                                                          const lv_32fc_t* complexVector,
+                                                          unsigned int num_points)
+{
+    unsigned int number = 0;
  
-  const float *complexVectorPtr = (float *)complexVector;
-  double *iBufferPtr = iBuffer;
+    const float* complexVectorPtr = (float*)complexVector;
+    double* iBufferPtr = iBuffer;
  
-  const unsigned int halfPoints = num_points / 2;
-  __m128 cplxValue, fVal;
-  __m128d dVal;
-  for (; number < halfPoints; number++) {
+    const unsigned int halfPoints = num_points / 2;
+    __m128 cplxValue, fVal;
+    __m128d dVal;
+    for (; number < halfPoints; number++) {
  
-    cplxValue = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
+        cplxValue = _mm_load_ps(complexVectorPtr);
+        complexVectorPtr += 4;
  
-    // Arrange in i1i2i1i2 format
-    fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0));
-    dVal = _mm_cvtps_pd(fVal);
-    _mm_store_pd(iBufferPtr, dVal);
+        // Arrange in i1i2i1i2 format
+        fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0));
+        dVal = _mm_cvtps_pd(fVal);
+        _mm_store_pd(iBufferPtr, dVal);
  
-    iBufferPtr += 2;
-  }
+        iBufferPtr += 2;
+    }
  
-  number = halfPoints * 2;
-  for (; number < num_points; number++) {
-    *iBufferPtr++ = (double)*complexVectorPtr++;
-    complexVectorPtr++;
-  }
+    number = halfPoints * 2;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = (double)*complexVectorPtr++;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_SSE */
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void volk_32fc_deinterleave_real_64f_generic(
-    double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points) {
-  unsigned int number = 0;
-  const float *complexVectorPtr = (float *)complexVector;
-  double *iBufferPtr = iBuffer;
-  for (number = 0; number < num_points; number++) {
-    *iBufferPtr++ = (double)*complexVectorPtr++;
-    complexVectorPtr++;
-  }
+static inline void volk_32fc_deinterleave_real_64f_generic(double* iBuffer,
+                                                           const lv_32fc_t* complexVector,
+                                                           unsigned int num_points)
+{
+    unsigned int number = 0;
+    const float* complexVectorPtr = (float*)complexVector;
+    double* iBufferPtr = iBuffer;
+    for (number = 0; number < num_points; number++) {
+        *iBufferPtr++ = (double)*complexVectorPtr++;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  #ifdef LV_HAVE_NEONV8
  #include <arm_neon.h>
  
-static inline void volk_32fc_deinterleave_real_64f_neon(
-    double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points) {
-  unsigned int number = 0;
-  unsigned int quarter_points = num_points / 4;
-  const float *complexVectorPtr = (float *)complexVector;
-  double *iBufferPtr = iBuffer;
-  float32x2x4_t complexInput;
-  float64x2_t iVal1;
-  float64x2_t iVal2;
-  float64x2x2_t iVal;
-
-  for (number = 0; number < quarter_points; number++) {
-    // Load data into register
-    complexInput = vld4_f32(complexVectorPtr);
-
-    // Perform single to double precision conversion
-    iVal1 = vcvt_f64_f32(complexInput.val[0]);
-    iVal2 = vcvt_f64_f32(complexInput.val[2]);
-    iVal.val[0] = iVal1;
-    iVal.val[1] = iVal2;
-
-    // Store results into memory buffer
-    vst2q_f64(iBufferPtr, iVal);
-
-    // Update pointers
-    iBufferPtr += 4;
-    complexVectorPtr += 8;
-  }
-
-  for (number = quarter_points * 4; number < num_points; number++) {
-    *iBufferPtr++ = (double)*complexVectorPtr++;
-    complexVectorPtr++;
-  }
+static inline void volk_32fc_deinterleave_real_64f_neon(double* iBuffer,
+                                                        const lv_32fc_t* complexVector,
+                                                        unsigned int num_points)
+{
+    unsigned int number = 0;
+    unsigned int quarter_points = num_points / 4;
+    const float* complexVectorPtr = (float*)complexVector;
+    double* iBufferPtr = iBuffer;
+    float32x2x4_t complexInput;
+    float64x2_t iVal1;
+    float64x2_t iVal2;
+    float64x2x2_t iVal;
+
+    for (number = 0; number < quarter_points; number++) {
+        // Load data into register
+        complexInput = vld4_f32(complexVectorPtr);
+
+        // Perform single to double precision conversion
+        iVal1 = vcvt_f64_f32(complexInput.val[0]);
+        iVal2 = vcvt_f64_f32(complexInput.val[2]);
+        iVal.val[0] = iVal1;
+        iVal.val[1] = iVal2;
+
+        // Store results into memory buffer
+        vst2q_f64(iBufferPtr, iVal);
+
+        // Update pointers
+        iBufferPtr += 4;
+        complexVectorPtr += 8;
+    }
+
+    for (number = quarter_points * 4; number < num_points; number++) {
+        *iBufferPtr++ = (double)*complexVectorPtr++;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_NEON */
  
@@ -209,37 +217,39 @@ static inline void volk_32fc_deinterleave_real_64f_neon(
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void volk_32fc_deinterleave_real_64f_u_avx2(
-    double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points) {
-  unsigned int number = 0;
-
-  const float *complexVectorPtr = (float *)complexVector;
-  double *iBufferPtr = iBuffer;
-
-  const unsigned int quarterPoints = num_points / 4;
-  __m256 cplxValue;
-  __m128 fVal;
-  __m256d dVal;
-  __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
-  for (; number < quarterPoints; number++) {
-
-    cplxValue = _mm256_loadu_ps(complexVectorPtr);
-    complexVectorPtr += 8;
-
-    // Arrange in i1i2i1i2 format
-    cplxValue = _mm256_permutevar8x32_ps(cplxValue, idx);
-    fVal = _mm256_extractf128_ps(cplxValue, 0);
-    dVal = _mm256_cvtps_pd(fVal);
-    _mm256_storeu_pd(iBufferPtr, dVal);
-
-    iBufferPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for (; number < num_points; number++) {
-    *iBufferPtr++ = (double)*complexVectorPtr++;
-    complexVectorPtr++;
-  }
+static inline void volk_32fc_deinterleave_real_64f_u_avx2(double* iBuffer,
+                                                          const lv_32fc_t* complexVector,
+                                                          unsigned int num_points)
+{
+    unsigned int number = 0;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    double* iBufferPtr = iBuffer;
+
+    const unsigned int quarterPoints = num_points / 4;
+    __m256 cplxValue;
+    __m128 fVal;
+    __m256d dVal;
+    __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
+    for (; number < quarterPoints; number++) {
+
+        cplxValue = _mm256_loadu_ps(complexVectorPtr);
+        complexVectorPtr += 8;
+
+        // Arrange in i1i2i1i2 format
+        cplxValue = _mm256_permutevar8x32_ps(cplxValue, idx);
+        fVal = _mm256_extractf128_ps(cplxValue, 0);
+        dVal = _mm256_cvtps_pd(fVal);
+        _mm256_storeu_pd(iBufferPtr, dVal);
+
+        iBufferPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = (double)*complexVectorPtr++;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
diff --git a/kernels/volk/volk_32fc_index_max_16u.h b/kernels/volk/volk_32fc_index_max_16u.h

index a9f95085ba9b829163a3801dcf1f0c01eaf3e8e1..b9f9cfd6b4d67a9a65d4f69f262b620824c1b715 100644 (file)
--- a/kernels/volk/volk_32fc_index_max_16u.h
+++ b/kernels/volk/volk_32fc_index_max_16u.h
@@ -76,346 +76,353 @@
  #ifndef INCLUDED_volk_32fc_index_max_16u_a_H
  #define INCLUDED_volk_32fc_index_max_16u_a_H
  
-#include <volk/volk_common.h>
  #include <inttypes.h>
-#include <stdio.h>
  #include <limits.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
  #include <volk/volk_complex.h>
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
  static inline void
-volk_32fc_index_max_16u_a_avx2(uint16_t* target, lv_32fc_t* src0,
-                               uint32_t num_points)
+volk_32fc_index_max_16u_a_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
  {
-  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
-  // Branchless version, if we think it'll make a difference
-  //num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));
-
-  const uint32_t num_bytes = num_points*8;
-
-  union bit256 holderf;
-  union bit256 holderi;
-  float sq_dist = 0.0;
+    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+    // Branchless version, if we think it'll make a difference
+    // num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));
  
-  union bit256 xmm5, xmm4;
-  __m256 xmm1, xmm2, xmm3;
-  __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+    const uint32_t num_bytes = num_points * 8;
  
-  xmm5.int_vec = xmmfive = _mm256_setzero_si256();
-  xmm4.int_vec = xmmfour = _mm256_setzero_si256();
-  holderf.int_vec = holder0 = _mm256_setzero_si256();
-  holderi.int_vec = holder1 = _mm256_setzero_si256();
+    union bit256 holderf;
+    union bit256 holderi;
+    float sq_dist = 0.0;
  
-  int bound = num_bytes >> 6;
-  int i = 0;
+    union bit256 xmm5, xmm4;
+    __m256 xmm1, xmm2, xmm3;
+    __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
  
-  xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
-  xmm9 =  _mm256_setzero_si256(); //=xmm8
-  xmm10 = _mm256_set1_epi32(8);
-  xmm3 = _mm256_setzero_ps();
+    xmm5.int_vec = xmmfive = _mm256_setzero_si256();
+    xmm4.int_vec = xmmfour = _mm256_setzero_si256();
+    holderf.int_vec = holder0 = _mm256_setzero_si256();
+    holderi.int_vec = holder1 = _mm256_setzero_si256();
  
-  __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
-  for(; i < bound; ++i) {
-    xmm1 = _mm256_load_ps((float*)src0);
-    xmm2 = _mm256_load_ps((float*)&src0[4]);
+    int bound = num_bytes >> 6;
+    int i = 0;
  
-    src0 += 8;
+    xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+    xmm9 = _mm256_setzero_si256(); //=xmm8
+    xmm10 = _mm256_set1_epi32(8);
+    xmm3 = _mm256_setzero_ps();
  
-    xmm1 = _mm256_mul_ps(xmm1, xmm1);
-    xmm2 = _mm256_mul_ps(xmm2, xmm2);
+    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+    for (; i < bound; ++i) {
+        xmm1 = _mm256_load_ps((float*)src0);
+        xmm2 = _mm256_load_ps((float*)&src0[4]);
  
-    xmm1 = _mm256_hadd_ps(xmm1, xmm2);
-    xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
+        src0 += 8;
  
-    xmm3 = _mm256_max_ps(xmm1, xmm3);
+        xmm1 = _mm256_mul_ps(xmm1, xmm1);
+        xmm2 = _mm256_mul_ps(xmm2, xmm2);
  
-    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
-    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+        xmm1 = _mm256_hadd_ps(xmm1, xmm2);
+        xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
  
-    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
-    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+        xmm3 = _mm256_max_ps(xmm1, xmm3);
  
-    xmm9 = _mm256_add_epi32(xmm11,  xmm12);
+        xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+        xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
  
-    xmm8 = _mm256_add_epi32(xmm8, xmm10);
-  }
-  xmm10 = _mm256_set1_epi32(4);
-  if (num_bytes >> 5 & 1) {
-    xmm1 = _mm256_load_ps((float*)src0);
+        xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+        xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
  
-    src0 += 4;
+        xmm9 = _mm256_add_epi32(xmm11, xmm12);
  
-    xmm1 = _mm256_mul_ps(xmm1, xmm1);
+        xmm8 = _mm256_add_epi32(xmm8, xmm10);
+    }
+    xmm10 = _mm256_set1_epi32(4);
+    if (num_bytes >> 5 & 1) {
+        xmm1 = _mm256_load_ps((float*)src0);
  
-    xmm1 = _mm256_hadd_ps(xmm1, xmm1);
-    xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
+        src0 += 4;
  
-    xmm3 = _mm256_max_ps(xmm1, xmm3);
+        xmm1 = _mm256_mul_ps(xmm1, xmm1);
  
-    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
-    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+        xmm1 = _mm256_hadd_ps(xmm1, xmm1);
+        xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
  
-    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
-    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+        xmm3 = _mm256_max_ps(xmm1, xmm3);
  
-    xmm9 = _mm256_add_epi32(xmm11,  xmm12);
+        xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+        xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
  
-    xmm8 = _mm256_add_epi32(xmm8, xmm10);
-  }
+        xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+        xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
  
-  idx = _mm256_set_epi32(1,0,1,0,1,0,1,0);
-  xmm10 = _mm256_set1_epi32(2);
-  if (num_bytes >> 4 & 1) {
-      xmm2 = _mm256_load_ps((float*)src0);
+        xmm9 = _mm256_add_epi32(xmm11, xmm12);
  
-      xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
-      xmm8 = bit256_p(&xmm1)->int_vec;
+        xmm8 = _mm256_add_epi32(xmm8, xmm10);
+    }
  
-      xmm2 = _mm256_mul_ps(xmm2, xmm2);
+    idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0);
+    xmm10 = _mm256_set1_epi32(2);
+    if (num_bytes >> 4 & 1) {
+        xmm2 = _mm256_load_ps((float*)src0);
  
-      src0 += 2;
+        xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
+        xmm8 = bit256_p(&xmm1)->int_vec;
  
-      xmm1 = _mm256_hadd_ps(xmm2, xmm2);
+        xmm2 = _mm256_mul_ps(xmm2, xmm2);
  
-      xmm3 = _mm256_max_ps(xmm1, xmm3);
+        src0 += 2;
  
-      xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
-      xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
-
-      xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
-      xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+        xmm1 = _mm256_hadd_ps(xmm2, xmm2);
  
-      xmm9 = _mm256_add_epi32(xmm11, xmm12);
+        xmm3 = _mm256_max_ps(xmm1, xmm3);
  
-      xmm8 = _mm256_add_epi32(xmm8, xmm10);
-  }
+        xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+        xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
  
-  /*
-  idx = _mm256_setzero_si256();
-  for(i = 0; i < leftovers2; ++i) {
-    //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
+        xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+        xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
  
-    sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
+        xmm9 = _mm256_add_epi32(xmm11, xmm12);
  
-    //xmm = _mm_load1_ps(&sq_dist);//insert?
-    xmm2 = _mm256_set1_ps(sq_dist);
-    //xmm2 = _mm256_insertf128_ps(xmm2, xmm, 0);
+        xmm8 = _mm256_add_epi32(xmm8, xmm10);
+    }
  
-    xmm1 = xmm3;
+    /*
+    idx = _mm256_setzero_si256();
+    for(i = 0; i < leftovers2; ++i) {
+      //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1],
+  ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
  
-    xmm3 = _mm256_max_ps(xmm3, xmm2);//only lowest 32bit value
-    xmm3 = _mm256_permutevar8x32_ps(xmm3, idx);
+      sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) *
+  lv_cimag(src0[0]);
  
-    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
-    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+      //xmm = _mm_load1_ps(&sq_dist);//insert?
+      xmm2 = _mm256_set1_ps(sq_dist);
+      //xmm2 = _mm256_insertf128_ps(xmm2, xmm, 0);
  
-    xmm8 = _mm256_permutevar8x32_epi32(xmm8, idx);
+      xmm1 = xmm3;
  
-    xmm11 = _mm256_and_si256(xmm8, xmm4.int_vec);
-    xmm12 = _mm256_and_si256(xmm9, xmm5.int_vec);
+      xmm3 = _mm256_max_ps(xmm3, xmm2);//only lowest 32bit value
+      xmm3 = _mm256_permutevar8x32_ps(xmm3, idx);
  
-    xmm9 = _mm256_add_epi32(xmm11, xmm12);
-}*/
+      xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+      xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
  
-  _mm256_store_ps((float*)&(holderf.f), xmm3);
-  _mm256_store_si256(&(holderi.int_vec), xmm9);
+      xmm8 = _mm256_permutevar8x32_epi32(xmm8, idx);
  
-  target[0] = holderi.i[0];
-  sq_dist = holderf.f[0];
-  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
-  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
-  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
-  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
-  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
-  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
-  target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
-  sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
-  target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
-  sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
-  target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
-  sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
-  target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
-  sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
+      xmm11 = _mm256_and_si256(xmm8, xmm4.int_vec);
+      xmm12 = _mm256_and_si256(xmm9, xmm5.int_vec);
  
+      xmm9 = _mm256_add_epi32(xmm11, xmm12);
+  }*/
+
+    _mm256_store_ps((float*)&(holderf.f), xmm3);
+    _mm256_store_si256(&(holderi.int_vec), xmm9);
+
+    target[0] = holderi.i[0];
+    sq_dist = holderf.f[0];
+    target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
+    sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
+    target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
+    sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
+    target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
+    sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
+    target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
+    sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
+    target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
+    sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
+    target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
+    sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
+    target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
+    sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
  }
  
  #endif /*LV_HAVE_AVX2*/
  
  #ifdef LV_HAVE_SSE3
-#include <xmmintrin.h>
  #include <pmmintrin.h>
+#include <xmmintrin.h>
  
  static inline void
-volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0,
-                               uint32_t num_points)
+volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
  {
-  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
-  // Branchless version, if we think it'll make a difference
-  //num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));
+    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+    // Branchless version, if we think it'll make a difference
+    // num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));
  
-  const uint32_t num_bytes = num_points*8;
+    const uint32_t num_bytes = num_points * 8;
  
-  union bit128 holderf;
-  union bit128 holderi;
-  float sq_dist = 0.0;
+    union bit128 holderf;
+    union bit128 holderi;
+    float sq_dist = 0.0;
  
-  union bit128 xmm5, xmm4;
-  __m128 xmm1, xmm2, xmm3;
-  __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+    union bit128 xmm5, xmm4;
+    __m128 xmm1, xmm2, xmm3;
+    __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
  
-  xmm5.int_vec = xmmfive = _mm_setzero_si128();
-  xmm4.int_vec = xmmfour = _mm_setzero_si128();
-  holderf.int_vec = holder0 = _mm_setzero_si128();
-  holderi.int_vec = holder1 = _mm_setzero_si128();
+    xmm5.int_vec = xmmfive = _mm_setzero_si128();
+    xmm4.int_vec = xmmfour = _mm_setzero_si128();
+    holderf.int_vec = holder0 = _mm_setzero_si128();
+    holderi.int_vec = holder1 = _mm_setzero_si128();
  
-  int bound = num_bytes >> 5;
-  int i = 0;
+    int bound = num_bytes >> 5;
+    int i = 0;
  
-  xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order!
-  xmm9 = _mm_setzero_si128();
-  xmm10 = _mm_set_epi32(4, 4, 4, 4);
-  xmm3 = _mm_setzero_ps();
-  //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]);
+    xmm8 = _mm_set_epi32(3, 2, 1, 0); // remember the crazy reverse order!
+    xmm9 = _mm_setzero_si128();
+    xmm10 = _mm_set_epi32(4, 4, 4, 4);
+    xmm3 = _mm_setzero_ps();
+    // printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1],
+    // ((float*)&xmm10)[2], ((float*)&xmm10)[3]);
  
-  for(; i < bound; ++i) {
-    xmm1 = _mm_load_ps((float*)src0);
-    xmm2 = _mm_load_ps((float*)&src0[2]);
+    for (; i < bound; ++i) {
+        xmm1 = _mm_load_ps((float*)src0);
+        xmm2 = _mm_load_ps((float*)&src0[2]);
  
-    src0 += 4;
+        src0 += 4;
  
-    xmm1 = _mm_mul_ps(xmm1, xmm1);
-    xmm2 = _mm_mul_ps(xmm2, xmm2);
+        xmm1 = _mm_mul_ps(xmm1, xmm1);
+        xmm2 = _mm_mul_ps(xmm2, xmm2);
  
-    xmm1 = _mm_hadd_ps(xmm1, xmm2);
+        xmm1 = _mm_hadd_ps(xmm1, xmm2);
  
-    xmm3 = _mm_max_ps(xmm1, xmm3);
+        xmm3 = _mm_max_ps(xmm1, xmm3);
  
-    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
-    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
  
-    xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
-    xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
  
-    xmm9 = _mm_add_epi32(xmm11,  xmm12);
+        xmm9 = _mm_add_epi32(xmm11, xmm12);
  
-    xmm8 = _mm_add_epi32(xmm8, xmm10);
+        xmm8 = _mm_add_epi32(xmm8, xmm10);
  
-    //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
-    //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]);
-  }
+        // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1],
+        // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n",
+        // ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2],
+        // ((uint32_t*)&xmm10)[3]);
+    }
  
  
-  if (num_bytes >> 4 & 1) {
-    xmm2 = _mm_load_ps((float*)src0);
+    if (num_bytes >> 4 & 1) {
+        xmm2 = _mm_load_ps((float*)src0);
  
-    xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
-    xmm8 = bit128_p(&xmm1)->int_vec;
+        xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
+        xmm8 = bit128_p(&xmm1)->int_vec;
  
-    xmm2 = _mm_mul_ps(xmm2, xmm2);
+        xmm2 = _mm_mul_ps(xmm2, xmm2);
  
-    src0 += 2;
+        src0 += 2;
  
-    xmm1 = _mm_hadd_ps(xmm2, xmm2);
+        xmm1 = _mm_hadd_ps(xmm2, xmm2);
  
-    xmm3 = _mm_max_ps(xmm1, xmm3);
+        xmm3 = _mm_max_ps(xmm1, xmm3);
  
-    xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]);
+        xmm10 = _mm_set_epi32(2, 2, 2, 2); // load1_ps((float*)&init[2]);
  
-    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
-    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
  
-    xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
-    xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
  
-    xmm9 = _mm_add_epi32(xmm11, xmm12);
+        xmm9 = _mm_add_epi32(xmm11, xmm12);
  
-    xmm8 = _mm_add_epi32(xmm8, xmm10);
-    //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
-  }
+        xmm8 = _mm_add_epi32(xmm8, xmm10);
+        // printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1],
+        // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
+    }
  
-  if (num_bytes >> 3 & 1) {
-    //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
+    if (num_bytes >> 3 & 1) {
+        // printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1],
+        // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
  
-    sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
+        sq_dist =
+            lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
  
-    xmm2 = _mm_load1_ps(&sq_dist);
+        xmm2 = _mm_load1_ps(&sq_dist);
  
-    xmm1 = xmm3;
+        xmm1 = xmm3;
  
-    xmm3 = _mm_max_ss(xmm3, xmm2);
+        xmm3 = _mm_max_ss(xmm3, xmm2);
  
-    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
-    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
  
-    xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
+        xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
  
-    xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
-    xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
+        xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
+        xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
  
-    xmm9 = _mm_add_epi32(xmm11, xmm12);
-  }
+        xmm9 = _mm_add_epi32(xmm11, xmm12);
+    }
  
-  //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
-  //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
+    // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1],
+    // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n",
+    // ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2],
+    // ((uint32_t*)&xmm9)[3]);
  
-  _mm_store_ps((float*)&(holderf.f), xmm3);
-  _mm_store_si128(&(holderi.int_vec), xmm9);
+    _mm_store_ps((float*)&(holderf.f), xmm3);
+    _mm_store_si128(&(holderi.int_vec), xmm9);
  
-  target[0] = holderi.i[0];
-  sq_dist = holderf.f[0];
-  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
-  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
-  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
-  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
-  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
-  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
+    target[0] = holderi.i[0];
+    sq_dist = holderf.f[0];
+    target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
+    sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
+    target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
+    sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
+    target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
+    sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
  
-  /*
-  float placeholder = 0.0;
-  uint32_t temp0, temp1;
-  uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
-  uint32_t l0 = g0 ^ 1;
+    /*
+    float placeholder = 0.0;
+    uint32_t temp0, temp1;
+    uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
+    uint32_t l0 = g0 ^ 1;
  
-  uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
-  uint32_t l1 = g1 ^ 1;
+    uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
+    uint32_t l1 = g1 ^ 1;
  
-  temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
-  temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
-  sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1];
-  placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
+    temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
+    temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
+    sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1];
+    placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
  
-  g0 = (sq_dist > placeholder);
-  l0 = g0 ^ 1;
-  target[0] = g0 * temp0 + l0 * temp1;
-  */
+    g0 = (sq_dist > placeholder);
+    l0 = g0 ^ 1;
+    target[0] = g0 * temp0 + l0 * temp1;
+    */
  }
  
  #endif /*LV_HAVE_SSE3*/
  
  #ifdef LV_HAVE_GENERIC
  static inline void
- volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0,
-                                 uint32_t num_points)
+volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
  {
-  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
  
-  const uint32_t num_bytes = num_points*8;
+    const uint32_t num_bytes = num_points * 8;
  
-  float sq_dist = 0.0;
-  float max = 0.0;
-  uint16_t index = 0;
+    float sq_dist = 0.0;
+    float max = 0.0;
+    uint16_t index = 0;
  
-  uint32_t i = 0;
+    uint32_t i = 0;
  
-  for(; i < num_bytes >> 3; ++i) {
-    sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
+    for (; i<num_bytes>> 3; ++i) {
+        sq_dist =
+            lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
  
-    index = sq_dist > max ? i : index;
-    max = sq_dist > max ? sq_dist : max;
-  }
-  target[0] = index;
+        index = sq_dist > max ? i : index;
+        max = sq_dist > max ? sq_dist : max;
+    }
+    target[0] = index;
  }
  
  #endif /*LV_HAVE_GENERIC*/
@@ -427,142 +434,140 @@ static inline void
  #ifndef INCLUDED_volk_32fc_index_max_16u_u_H
  #define INCLUDED_volk_32fc_index_max_16u_u_H
  
-#include <volk/volk_common.h>
  #include <inttypes.h>
-#include <stdio.h>
  #include <limits.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
  #include <volk/volk_complex.h>
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
  static inline void
-volk_32fc_index_max_16u_u_avx2(uint16_t* target, lv_32fc_t* src0,
-                               uint32_t num_points)
+volk_32fc_index_max_16u_u_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
  {
-  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
-  // Branchless version, if we think it'll make a difference
-  //num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));
+    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+    // Branchless version, if we think it'll make a difference
+    // num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));
  
-  const uint32_t num_bytes = num_points*8;
+    const uint32_t num_bytes = num_points * 8;
  
-  union bit256 holderf;
-  union bit256 holderi;
-  float sq_dist = 0.0;
+    union bit256 holderf;
+    union bit256 holderi;
+    float sq_dist = 0.0;
  
-  union bit256 xmm5, xmm4;
-  __m256 xmm1, xmm2, xmm3;
-  __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+    union bit256 xmm5, xmm4;
+    __m256 xmm1, xmm2, xmm3;
+    __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
  
-  xmm5.int_vec = xmmfive = _mm256_setzero_si256();
-  xmm4.int_vec = xmmfour = _mm256_setzero_si256();
-  holderf.int_vec = holder0 = _mm256_setzero_si256();
-  holderi.int_vec = holder1 = _mm256_setzero_si256();
+    xmm5.int_vec = xmmfive = _mm256_setzero_si256();
+    xmm4.int_vec = xmmfour = _mm256_setzero_si256();
+    holderf.int_vec = holder0 = _mm256_setzero_si256();
+    holderi.int_vec = holder1 = _mm256_setzero_si256();
  
-  int bound = num_bytes >> 6;
-  int i = 0;
+    int bound = num_bytes >> 6;
+    int i = 0;
  
-  xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
-  xmm9 =  _mm256_setzero_si256(); //=xmm8
-  xmm10 = _mm256_set1_epi32(8);
-  xmm3 = _mm256_setzero_ps();
+    xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+    xmm9 = _mm256_setzero_si256(); //=xmm8
+    xmm10 = _mm256_set1_epi32(8);
+    xmm3 = _mm256_setzero_ps();
  
-  __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
-  for(; i < bound; ++i) {
-    xmm1 = _mm256_loadu_ps((float*)src0);
-    xmm2 = _mm256_loadu_ps((float*)&src0[4]);
+    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+    for (; i < bound; ++i) {
+        xmm1 = _mm256_loadu_ps((float*)src0);
+        xmm2 = _mm256_loadu_ps((float*)&src0[4]);
  
-    src0 += 8;
+        src0 += 8;
  
-    xmm1 = _mm256_mul_ps(xmm1, xmm1);
-    xmm2 = _mm256_mul_ps(xmm2, xmm2);
+        xmm1 = _mm256_mul_ps(xmm1, xmm1);
+        xmm2 = _mm256_mul_ps(xmm2, xmm2);
  
-    xmm1 = _mm256_hadd_ps(xmm1, xmm2);
-    xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
+        xmm1 = _mm256_hadd_ps(xmm1, xmm2);
+        xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
  
-    xmm3 = _mm256_max_ps(xmm1, xmm3);
+        xmm3 = _mm256_max_ps(xmm1, xmm3);
  
-    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
-    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+        xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+        xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
  
-    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
-    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+        xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+        xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
  
-    xmm9 = _mm256_add_epi32(xmm11,  xmm12);
+        xmm9 = _mm256_add_epi32(xmm11, xmm12);
  
-    xmm8 = _mm256_add_epi32(xmm8, xmm10);
-  }
-  xmm10 = _mm256_set1_epi32(4);
-  if (num_bytes >> 5 & 1) {
-    xmm1 = _mm256_loadu_ps((float*)src0);
+        xmm8 = _mm256_add_epi32(xmm8, xmm10);
+    }
+    xmm10 = _mm256_set1_epi32(4);
+    if (num_bytes >> 5 & 1) {
+        xmm1 = _mm256_loadu_ps((float*)src0);
  
-    src0 += 4;
+        src0 += 4;
  
-    xmm1 = _mm256_mul_ps(xmm1, xmm1);
+        xmm1 = _mm256_mul_ps(xmm1, xmm1);
  
-    xmm1 = _mm256_hadd_ps(xmm1, xmm1);
-    xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
+        xmm1 = _mm256_hadd_ps(xmm1, xmm1);
+        xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
  
-    xmm3 = _mm256_max_ps(xmm1, xmm3);
+        xmm3 = _mm256_max_ps(xmm1, xmm3);
  
-    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
-    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+        xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+        xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
  
-    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
-    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+        xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+        xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
  
-    xmm9 = _mm256_add_epi32(xmm11,  xmm12);
+        xmm9 = _mm256_add_epi32(xmm11, xmm12);
  
-    xmm8 = _mm256_add_epi32(xmm8, xmm10);
-  }
+        xmm8 = _mm256_add_epi32(xmm8, xmm10);
+    }
  
-  idx = _mm256_set_epi32(1,0,1,0,1,0,1,0);
-  xmm10 = _mm256_set1_epi32(2);
-  if (num_bytes >> 4 & 1) {
-      xmm2 = _mm256_loadu_ps((float*)src0);
+    idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0);
+    xmm10 = _mm256_set1_epi32(2);
+    if (num_bytes >> 4 & 1) {
+        xmm2 = _mm256_loadu_ps((float*)src0);
  
-      xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
-      xmm8 = bit256_p(&xmm1)->int_vec;
+        xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
+        xmm8 = bit256_p(&xmm1)->int_vec;
  
-      xmm2 = _mm256_mul_ps(xmm2, xmm2);
+        xmm2 = _mm256_mul_ps(xmm2, xmm2);
  
-      src0 += 2;
+        src0 += 2;
  
-      xmm1 = _mm256_hadd_ps(xmm2, xmm2);
+        xmm1 = _mm256_hadd_ps(xmm2, xmm2);
  
-      xmm3 = _mm256_max_ps(xmm1, xmm3);
+        xmm3 = _mm256_max_ps(xmm1, xmm3);
  
-      xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
-      xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+        xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+        xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
  
-      xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
-      xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+        xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+        xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
  
-      xmm9 = _mm256_add_epi32(xmm11, xmm12);
+        xmm9 = _mm256_add_epi32(xmm11, xmm12);
+
+        xmm8 = _mm256_add_epi32(xmm8, xmm10);
+    }
  
-      xmm8 = _mm256_add_epi32(xmm8, xmm10);
-  }
-
-  _mm256_storeu_ps((float*)&(holderf.f), xmm3);
-  _mm256_storeu_si256(&(holderi.int_vec), xmm9);
-
-  target[0] = holderi.i[0];
-  sq_dist = holderf.f[0];
-  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
-  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
-  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
-  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
-  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
-  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
-  target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
-  sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
-  target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
-  sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
-  target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
-  sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
-  target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
-  sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
+    _mm256_storeu_ps((float*)&(holderf.f), xmm3);
+    _mm256_storeu_si256(&(holderi.int_vec), xmm9);
  
+    target[0] = holderi.i[0];
+    sq_dist = holderf.f[0];
+    target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
+    sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
+    target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
+    sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
+    target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
+    sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
+    target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
+    sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
+    target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
+    sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
+    target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
+    sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
+    target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
+    sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
  }
  
  #endif /*LV_HAVE_AVX2*/
diff --git a/kernels/volk/volk_32fc_index_max_32u.h b/kernels/volk/volk_32fc_index_max_32u.h

index 67a3faa8e61c5f18e195fcc40979b910957c42b8..7756fc6408a8ff3f2a887e1ad41daefcbb6f4f63 100644 (file)
--- a/kernels/volk/volk_32fc_index_max_32u.h
+++ b/kernels/volk/volk_32fc_index_max_32u.h
@@ -70,309 +70,314 @@
  #ifndef INCLUDED_volk_32fc_index_max_32u_a_H
  #define INCLUDED_volk_32fc_index_max_32u_a_H
  
+#include <inttypes.h>
+#include <stdio.h>
  #include <volk/volk_common.h>
-#include<inttypes.h>
-#include<stdio.h>
-#include<volk/volk_complex.h>
+#include <volk/volk_complex.h>
  
  #ifdef LV_HAVE_AVX2
-#include<immintrin.h>
+#include <immintrin.h>
  
  static inline void
-volk_32fc_index_max_32u_a_avx2(uint32_t* target, lv_32fc_t* src0,
-                               uint32_t num_points)
+volk_32fc_index_max_32u_a_avx2(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
  {
-  const uint32_t num_bytes = num_points*8;
+    const uint32_t num_bytes = num_points * 8;
  
-  union bit256 holderf;
-  union bit256 holderi;
-  float sq_dist = 0.0;
+    union bit256 holderf;
+    union bit256 holderi;
+    float sq_dist = 0.0;
  
-  union bit256 xmm5, xmm4;
-  __m256 xmm1, xmm2, xmm3;
-  __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+    union bit256 xmm5, xmm4;
+    __m256 xmm1, xmm2, xmm3;
+    __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
  
-  xmm5.int_vec = xmmfive = _mm256_setzero_si256();
-  xmm4.int_vec = xmmfour = _mm256_setzero_si256();
-  holderf.int_vec = holder0 = _mm256_setzero_si256();
-  holderi.int_vec = holder1 = _mm256_setzero_si256();
+    xmm5.int_vec = xmmfive = _mm256_setzero_si256();
+    xmm4.int_vec = xmmfour = _mm256_setzero_si256();
+    holderf.int_vec = holder0 = _mm256_setzero_si256();
+    holderi.int_vec = holder1 = _mm256_setzero_si256();
  
-  int bound = num_bytes >> 6;
-  int i = 0;
+    int bound = num_bytes >> 6;
+    int i = 0;
  
-  xmm8 = _mm256_set_epi32(7,6,5,4,3, 2, 1, 0);
-  xmm9 = _mm256_setzero_si256();
-  xmm10 = _mm256_set1_epi32(8);
-  xmm3 = _mm256_setzero_ps();
-  __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
+    xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+    xmm9 = _mm256_setzero_si256();
+    xmm10 = _mm256_set1_epi32(8);
+    xmm3 = _mm256_setzero_ps();
+    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
  
-  for(; i < bound; ++i) {
-    xmm1 = _mm256_load_ps((float*)src0);
-    xmm2 = _mm256_load_ps((float*)&src0[4]);
+    for (; i < bound; ++i) {
+        xmm1 = _mm256_load_ps((float*)src0);
+        xmm2 = _mm256_load_ps((float*)&src0[4]);
  
-    src0 += 8;
+        src0 += 8;
  
-    xmm1 = _mm256_mul_ps(xmm1, xmm1);
-    xmm2 = _mm256_mul_ps(xmm2, xmm2);
+        xmm1 = _mm256_mul_ps(xmm1, xmm1);
+        xmm2 = _mm256_mul_ps(xmm2, xmm2);
  
-    xmm1 = _mm256_hadd_ps(xmm1, xmm2);
-    xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
+        xmm1 = _mm256_hadd_ps(xmm1, xmm2);
+        xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
  
-    xmm3 = _mm256_max_ps(xmm1, xmm3);
+        xmm3 = _mm256_max_ps(xmm1, xmm3);
  
-    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
-    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+        xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+        xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
  
-    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
-    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+        xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+        xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
  
-    xmm9 = _mm256_add_epi32(xmm11,  xmm12);
+        xmm9 = _mm256_add_epi32(xmm11, xmm12);
  
-    xmm8 = _mm256_add_epi32(xmm8, xmm10);
-  }
-
-  xmm10 = _mm256_set1_epi32(4);
-  if (num_bytes >> 5 & 1) {
-    xmm1 = _mm256_load_ps((float*)src0);
-
-    xmm1 = _mm256_mul_ps(xmm1, xmm1);
+        xmm8 = _mm256_add_epi32(xmm8, xmm10);
+    }
  
-    src0 += 4;
+    xmm10 = _mm256_set1_epi32(4);
+    if (num_bytes >> 4 & 1) {
+        xmm1 = _mm256_load_ps((float*)src0);
  
-    xmm1 = _mm256_hadd_ps(xmm1, xmm1);
+        xmm1 = _mm256_mul_ps(xmm1, xmm1);
  
-    xmm3 = _mm256_max_ps(xmm1, xmm3);
+        src0 += 4;
  
-    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
-    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+        xmm1 = _mm256_hadd_ps(xmm1, xmm1);
  
-    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
-    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+        xmm3 = _mm256_max_ps(xmm1, xmm3);
  
-    xmm9 = _mm256_add_epi32(xmm11, xmm12);
+        xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+        xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
  
-    xmm8 = _mm256_add_epi32(xmm8, xmm10);
-  }
+        xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+        xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
  
-  idx = _mm256_set_epi32(1,0,1,0,1,0,1,0);
-  xmm10 = _mm256_set1_epi32(2);
-  if (num_bytes >> 4 & 1) {
-    xmm2 = _mm256_load_ps((float*)src0);
+        xmm9 = _mm256_add_epi32(xmm11, xmm12);
  
-    xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
-    xmm8 = bit256_p(&xmm1)->int_vec;
+        xmm8 = _mm256_add_epi32(xmm8, xmm10);
+    }
  
-    xmm2 = _mm256_mul_ps(xmm2, xmm2);
+    idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0);
+    xmm10 = _mm256_set1_epi32(2);
+    if (num_bytes >> 4 & 1) {
+        xmm2 = _mm256_load_ps((float*)src0);
  
-    src0 += 2;
+        xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
+        xmm8 = bit256_p(&xmm1)->int_vec;
  
-    xmm1 = _mm256_hadd_ps(xmm2, xmm2);
+        xmm2 = _mm256_mul_ps(xmm2, xmm2);
  
-    xmm3 = _mm256_max_ps(xmm1, xmm3);
+        src0 += 2;
  
-    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
-    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+        xmm1 = _mm256_hadd_ps(xmm2, xmm2);
  
-    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
-    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+        xmm3 = _mm256_max_ps(xmm1, xmm3);
  
-    xmm9 = _mm256_add_epi32(xmm11, xmm12);
+        xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+        xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
  
-    xmm8 = _mm256_add_epi32(xmm8, xmm10);
-  }
+        xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+        xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
  
-  _mm256_store_ps((float*)&(holderf.f), xmm3);
-  _mm256_store_si256(&(holderi.int_vec), xmm9);
+        xmm9 = _mm256_add_epi32(xmm11, xmm12);
  
-  target[0] = holderi.i[0];
-  sq_dist = holderf.f[0];
-  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
-  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
-  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
-  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
-  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
-  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
-  target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
-  sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
-  target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
-  sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
-  target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
-  sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
-  target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
-  sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
+        xmm8 = _mm256_add_epi32(xmm8, xmm10);
+    }
  
+    _mm256_store_ps((float*)&(holderf.f), xmm3);
+    _mm256_store_si256(&(holderi.int_vec), xmm9);
+
+    target[0] = holderi.i[0];
+    sq_dist = holderf.f[0];
+    target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
+    sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
+    target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
+    sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
+    target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
+    sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
+    target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
+    sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
+    target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
+    sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
+    target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
+    sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
+    target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
+    sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
  }
  
  #endif /*LV_HAVE_AVX2*/
  
  #ifdef LV_HAVE_SSE3
-#include<xmmintrin.h>
-#include<pmmintrin.h>
+#include <pmmintrin.h>
+#include <xmmintrin.h>
  
  static inline void
-volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0,
-                               uint32_t num_points)
+volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
  {
-  const uint32_t num_bytes = num_points*8;
-
-  union bit128 holderf;
-  union bit128 holderi;
-  float sq_dist = 0.0;
-
-  union bit128 xmm5, xmm4;
-  __m128 xmm1, xmm2, xmm3;
-  __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+    const uint32_t num_bytes = num_points * 8;
  
-  xmm5.int_vec = xmmfive = _mm_setzero_si128();
-  xmm4.int_vec = xmmfour = _mm_setzero_si128();
-  holderf.int_vec = holder0 = _mm_setzero_si128();
-  holderi.int_vec = holder1 = _mm_setzero_si128();
+    union bit128 holderf;
+    union bit128 holderi;
+    float sq_dist = 0.0;
  
-  int bound = num_bytes >> 5;
-  int i = 0;
+    union bit128 xmm5, xmm4;
+    __m128 xmm1, xmm2, xmm3;
+    __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
  
-  xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order!
-  xmm9 = _mm_setzero_si128();
-  xmm10 = _mm_set_epi32(4, 4, 4, 4);
-  xmm3 = _mm_setzero_ps();
+    xmm5.int_vec = xmmfive = _mm_setzero_si128();
+    xmm4.int_vec = xmmfour = _mm_setzero_si128();
+    holderf.int_vec = holder0 = _mm_setzero_si128();
+    holderi.int_vec = holder1 = _mm_setzero_si128();
  
-  //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]);
+    int bound = num_bytes >> 5;
+    int i = 0;
  
-  for(; i < bound; ++i) {
-    xmm1 = _mm_load_ps((float*)src0);
-    xmm2 = _mm_load_ps((float*)&src0[2]);
+    xmm8 = _mm_set_epi32(3, 2, 1, 0); // remember the crazy reverse order!
+    xmm9 = _mm_setzero_si128();
+    xmm10 = _mm_set_epi32(4, 4, 4, 4);
+    xmm3 = _mm_setzero_ps();
  
-    src0 += 4;
+    // printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1],
+    // ((float*)&xmm10)[2], ((float*)&xmm10)[3]);
  
-    xmm1 = _mm_mul_ps(xmm1, xmm1);
-    xmm2 = _mm_mul_ps(xmm2, xmm2);
+    for (; i < bound; ++i) {
+        xmm1 = _mm_load_ps((float*)src0);
+        xmm2 = _mm_load_ps((float*)&src0[2]);
  
-    xmm1 = _mm_hadd_ps(xmm1, xmm2);
+        src0 += 4;
  
-    xmm3 = _mm_max_ps(xmm1, xmm3);
+        xmm1 = _mm_mul_ps(xmm1, xmm1);
+        xmm2 = _mm_mul_ps(xmm2, xmm2);
  
-    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
-    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+        xmm1 = _mm_hadd_ps(xmm1, xmm2);
  
-    xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
-    xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+        xmm3 = _mm_max_ps(xmm1, xmm3);
  
-    xmm9 = _mm_add_epi32(xmm11,  xmm12);
+        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
  
-    xmm8 = _mm_add_epi32(xmm8, xmm10);
+        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
  
-    //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
-    //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]);
-  }
+        xmm9 = _mm_add_epi32(xmm11, xmm12);
  
+        xmm8 = _mm_add_epi32(xmm8, xmm10);
  
-  if (num_bytes >> 4 & 1) {
-    xmm2 = _mm_load_ps((float*)src0);
-
-    xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
-    xmm8 = bit128_p(&xmm1)->int_vec;
-
-    xmm2 = _mm_mul_ps(xmm2, xmm2);
-
-    src0 += 2;
-
-    xmm1 = _mm_hadd_ps(xmm2, xmm2);
+        // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1],
+        // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n",
+        // ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2],
+        // ((uint32_t*)&xmm10)[3]);
+    }
  
-    xmm3 = _mm_max_ps(xmm1, xmm3);
  
-    xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]);
+    if (num_bytes >> 4 & 1) {
+        xmm2 = _mm_load_ps((float*)src0);
  
-    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
-    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+        xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
+        xmm8 = bit128_p(&xmm1)->int_vec;
  
-    xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
-    xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+        xmm2 = _mm_mul_ps(xmm2, xmm2);
  
-    xmm9 = _mm_add_epi32(xmm11, xmm12);
+        src0 += 2;
  
-    xmm8 = _mm_add_epi32(xmm8, xmm10);
-    //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
-  }
+        xmm1 = _mm_hadd_ps(xmm2, xmm2);
  
-  if (num_bytes >> 3 & 1) {
-    //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
+        xmm3 = _mm_max_ps(xmm1, xmm3);
  
-    sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
+        xmm10 = _mm_set_epi32(2, 2, 2, 2); // load1_ps((float*)&init[2]);
  
-    xmm2 = _mm_load1_ps(&sq_dist);
+        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
  
-    xmm1 = xmm3;
+        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
  
-    xmm3 = _mm_max_ss(xmm3, xmm2);
+        xmm9 = _mm_add_epi32(xmm11, xmm12);
  
-    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
-    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+        xmm8 = _mm_add_epi32(xmm8, xmm10);
+        // printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1],
+        // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
+    }
  
-    xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
+    if (num_bytes >> 3 & 1) {
+        // printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1],
+        // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
  
-    xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
-    xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
+        sq_dist =
+            lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
  
-    xmm9 = _mm_add_epi32(xmm11, xmm12);
-  }
+        xmm2 = _mm_load1_ps(&sq_dist);
  
-  //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
-  //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
+        xmm1 = xmm3;
  
-  _mm_store_ps((float*)&(holderf.f), xmm3);
-  _mm_store_si128(&(holderi.int_vec), xmm9);
+        xmm3 = _mm_max_ss(xmm3, xmm2);
  
-  target[0] = holderi.i[0];
-  sq_dist = holderf.f[0];
-  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
-  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
-  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
-  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
-  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
-  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
+        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
  
-  /*
-  float placeholder = 0.0;
-  uint32_t temp0, temp1;
-  uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
-  uint32_t l0 = g0 ^ 1;
+        xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
  
-  uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
-  uint32_t l1 = g1 ^ 1;
+        xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
+        xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
  
-  temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
-  temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
-  sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1];
-  placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
+        xmm9 = _mm_add_epi32(xmm11, xmm12);
+    }
  
-  g0 = (sq_dist > placeholder);
-  l0 = g0 ^ 1;
-  target[0] = g0 * temp0 + l0 * temp1;
-  */
+    // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1],
+    // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n",
+    // ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2],
+    // ((uint32_t*)&xmm9)[3]);
+
+    _mm_store_ps((float*)&(holderf.f), xmm3);
+    _mm_store_si128(&(holderi.int_vec), xmm9);
+
+    target[0] = holderi.i[0];
+    sq_dist = holderf.f[0];
+    target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
+    sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
+    target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
+    sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
+    target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
+    sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
+
+    /*
+    float placeholder = 0.0;
+    uint32_t temp0, temp1;
+    uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
+    uint32_t l0 = g0 ^ 1;
+
+    uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
+    uint32_t l1 = g1 ^ 1;
+
+    temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
+    temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
+    sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1];
+    placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
+
+    g0 = (sq_dist > placeholder);
+    l0 = g0 ^ 1;
+    target[0] = g0 * temp0 + l0 * temp1;
+    */
  }
  
  #endif /*LV_HAVE_SSE3*/
  
  #ifdef LV_HAVE_GENERIC
  static inline void
- volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0,
-                                 uint32_t num_points)
+volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
  {
-  const uint32_t num_bytes = num_points*8;
+    const uint32_t num_bytes = num_points * 8;
  
-  float sq_dist = 0.0;
-  float max = 0.0;
-  uint32_t index = 0;
+    float sq_dist = 0.0;
+    float max = 0.0;
+    uint32_t index = 0;
  
-  uint32_t i = 0;
+    uint32_t i = 0;
  
-  for(; i < num_bytes >> 3; ++i) {
-    sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
+    for (; i<num_bytes>> 3; ++i) {
+        sq_dist =
+            lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
  
-    index = sq_dist > max ? i : index;
-    max = sq_dist > max ? sq_dist : max;
-  }
-  target[0] = index;
+        index = sq_dist > max ? i : index;
+        max = sq_dist > max ? sq_dist : max;
+    }
+    target[0] = index;
  }
  
  #endif /*LV_HAVE_GENERIC*/
@@ -384,137 +389,135 @@ static inline void
  #ifndef INCLUDED_volk_32fc_index_max_32u_u_H
  #define INCLUDED_volk_32fc_index_max_32u_u_H
  
+#include <inttypes.h>
+#include <stdio.h>
  #include <volk/volk_common.h>
-#include<inttypes.h>
-#include<stdio.h>
-#include<volk/volk_complex.h>
+#include <volk/volk_complex.h>
  
  #ifdef LV_HAVE_AVX2
-#include<immintrin.h>
+#include <immintrin.h>
  
  static inline void
-volk_32fc_index_max_32u_u_avx2(uint32_t* target, lv_32fc_t* src0,
-                               uint32_t num_points)
+volk_32fc_index_max_32u_u_avx2(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
  {
-  const uint32_t num_bytes = num_points*8;
-
-  union bit256 holderf;
-  union bit256 holderi;
-  float sq_dist = 0.0;
+    const uint32_t num_bytes = num_points * 8;
  
-  union bit256 xmm5, xmm4;
-  __m256 xmm1, xmm2, xmm3;
-  __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+    union bit256 holderf;
+    union bit256 holderi;
+    float sq_dist = 0.0;
  
-  xmm5.int_vec = xmmfive = _mm256_setzero_si256();
-  xmm4.int_vec = xmmfour = _mm256_setzero_si256();
-  holderf.int_vec = holder0 = _mm256_setzero_si256();
-  holderi.int_vec = holder1 = _mm256_setzero_si256();
+    union bit256 xmm5, xmm4;
+    __m256 xmm1, xmm2, xmm3;
+    __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
  
-  int bound = num_bytes >> 6;
-  int i = 0;
+    xmm5.int_vec = xmmfive = _mm256_setzero_si256();
+    xmm4.int_vec = xmmfour = _mm256_setzero_si256();
+    holderf.int_vec = holder0 = _mm256_setzero_si256();
+    holderi.int_vec = holder1 = _mm256_setzero_si256();
  
-  xmm8 = _mm256_set_epi32(7,6,5,4,3, 2, 1, 0);
-  xmm9 = _mm256_setzero_si256();
-  xmm10 = _mm256_set1_epi32(8);
-  xmm3 = _mm256_setzero_ps();
-  __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
+    int bound = num_bytes >> 6;
+    int i = 0;
  
-  for(; i < bound; ++i) {
-    xmm1 = _mm256_loadu_ps((float*)src0);
-    xmm2 = _mm256_loadu_ps((float*)&src0[4]);
+    xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+    xmm9 = _mm256_setzero_si256();
+    xmm10 = _mm256_set1_epi32(8);
+    xmm3 = _mm256_setzero_ps();
+    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
  
-    src0 += 8;
+    for (; i < bound; ++i) {
+        xmm1 = _mm256_loadu_ps((float*)src0);
+        xmm2 = _mm256_loadu_ps((float*)&src0[4]);
  
-    xmm1 = _mm256_mul_ps(xmm1, xmm1);
-    xmm2 = _mm256_mul_ps(xmm2, xmm2);
+        src0 += 8;
  
-    xmm1 = _mm256_hadd_ps(xmm1, xmm2);
-    xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
+        xmm1 = _mm256_mul_ps(xmm1, xmm1);
+        xmm2 = _mm256_mul_ps(xmm2, xmm2);
  
-    xmm3 = _mm256_max_ps(xmm1, xmm3);
+        xmm1 = _mm256_hadd_ps(xmm1, xmm2);
+        xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
  
-    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
-    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+        xmm3 = _mm256_max_ps(xmm1, xmm3);
  
-    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
-    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+        xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+        xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
  
-    xmm9 = _mm256_add_epi32(xmm11,  xmm12);
+        xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+        xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
  
-    xmm8 = _mm256_add_epi32(xmm8, xmm10);
-  }
+        xmm9 = _mm256_add_epi32(xmm11, xmm12);
  
-  xmm10 = _mm256_set1_epi32(4);
-  if (num_bytes >> 5 & 1) {
-    xmm1 = _mm256_loadu_ps((float*)src0);
-
-    xmm1 = _mm256_mul_ps(xmm1, xmm1);
+        xmm8 = _mm256_add_epi32(xmm8, xmm10);
+    }
  
-    src0 += 4;
+    xmm10 = _mm256_set1_epi32(4);
+    if (num_bytes >> 4 & 1) {
+        xmm1 = _mm256_loadu_ps((float*)src0);
  
-    xmm1 = _mm256_hadd_ps(xmm1, xmm1);
+        xmm1 = _mm256_mul_ps(xmm1, xmm1);
  
-    xmm3 = _mm256_max_ps(xmm1, xmm3);
+        src0 += 4;
  
-    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
-    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+        xmm1 = _mm256_hadd_ps(xmm1, xmm1);
  
-    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
-    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+        xmm3 = _mm256_max_ps(xmm1, xmm3);
  
-    xmm9 = _mm256_add_epi32(xmm11, xmm12);
+        xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+        xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
  
-    xmm8 = _mm256_add_epi32(xmm8, xmm10);
-  }
+        xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+        xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
  
-  idx = _mm256_set_epi32(1,0,1,0,1,0,1,0);
-  xmm10 = _mm256_set1_epi32(2);
-  if (num_bytes >> 4 & 1) {
-    xmm2 = _mm256_loadu_ps((float*)src0);
+        xmm9 = _mm256_add_epi32(xmm11, xmm12);
  
-    xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
-    xmm8 = bit256_p(&xmm1)->int_vec;
+        xmm8 = _mm256_add_epi32(xmm8, xmm10);
+    }
  
-    xmm2 = _mm256_mul_ps(xmm2, xmm2);
+    idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0);
+    xmm10 = _mm256_set1_epi32(2);
+    if (num_bytes >> 4 & 1) {
+        xmm2 = _mm256_loadu_ps((float*)src0);
  
-    src0 += 2;
+        xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
+        xmm8 = bit256_p(&xmm1)->int_vec;
  
-    xmm1 = _mm256_hadd_ps(xmm2, xmm2);
+        xmm2 = _mm256_mul_ps(xmm2, xmm2);
  
-    xmm3 = _mm256_max_ps(xmm1, xmm3);
+        src0 += 2;
  
-    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
-    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+        xmm1 = _mm256_hadd_ps(xmm2, xmm2);
  
-    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
-    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+        xmm3 = _mm256_max_ps(xmm1, xmm3);
  
-    xmm9 = _mm256_add_epi32(xmm11, xmm12);
+        xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+        xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
  
-    xmm8 = _mm256_add_epi32(xmm8, xmm10);
-  }
+        xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+        xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
  
-  _mm256_storeu_ps((float*)&(holderf.f), xmm3);
-  _mm256_storeu_si256(&(holderi.int_vec), xmm9);
+        xmm9 = _mm256_add_epi32(xmm11, xmm12);
  
-  target[0] = holderi.i[0];
-  sq_dist = holderf.f[0];
-  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
-  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
-  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
-  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
-  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
-  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
-  target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
-  sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
-  target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
-  sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
-  target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
-  sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
-  target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
-  sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
+        xmm8 = _mm256_add_epi32(xmm8, xmm10);
+    }
  
+    _mm256_storeu_ps((float*)&(holderf.f), xmm3);
+    _mm256_storeu_si256(&(holderi.int_vec), xmm9);
+
+    target[0] = holderi.i[0];
+    sq_dist = holderf.f[0];
+    target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
+    sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
+    target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
+    sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
+    target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
+    sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
+    target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
+    sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
+    target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
+    sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
+    target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
+    sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
+    target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
+    sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
  }
  
  #endif /*LV_HAVE_AVX2*/
@@ -523,29 +526,29 @@ volk_32fc_index_max_32u_u_avx2(uint32_t* target, lv_32fc_t* src0,
  #include <arm_neon.h>
  #include <volk/volk_neon_intrinsics.h>
  
-static inline void volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
+static inline void
+volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
  {
      unsigned int number = 0;
      const uint32_t quarter_points = num_points / 4;
      const lv_32fc_t* src0Ptr = src0;
-    
-    uint32_t indices[4] = {0, 1, 2, 3};
+
+    uint32_t indices[4] = { 0, 1, 2, 3 };
      const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
      uint32x4_t vec_indices = vld1q_u32(indices);
      uint32x4_t vec_max_indices = vec_indices;
-    
-    if(num_points)
-    {
+
+    if (num_points) {
          float max = *src0Ptr;
          uint32_t index = 0;
-        
+
          float32x4_t vec_max = vdupq_n_f32(*src0Ptr);
-        
-        for(;number < quarter_points; number++)
-        {
+
+        for (; number < quarter_points; number++) {
              // Load complex and compute magnitude squared
-            const float32x4_t vec_mag2 = _vmagnitudesquaredq_f32(vld2q_f32((float*)src0Ptr));
-            __VOLK_PREFETCH(src0Ptr+=4);
+            const float32x4_t vec_mag2 =
+                _vmagnitudesquaredq_f32(vld2q_f32((float*)src0Ptr));
+            __VOLK_PREFETCH(src0Ptr += 4);
              // a > b?
              const uint32x4_t gt_mask = vcgtq_f32(vec_mag2, vec_max);
              vec_max = vbslq_f32(gt_mask, vec_mag2, vec_max);
@@ -556,20 +559,19 @@ static inline void volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src
          float tmp_max[4];
          vst1q_u32(tmp_max_indices, vec_max_indices);
          vst1q_f32(tmp_max, vec_max);
-        
+
          for (int i = 0; i < 4; i++) {
              if (tmp_max[i] > max) {
                  max = tmp_max[i];
                  index = tmp_max_indices[i];
              }
          }
-        
+
          // Deal with the rest
-        for(number = quarter_points * 4;number < num_points; number++)
-        {
+        for (number = quarter_points * 4; number < num_points; number++) {
              const float re = lv_creal(*src0Ptr);
              const float im = lv_cimag(*src0Ptr);
-            if ((re*re+im*im) > max) {
+            if ((re * re + im * im) > max) {
                  max = *src0Ptr;
                  index = number;
              }
diff --git a/kernels/volk/volk_32fc_magnitude_32f.h b/kernels/volk/volk_32fc_magnitude_32f.h

index 1ba68711535d6e860962b502304ced64a3d74b47..6a0a7d84011bdfc1c25a7c0c4d2536fb23dbcced 100644 (file)
--- a/kernels/volk/volk_32fc_magnitude_32f.h
+++ b/kernels/volk/volk_32fc_magnitude_32f.h
@@ -30,8 +30,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32fc_magnitude_32f(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points)
- * \endcode
+ * void volk_32fc_magnitude_32f(float* magnitudeVector, const lv_32fc_t* complexVector,
+ * unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li complexVector: The complex input vector.
@@ -72,41 +72,41 @@
  #define INCLUDED_volk_32fc_magnitude_32f_u_H
  
  #include <inttypes.h>
-#include <stdio.h>
  #include <math.h>
+#include <stdio.h>
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  #include <volk/volk_avx_intrinsics.h>
  
-static inline void
-volk_32fc_magnitude_32f_u_avx(float* magnitudeVector, const lv_32fc_t* complexVector,
-                              unsigned int num_points)
+static inline void volk_32fc_magnitude_32f_u_avx(float* magnitudeVector,
+                                                 const lv_32fc_t* complexVector,
+                                                 unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-
-  const float* complexVectorPtr = (float*) complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
-
-  __m256 cplxValue1, cplxValue2, result;
-
-  for(; number < eighthPoints; number++){
-    cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
-    cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8);
-    result = _mm256_magnitude_ps(cplxValue1, cplxValue2);
-    _mm256_storeu_ps(magnitudeVectorPtr, result);
-
-    complexVectorPtr += 16;
-    magnitudeVectorPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    float val1Real = *complexVectorPtr++;
-    float val1Imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
-  }
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    __m256 cplxValue1, cplxValue2, result;
+
+    for (; number < eighthPoints; number++) {
+        cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+        cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8);
+        result = _mm256_magnitude_ps(cplxValue1, cplxValue2);
+        _mm256_storeu_ps(magnitudeVectorPtr, result);
+
+        complexVectorPtr += 16;
+        magnitudeVectorPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        float val1Real = *complexVectorPtr++;
+        float val1Imag = *complexVectorPtr++;
+        *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+    }
  }
  #endif /* LV_HAVE_AVX */
  
@@ -114,137 +114,137 @@ volk_32fc_magnitude_32f_u_avx(float* magnitudeVector, const lv_32fc_t* complexVe
  #include <pmmintrin.h>
  #include <volk/volk_sse3_intrinsics.h>
  
-static inline void
-volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector,
-                               unsigned int num_points)
+static inline void volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector,
+                                                  const lv_32fc_t* complexVector,
+                                                  unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  const float* complexVectorPtr = (float*) complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
  
-  __m128 cplxValue1, cplxValue2, result;
-  for(; number < quarterPoints; number++){
-    cplxValue1 = _mm_loadu_ps(complexVectorPtr);
-    complexVectorPtr += 4;
+    __m128 cplxValue1, cplxValue2, result;
+    for (; number < quarterPoints; number++) {
+        cplxValue1 = _mm_loadu_ps(complexVectorPtr);
+        complexVectorPtr += 4;
  
-    cplxValue2 = _mm_loadu_ps(complexVectorPtr);
-    complexVectorPtr += 4;
+        cplxValue2 = _mm_loadu_ps(complexVectorPtr);
+        complexVectorPtr += 4;
  
-    result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2);
+        result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2);
  
-    _mm_storeu_ps(magnitudeVectorPtr, result);
-    magnitudeVectorPtr += 4;
-  }
+        _mm_storeu_ps(magnitudeVectorPtr, result);
+        magnitudeVectorPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    float val1Real = *complexVectorPtr++;
-    float val1Imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        float val1Real = *complexVectorPtr++;
+        float val1Imag = *complexVectorPtr++;
+        *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+    }
  }
  #endif /* LV_HAVE_SSE3 */
  
  
  #ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
  #include <volk/volk_sse_intrinsics.h>
+#include <xmmintrin.h>
  
-static inline void
-volk_32fc_magnitude_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector,
-                              unsigned int num_points)
+static inline void volk_32fc_magnitude_32f_u_sse(float* magnitudeVector,
+                                                 const lv_32fc_t* complexVector,
+                                                 unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  const float* complexVectorPtr = (float*) complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
  
-  __m128 cplxValue1, cplxValue2, result;
+    __m128 cplxValue1, cplxValue2, result;
  
-  for(; number < quarterPoints; number++){
-    cplxValue1 = _mm_loadu_ps(complexVectorPtr);
-    complexVectorPtr += 4;
+    for (; number < quarterPoints; number++) {
+        cplxValue1 = _mm_loadu_ps(complexVectorPtr);
+        complexVectorPtr += 4;
  
-    cplxValue2 = _mm_loadu_ps(complexVectorPtr);
-    complexVectorPtr += 4;
+        cplxValue2 = _mm_loadu_ps(complexVectorPtr);
+        complexVectorPtr += 4;
  
-    result = _mm_magnitude_ps(cplxValue1, cplxValue2);
-    _mm_storeu_ps(magnitudeVectorPtr, result);
-    magnitudeVectorPtr += 4;
-  }
+        result = _mm_magnitude_ps(cplxValue1, cplxValue2);
+        _mm_storeu_ps(magnitudeVectorPtr, result);
+        magnitudeVectorPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    float val1Real = *complexVectorPtr++;
-    float val1Imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        float val1Real = *complexVectorPtr++;
+        float val1Imag = *complexVectorPtr++;
+        *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+    }
  }
  #endif /* LV_HAVE_SSE */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32fc_magnitude_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points)
+static inline void volk_32fc_magnitude_32f_generic(float* magnitudeVector,
+                                                   const lv_32fc_t* complexVector,
+                                                   unsigned int num_points)
  {
-  const float* complexVectorPtr = (float*)complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
-  unsigned int number = 0;
-  for(number = 0; number < num_points; number++){
-    const float real = *complexVectorPtr++;
-    const float imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
-  }
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+    unsigned int number = 0;
+    for (number = 0; number < num_points; number++) {
+        const float real = *complexVectorPtr++;
+        const float imag = *complexVectorPtr++;
+        *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  
-
  #endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */
  #ifndef INCLUDED_volk_32fc_magnitude_32f_a_H
  #define INCLUDED_volk_32fc_magnitude_32f_a_H
  
  #include <inttypes.h>
-#include <stdio.h>
  #include <math.h>
+#include <stdio.h>
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  #include <volk/volk_avx_intrinsics.h>
  
-static inline void
-volk_32fc_magnitude_32f_a_avx(float* magnitudeVector, const lv_32fc_t* complexVector,
-                              unsigned int num_points)
+static inline void volk_32fc_magnitude_32f_a_avx(float* magnitudeVector,
+                                                 const lv_32fc_t* complexVector,
+                                                 unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-
-  const float* complexVectorPtr = (float*) complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
-
-  __m256 cplxValue1, cplxValue2, result;
-  for(; number < eighthPoints; number++){
-    cplxValue1 = _mm256_load_ps(complexVectorPtr);
-    complexVectorPtr += 8;
-
-    cplxValue2 = _mm256_load_ps(complexVectorPtr);
-    complexVectorPtr += 8;
-
-    result = _mm256_magnitude_ps(cplxValue1, cplxValue2);
-    _mm256_store_ps(magnitudeVectorPtr, result);
-    magnitudeVectorPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    float val1Real = *complexVectorPtr++;
-    float val1Imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
-  }
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    __m256 cplxValue1, cplxValue2, result;
+    for (; number < eighthPoints; number++) {
+        cplxValue1 = _mm256_load_ps(complexVectorPtr);
+        complexVectorPtr += 8;
+
+        cplxValue2 = _mm256_load_ps(complexVectorPtr);
+        complexVectorPtr += 8;
+
+        result = _mm256_magnitude_ps(cplxValue1, cplxValue2);
+        _mm256_store_ps(magnitudeVectorPtr, result);
+        magnitudeVectorPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        float val1Real = *complexVectorPtr++;
+        float val1Imag = *complexVectorPtr++;
+        *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+    }
  }
  #endif /* LV_HAVE_AVX */
  
@@ -252,89 +252,89 @@ volk_32fc_magnitude_32f_a_avx(float* magnitudeVector, const lv_32fc_t* complexVe
  #include <pmmintrin.h>
  #include <volk/volk_sse3_intrinsics.h>
  
-static inline void
-volk_32fc_magnitude_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector,
-                               unsigned int num_points)
+static inline void volk_32fc_magnitude_32f_a_sse3(float* magnitudeVector,
+                                                  const lv_32fc_t* complexVector,
+                                                  unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* complexVectorPtr = (float*) complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
-
-  __m128 cplxValue1, cplxValue2, result;
-  for(; number < quarterPoints; number++){
-    cplxValue1 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    cplxValue2 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2);
-    _mm_store_ps(magnitudeVectorPtr, result);
-    magnitudeVectorPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    float val1Real = *complexVectorPtr++;
-    float val1Imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
-  }
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    __m128 cplxValue1, cplxValue2, result;
+    for (; number < quarterPoints; number++) {
+        cplxValue1 = _mm_load_ps(complexVectorPtr);
+        complexVectorPtr += 4;
+
+        cplxValue2 = _mm_load_ps(complexVectorPtr);
+        complexVectorPtr += 4;
+
+        result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2);
+        _mm_store_ps(magnitudeVectorPtr, result);
+        magnitudeVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        float val1Real = *complexVectorPtr++;
+        float val1Imag = *complexVectorPtr++;
+        *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+    }
  }
  #endif /* LV_HAVE_SSE3 */
  
  #ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
  #include <volk/volk_sse_intrinsics.h>
+#include <xmmintrin.h>
  
-static inline void
-volk_32fc_magnitude_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector,
-                              unsigned int num_points)
+static inline void volk_32fc_magnitude_32f_a_sse(float* magnitudeVector,
+                                                 const lv_32fc_t* complexVector,
+                                                 unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* complexVectorPtr = (float*) complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
-
-  __m128 cplxValue1, cplxValue2, result;
-  for(; number < quarterPoints; number++){
-    cplxValue1 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    cplxValue2 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    result = _mm_magnitude_ps(cplxValue1, cplxValue2);
-    _mm_store_ps(magnitudeVectorPtr, result);
-    magnitudeVectorPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    float val1Real = *complexVectorPtr++;
-    float val1Imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
-  }
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    __m128 cplxValue1, cplxValue2, result;
+    for (; number < quarterPoints; number++) {
+        cplxValue1 = _mm_load_ps(complexVectorPtr);
+        complexVectorPtr += 4;
+
+        cplxValue2 = _mm_load_ps(complexVectorPtr);
+        complexVectorPtr += 4;
+
+        result = _mm_magnitude_ps(cplxValue1, cplxValue2);
+        _mm_store_ps(magnitudeVectorPtr, result);
+        magnitudeVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        float val1Real = *complexVectorPtr++;
+        float val1Imag = *complexVectorPtr++;
+        *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+    }
  }
  #endif /* LV_HAVE_SSE */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32fc_magnitude_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector,
-                                  unsigned int num_points)
+static inline void volk_32fc_magnitude_32f_a_generic(float* magnitudeVector,
+                                                     const lv_32fc_t* complexVector,
+                                                     unsigned int num_points)
  {
-  const float* complexVectorPtr = (float*)complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
-  unsigned int number = 0;
-  for(number = 0; number < num_points; number++){
-    const float real = *complexVectorPtr++;
-    const float imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
-  }
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+    unsigned int number = 0;
+    for (number = 0; number < num_points; number++) {
+        const float real = *complexVectorPtr++;
+        const float imag = *complexVectorPtr++;
+        *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -342,41 +342,43 @@ volk_32fc_magnitude_32f_a_generic(float* magnitudeVector, const lv_32fc_t* compl
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void
-volk_32fc_magnitude_32f_neon(float* magnitudeVector, const lv_32fc_t* complexVector,
-                             unsigned int num_points)
+static inline void volk_32fc_magnitude_32f_neon(float* magnitudeVector,
+                                                const lv_32fc_t* complexVector,
+                                                unsigned int num_points)
  {
-  unsigned int number;
-  unsigned int quarter_points = num_points / 4;
-  const float* complexVectorPtr = (float*)complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
-
-  float32x4x2_t complex_vec;
-  float32x4_t magnitude_vec;
-  for(number = 0; number < quarter_points; number++){
-    complex_vec = vld2q_f32(complexVectorPtr);
-    complex_vec.val[0] = vmulq_f32(complex_vec.val[0], complex_vec.val[0]);
-    magnitude_vec = vmlaq_f32(complex_vec.val[0], complex_vec.val[1], complex_vec.val[1]);
-    magnitude_vec = vrsqrteq_f32(magnitude_vec);
-    magnitude_vec = vrecpeq_f32( magnitude_vec ); // no plain ol' sqrt
-    vst1q_f32(magnitudeVectorPtr, magnitude_vec);
-
-    complexVectorPtr += 8;
-    magnitudeVectorPtr += 4;
-  }
-
-  for(number = quarter_points*4; number < num_points; number++){
-    const float real = *complexVectorPtr++;
-    const float imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
-  }
+    unsigned int number;
+    unsigned int quarter_points = num_points / 4;
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    float32x4x2_t complex_vec;
+    float32x4_t magnitude_vec;
+    for (number = 0; number < quarter_points; number++) {
+        complex_vec = vld2q_f32(complexVectorPtr);
+        complex_vec.val[0] = vmulq_f32(complex_vec.val[0], complex_vec.val[0]);
+        magnitude_vec =
+            vmlaq_f32(complex_vec.val[0], complex_vec.val[1], complex_vec.val[1]);
+        magnitude_vec = vrsqrteq_f32(magnitude_vec);
+        magnitude_vec = vrecpeq_f32(magnitude_vec); // no plain ol' sqrt
+        vst1q_f32(magnitudeVectorPtr, magnitude_vec);
+
+        complexVectorPtr += 8;
+        magnitudeVectorPtr += 4;
+    }
+
+    for (number = quarter_points * 4; number < num_points; number++) {
+        const float real = *complexVectorPtr++;
+        const float imag = *complexVectorPtr++;
+        *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
+    }
  }
  #endif /* LV_HAVE_NEON */
  
  
  #ifdef LV_HAVE_NEON
  /*!
-  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+  \brief Calculates the magnitude of the complexVector and stores the results in the
+  magnitudeVector
  
    This is an approximation from "Streamlining Digital Signal Processing" by
    Richard Lyons. Apparently max error is about 1% and mean error is about 0.6%.
@@ -387,80 +389,80 @@ volk_32fc_magnitude_32f_neon(float* magnitudeVector, const lv_32fc_t* complexVec
  
    \param complexVector The vector containing the complex input values
    \param magnitudeVector The vector containing the real output values
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+  \param num_points The number of complex values in complexVector to be calculated and
+  stored into cVector
  */
-static inline void
-volk_32fc_magnitude_32f_neon_fancy_sweet(float* magnitudeVector, const lv_32fc_t* complexVector,
-                                         unsigned int num_points)
+static inline void volk_32fc_magnitude_32f_neon_fancy_sweet(
+    float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points)
  {
-  unsigned int number;
-  unsigned int quarter_points = num_points / 4;
-  const float* complexVectorPtr = (float*)complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
-
-  const float threshold = 0.4142135;
-
-  float32x4_t a_vec, b_vec, a_high, a_low, b_high, b_low;
-  a_high = vdupq_n_f32( 0.84 );
-  b_high = vdupq_n_f32( 0.561);
-  a_low  = vdupq_n_f32( 0.99 );
-  b_low  = vdupq_n_f32( 0.197);
-
-  uint32x4_t comp0, comp1;
-
-  float32x4x2_t complex_vec;
-  float32x4_t min_vec, max_vec, magnitude_vec;
-  float32x4_t real_abs, imag_abs;
-  for(number = 0; number < quarter_points; number++){
-    complex_vec = vld2q_f32(complexVectorPtr);
-
-    real_abs = vabsq_f32(complex_vec.val[0]);
-    imag_abs = vabsq_f32(complex_vec.val[1]);
-
-    min_vec = vminq_f32(real_abs, imag_abs);
-    max_vec = vmaxq_f32(real_abs, imag_abs);
-
-    // effective branch to choose coefficient pair.
-    comp0 = vcgtq_f32(min_vec, vmulq_n_f32(max_vec, threshold));
-    comp1 = vcleq_f32(min_vec, vmulq_n_f32(max_vec, threshold));
-
-    // and 0s or 1s with coefficients from previous effective branch
-    a_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)a_high),
-                                   vandq_s32((int32x4_t)comp1, (int32x4_t)a_low));
-    b_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)b_high),
-                                   vandq_s32((int32x4_t)comp1, (int32x4_t)b_low));
-
-    // coefficients chosen, do the weighted sum
-    min_vec = vmulq_f32(min_vec, b_vec);
-    max_vec = vmulq_f32(max_vec, a_vec);
-
-    magnitude_vec = vaddq_f32(min_vec, max_vec);
-    vst1q_f32(magnitudeVectorPtr, magnitude_vec);
-
-    complexVectorPtr += 8;
-    magnitudeVectorPtr += 4;
-  }
-
-  for(number = quarter_points*4; number < num_points; number++){
-    const float real = *complexVectorPtr++;
-    const float imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
-  }
+    unsigned int number;
+    unsigned int quarter_points = num_points / 4;
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    const float threshold = 0.4142135;
+
+    float32x4_t a_vec, b_vec, a_high, a_low, b_high, b_low;
+    a_high = vdupq_n_f32(0.84);
+    b_high = vdupq_n_f32(0.561);
+    a_low = vdupq_n_f32(0.99);
+    b_low = vdupq_n_f32(0.197);
+
+    uint32x4_t comp0, comp1;
+
+    float32x4x2_t complex_vec;
+    float32x4_t min_vec, max_vec, magnitude_vec;
+    float32x4_t real_abs, imag_abs;
+    for (number = 0; number < quarter_points; number++) {
+        complex_vec = vld2q_f32(complexVectorPtr);
+
+        real_abs = vabsq_f32(complex_vec.val[0]);
+        imag_abs = vabsq_f32(complex_vec.val[1]);
+
+        min_vec = vminq_f32(real_abs, imag_abs);
+        max_vec = vmaxq_f32(real_abs, imag_abs);
+
+        // effective branch to choose coefficient pair.
+        comp0 = vcgtq_f32(min_vec, vmulq_n_f32(max_vec, threshold));
+        comp1 = vcleq_f32(min_vec, vmulq_n_f32(max_vec, threshold));
+
+        // and 0s or 1s with coefficients from previous effective branch
+        a_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)a_high),
+                                       vandq_s32((int32x4_t)comp1, (int32x4_t)a_low));
+        b_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)b_high),
+                                       vandq_s32((int32x4_t)comp1, (int32x4_t)b_low));
+
+        // coefficients chosen, do the weighted sum
+        min_vec = vmulq_f32(min_vec, b_vec);
+        max_vec = vmulq_f32(max_vec, a_vec);
+
+        magnitude_vec = vaddq_f32(min_vec, max_vec);
+        vst1q_f32(magnitudeVectorPtr, magnitude_vec);
+
+        complexVectorPtr += 8;
+        magnitudeVectorPtr += 4;
+    }
+
+    for (number = quarter_points * 4; number < num_points; number++) {
+        const float real = *complexVectorPtr++;
+        const float imag = *complexVectorPtr++;
+        *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
+    }
  }
  #endif /* LV_HAVE_NEON */
  
  
  #ifdef LV_HAVE_ORC
  
-extern void
-volk_32fc_magnitude_32f_a_orc_impl(float* magnitudeVector, const lv_32fc_t* complexVector,
-                                   unsigned int num_points);
+extern void volk_32fc_magnitude_32f_a_orc_impl(float* magnitudeVector,
+                                               const lv_32fc_t* complexVector,
+                                               unsigned int num_points);
  
-static inline void
-volk_32fc_magnitude_32f_u_orc(float* magnitudeVector, const lv_32fc_t* complexVector,
-                              unsigned int num_points)
+static inline void volk_32fc_magnitude_32f_u_orc(float* magnitudeVector,
+                                                 const lv_32fc_t* complexVector,
+                                                 unsigned int num_points)
  {
-  volk_32fc_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, num_points);
+    volk_32fc_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, num_points);
  }
  #endif /* LV_HAVE_ORC */
  
diff --git a/kernels/volk/volk_32fc_magnitude_squared_32f.h b/kernels/volk/volk_32fc_magnitude_squared_32f.h

index 51bb4dfc62c3a6d6c49c0f1aa4acfbf62910065d..cb093ca76989ea3b535934ab16c5a60c06871621 100644 (file)
--- a/kernels/volk/volk_32fc_magnitude_squared_32f.h
+++ b/kernels/volk/volk_32fc_magnitude_squared_32f.h
@@ -30,8 +30,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32fc_magnitude_squared_32f(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points)
- * \endcode
+ * void volk_32fc_magnitude_squared_32f(float* magnitudeVector, const lv_32fc_t*
+ * complexVector, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li complexVector: The complex input vector.
@@ -72,41 +72,41 @@
  #define INCLUDED_volk_32fc_magnitude_squared_32f_u_H
  
  #include <inttypes.h>
-#include <stdio.h>
  #include <math.h>
+#include <stdio.h>
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  #include <volk/volk_avx_intrinsics.h>
  
-static inline void
-volk_32fc_magnitude_squared_32f_u_avx(float* magnitudeVector, const lv_32fc_t* complexVector,
-                                      unsigned int num_points)
+static inline void volk_32fc_magnitude_squared_32f_u_avx(float* magnitudeVector,
+                                                         const lv_32fc_t* complexVector,
+                                                         unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-
-  const float* complexVectorPtr = (float*) complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
-
-  __m256 cplxValue1, cplxValue2, result;
-
-  for(; number < eighthPoints; number++){
-    cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
-    cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8);
-    result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2);
-    _mm256_storeu_ps(magnitudeVectorPtr, result);
-
-    complexVectorPtr += 16;
-    magnitudeVectorPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    float val1Real = *complexVectorPtr++;
-    float val1Imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
-  }
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    __m256 cplxValue1, cplxValue2, result;
+
+    for (; number < eighthPoints; number++) {
+        cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+        cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8);
+        result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2);
+        _mm256_storeu_ps(magnitudeVectorPtr, result);
+
+        complexVectorPtr += 16;
+        magnitudeVectorPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        float val1Real = *complexVectorPtr++;
+        float val1Imag = *complexVectorPtr++;
+        *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
@@ -115,137 +115,136 @@ volk_32fc_magnitude_squared_32f_u_avx(float* magnitudeVector, const lv_32fc_t* c
  #include <pmmintrin.h>
  #include <volk/volk_sse3_intrinsics.h>
  
-static inline void
-volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector,
-                                       unsigned int num_points)
+static inline void volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector,
+                                                          const lv_32fc_t* complexVector,
+                                                          unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* complexVectorPtr = (float*) complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
-
-  __m128 cplxValue1, cplxValue2, result;
-  for(; number < quarterPoints; number++){
-    cplxValue1 = _mm_loadu_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    cplxValue2 = _mm_loadu_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    result = _mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2);
-    _mm_storeu_ps(magnitudeVectorPtr, result);
-    magnitudeVectorPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    float val1Real = *complexVectorPtr++;
-    float val1Imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
-  }
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    __m128 cplxValue1, cplxValue2, result;
+    for (; number < quarterPoints; number++) {
+        cplxValue1 = _mm_loadu_ps(complexVectorPtr);
+        complexVectorPtr += 4;
+
+        cplxValue2 = _mm_loadu_ps(complexVectorPtr);
+        complexVectorPtr += 4;
+
+        result = _mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2);
+        _mm_storeu_ps(magnitudeVectorPtr, result);
+        magnitudeVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        float val1Real = *complexVectorPtr++;
+        float val1Imag = *complexVectorPtr++;
+        *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+    }
  }
  #endif /* LV_HAVE_SSE3 */
  
  
  #ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
  #include <volk/volk_sse_intrinsics.h>
+#include <xmmintrin.h>
  
-static inline void
-volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector,
-                                      unsigned int num_points)
+static inline void volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector,
+                                                         const lv_32fc_t* complexVector,
+                                                         unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  const float* complexVectorPtr = (float*) complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
  
-  __m128 cplxValue1, cplxValue2, result;
+    __m128 cplxValue1, cplxValue2, result;
  
-  for(; number < quarterPoints; number++){
-    cplxValue1 = _mm_loadu_ps(complexVectorPtr);
-    complexVectorPtr += 4;
+    for (; number < quarterPoints; number++) {
+        cplxValue1 = _mm_loadu_ps(complexVectorPtr);
+        complexVectorPtr += 4;
  
-    cplxValue2 = _mm_loadu_ps(complexVectorPtr);
-    complexVectorPtr += 4;
+        cplxValue2 = _mm_loadu_ps(complexVectorPtr);
+        complexVectorPtr += 4;
  
-    result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2);
-    _mm_storeu_ps(magnitudeVectorPtr, result);
-    magnitudeVectorPtr += 4;
-  }
+        result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2);
+        _mm_storeu_ps(magnitudeVectorPtr, result);
+        magnitudeVectorPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    float val1Real = *complexVectorPtr++;
-    float val1Imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        float val1Real = *complexVectorPtr++;
+        float val1Imag = *complexVectorPtr++;
+        *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+    }
  }
  #endif /* LV_HAVE_SSE */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32fc_magnitude_squared_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector,
-                                        unsigned int num_points)
+static inline void volk_32fc_magnitude_squared_32f_generic(float* magnitudeVector,
+                                                           const lv_32fc_t* complexVector,
+                                                           unsigned int num_points)
  {
-  const float* complexVectorPtr = (float*)complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
-  unsigned int number = 0;
-  for(number = 0; number < num_points; number++){
-    const float real = *complexVectorPtr++;
-    const float imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = (real*real) + (imag*imag);
-  }
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+    unsigned int number = 0;
+    for (number = 0; number < num_points; number++) {
+        const float real = *complexVectorPtr++;
+        const float imag = *complexVectorPtr++;
+        *magnitudeVectorPtr++ = (real * real) + (imag * imag);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  
-
  #endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */
  #ifndef INCLUDED_volk_32fc_magnitude_squared_32f_a_H
  #define INCLUDED_volk_32fc_magnitude_squared_32f_a_H
  
  #include <inttypes.h>
-#include <stdio.h>
  #include <math.h>
+#include <stdio.h>
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  #include <volk/volk_avx_intrinsics.h>
  
-static inline void
-volk_32fc_magnitude_squared_32f_a_avx(float* magnitudeVector, const lv_32fc_t* complexVector,
-                                      unsigned int num_points)
+static inline void volk_32fc_magnitude_squared_32f_a_avx(float* magnitudeVector,
+                                                         const lv_32fc_t* complexVector,
+                                                         unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-
-  const float* complexVectorPtr = (float*) complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
-
-  __m256 cplxValue1, cplxValue2, result;
-  for(; number < eighthPoints; number++){
-    cplxValue1 = _mm256_load_ps(complexVectorPtr);
-    complexVectorPtr += 8;
-
-    cplxValue2 = _mm256_load_ps(complexVectorPtr);
-    complexVectorPtr += 8;
-
-    result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2);
-    _mm256_store_ps(magnitudeVectorPtr, result);
-    magnitudeVectorPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    float val1Real = *complexVectorPtr++;
-    float val1Imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
-  }
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    __m256 cplxValue1, cplxValue2, result;
+    for (; number < eighthPoints; number++) {
+        cplxValue1 = _mm256_load_ps(complexVectorPtr);
+        complexVectorPtr += 8;
+
+        cplxValue2 = _mm256_load_ps(complexVectorPtr);
+        complexVectorPtr += 8;
+
+        result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2);
+        _mm256_store_ps(magnitudeVectorPtr, result);
+        magnitudeVectorPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        float val1Real = *complexVectorPtr++;
+        float val1Imag = *complexVectorPtr++;
+        *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
@@ -254,72 +253,72 @@ volk_32fc_magnitude_squared_32f_a_avx(float* magnitudeVector, const lv_32fc_t* c
  #include <pmmintrin.h>
  #include <volk/volk_sse3_intrinsics.h>
  
-static inline void
-volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector,
-                                       unsigned int num_points)
+static inline void volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector,
+                                                          const lv_32fc_t* complexVector,
+                                                          unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* complexVectorPtr = (float*) complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
-
-  __m128 cplxValue1, cplxValue2, result;
-  for(; number < quarterPoints; number++){
-    cplxValue1 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    cplxValue2 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    result = _mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2);
-    _mm_store_ps(magnitudeVectorPtr, result);
-    magnitudeVectorPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    float val1Real = *complexVectorPtr++;
-    float val1Imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
-  }
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    __m128 cplxValue1, cplxValue2, result;
+    for (; number < quarterPoints; number++) {
+        cplxValue1 = _mm_load_ps(complexVectorPtr);
+        complexVectorPtr += 4;
+
+        cplxValue2 = _mm_load_ps(complexVectorPtr);
+        complexVectorPtr += 4;
+
+        result = _mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2);
+        _mm_store_ps(magnitudeVectorPtr, result);
+        magnitudeVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        float val1Real = *complexVectorPtr++;
+        float val1Imag = *complexVectorPtr++;
+        *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+    }
  }
  #endif /* LV_HAVE_SSE3 */
  
  
  #ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
  #include <volk/volk_sse_intrinsics.h>
+#include <xmmintrin.h>
  
-static inline void
-volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector,
-                                      unsigned int num_points)
+static inline void volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector,
+                                                         const lv_32fc_t* complexVector,
+                                                         unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* complexVectorPtr = (float*)complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
-
-  __m128 cplxValue1, cplxValue2, result;
-  for(;number < quarterPoints; number++){
-    cplxValue1 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    cplxValue2 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2);
-    _mm_store_ps(magnitudeVectorPtr, result);
-    magnitudeVectorPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    float val1Real = *complexVectorPtr++;
-    float val1Imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
-  }
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    __m128 cplxValue1, cplxValue2, result;
+    for (; number < quarterPoints; number++) {
+        cplxValue1 = _mm_load_ps(complexVectorPtr);
+        complexVectorPtr += 4;
+
+        cplxValue2 = _mm_load_ps(complexVectorPtr);
+        complexVectorPtr += 4;
+
+        result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2);
+        _mm_store_ps(magnitudeVectorPtr, result);
+        magnitudeVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        float val1Real = *complexVectorPtr++;
+        float val1Imag = *complexVectorPtr++;
+        *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+    }
  }
  #endif /* LV_HAVE_SSE */
  
@@ -327,55 +326,57 @@ volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* c
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void
-volk_32fc_magnitude_squared_32f_neon(float* magnitudeVector, const lv_32fc_t* complexVector,
-                                     unsigned int num_points)
+static inline void volk_32fc_magnitude_squared_32f_neon(float* magnitudeVector,
+                                                        const lv_32fc_t* complexVector,
+                                                        unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* complexVectorPtr = (float*)complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
-
-  float32x4x2_t cmplx_val;
-  float32x4_t result;
-  for(;number < quarterPoints; number++){
-    cmplx_val = vld2q_f32(complexVectorPtr);
-    complexVectorPtr += 8;
-
-    cmplx_val.val[0] = vmulq_f32(cmplx_val.val[0], cmplx_val.val[0]); // Square the values
-    cmplx_val.val[1] = vmulq_f32(cmplx_val.val[1], cmplx_val.val[1]); // Square the values
-
-    result = vaddq_f32(cmplx_val.val[0], cmplx_val.val[1]); // Add the I2 and Q2 values
-
-    vst1q_f32(magnitudeVectorPtr, result);
-    magnitudeVectorPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    float val1Real = *complexVectorPtr++;
-    float val1Imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
-  }
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    float32x4x2_t cmplx_val;
+    float32x4_t result;
+    for (; number < quarterPoints; number++) {
+        cmplx_val = vld2q_f32(complexVectorPtr);
+        complexVectorPtr += 8;
+
+        cmplx_val.val[0] =
+            vmulq_f32(cmplx_val.val[0], cmplx_val.val[0]); // Square the values
+        cmplx_val.val[1] =
+            vmulq_f32(cmplx_val.val[1], cmplx_val.val[1]); // Square the values
+
+        result =
+            vaddq_f32(cmplx_val.val[0], cmplx_val.val[1]); // Add the I2 and Q2 values
+
+        vst1q_f32(magnitudeVectorPtr, result);
+        magnitudeVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        float val1Real = *complexVectorPtr++;
+        float val1Imag = *complexVectorPtr++;
+        *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+    }
  }
  #endif /* LV_HAVE_NEON */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector,
-                                          unsigned int num_points)
+static inline void volk_32fc_magnitude_squared_32f_a_generic(
+    float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points)
  {
-  const float* complexVectorPtr = (float*)complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
-  unsigned int number = 0;
-  for(number = 0; number < num_points; number++){
-    const float real = *complexVectorPtr++;
-    const float imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = (real*real) + (imag*imag);
-  }
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+    unsigned int number = 0;
+    for (number = 0; number < num_points; number++) {
+        const float real = *complexVectorPtr++;
+        const float imag = *complexVectorPtr++;
+        *magnitudeVectorPtr++ = (real * real) + (imag * imag);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
diff --git a/kernels/volk/volk_32fc_s32f_atan2_32f.h b/kernels/volk/volk_32fc_s32f_atan2_32f.h

index c1693368804e77c131726208f5720464ee6b8f22..f08f79384c1fcae3efebc34c8ab12731f400e925 100644 (file)
--- a/kernels/volk/volk_32fc_s32f_atan2_32f.h
+++ b/kernels/volk/volk_32fc_s32f_atan2_32f.h
@@ -30,13 +30,13 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32fc_s32f_atan2_32f(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points)
- * \endcode
+ * void volk_32fc_s32f_atan2_32f(float* outputVector, const lv_32fc_t* complexVector,
+ * const float normalizeFactor, unsigned int num_points) \endcode
   *
   * \b Inputs
- * \li inputVector: The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin).
- * \li normalizeFactor: The atan results are divided by this normalization factor.
- * \li num_points: The number of complex values in \p inputVector.
+ * \li inputVector: The byte-aligned input vector containing interleaved IQ data (I = cos,
+ * Q = sin). \li normalizeFactor: The atan results are divided by this normalization
+ * factor. \li num_points: The number of complex values in \p inputVector.
   *
   * \b Outputs
   * \li outputVector: The vector where the results will be stored.
@@ -75,8 +75,8 @@
  #define INCLUDED_volk_32fc_s32f_atan2_32f_a_H
  
  #include <inttypes.h>
-#include <stdio.h>
  #include <math.h>
+#include <stdio.h>
  
  #ifdef LV_HAVE_SSE4_1
  #include <smmintrin.h>
@@ -85,50 +85,54 @@
  #include <simdmath.h>
  #endif /* LV_HAVE_LIB_SIMDMATH */
  
-static inline void volk_32fc_s32f_atan2_32f_a_sse4_1(float* outputVector,  const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){
-  const float* complexVectorPtr = (float*)complexVector;
-  float* outPtr = outputVector;
+static inline void volk_32fc_s32f_atan2_32f_a_sse4_1(float* outputVector,
+                                                     const lv_32fc_t* complexVector,
+                                                     const float normalizeFactor,
+                                                     unsigned int num_points)
+{
+    const float* complexVectorPtr = (float*)complexVector;
+    float* outPtr = outputVector;
  
-  unsigned int number = 0;
-  const float invNormalizeFactor = 1.0 / normalizeFactor;
+    unsigned int number = 0;
+    const float invNormalizeFactor = 1.0 / normalizeFactor;
  
  #ifdef LV_HAVE_LIB_SIMDMATH
-  const unsigned int quarterPoints = num_points / 4;
-  __m128 testVector = _mm_set_ps1(2*M_PI);
-  __m128 correctVector = _mm_set_ps1(M_PI);
-  __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor);
-  __m128 phase;
-  __m128 complex1, complex2, iValue, qValue;
-  __m128 keepMask;
-
-  for (; number < quarterPoints; number++) {
-    // Load IQ data:
-    complex1 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-    complex2 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-    // Deinterleave IQ data:
-    iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0));
-    qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1));
-    // Arctan to get phase:
-    phase = atan2f4(qValue, iValue);
-    // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi.
-    // Compare to 2pi:
-    keepMask = _mm_cmpneq_ps(phase,testVector);
-    phase = _mm_blendv_ps(correctVector, phase, keepMask);
-    // done with above correction.
-    phase = _mm_mul_ps(phase, vNormalizeFactor);
-    _mm_store_ps((float*)outPtr, phase);
-    outPtr += 4;
-  }
-  number = quarterPoints * 4;
+    const unsigned int quarterPoints = num_points / 4;
+    __m128 testVector = _mm_set_ps1(2 * M_PI);
+    __m128 correctVector = _mm_set_ps1(M_PI);
+    __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor);
+    __m128 phase;
+    __m128 complex1, complex2, iValue, qValue;
+    __m128 keepMask;
+
+    for (; number < quarterPoints; number++) {
+        // Load IQ data:
+        complex1 = _mm_load_ps(complexVectorPtr);
+        complexVectorPtr += 4;
+        complex2 = _mm_load_ps(complexVectorPtr);
+        complexVectorPtr += 4;
+        // Deinterleave IQ data:
+        iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2, 0, 2, 0));
+        qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3, 1, 3, 1));
+        // Arctan to get phase:
+        phase = atan2f4(qValue, iValue);
+        // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi.
+        // Compare to 2pi:
+        keepMask = _mm_cmpneq_ps(phase, testVector);
+        phase = _mm_blendv_ps(correctVector, phase, keepMask);
+        // done with above correction.
+        phase = _mm_mul_ps(phase, vNormalizeFactor);
+        _mm_store_ps((float*)outPtr, phase);
+        outPtr += 4;
+    }
+    number = quarterPoints * 4;
  #endif /* LV_HAVE_SIMDMATH_H */
  
-  for (; number < num_points; number++) {
-    const float real = *complexVectorPtr++;
-    const float imag = *complexVectorPtr++;
-    *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
-  }
+    for (; number < num_points; number++) {
+        const float real = *complexVectorPtr++;
+        const float imag = *complexVectorPtr++;
+        *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
+    }
  }
  #endif /* LV_HAVE_SSE4_1 */
  
@@ -140,72 +144,78 @@ static inline void volk_32fc_s32f_atan2_32f_a_sse4_1(float* outputVector,  const
  #include <simdmath.h>
  #endif /* LV_HAVE_LIB_SIMDMATH */
  
-static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector,  const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){
-  const float* complexVectorPtr = (float*)complexVector;
-  float* outPtr = outputVector;
+static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector,
+                                                  const lv_32fc_t* complexVector,
+                                                  const float normalizeFactor,
+                                                  unsigned int num_points)
+{
+    const float* complexVectorPtr = (float*)complexVector;
+    float* outPtr = outputVector;
  
-  unsigned int number = 0;
-  const float invNormalizeFactor = 1.0 / normalizeFactor;
+    unsigned int number = 0;
+    const float invNormalizeFactor = 1.0 / normalizeFactor;
  
  #ifdef LV_HAVE_LIB_SIMDMATH
-  const unsigned int quarterPoints = num_points / 4;
-  __m128 testVector = _mm_set_ps1(2*M_PI);
-  __m128 correctVector = _mm_set_ps1(M_PI);
-  __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor);
-  __m128 phase;
-  __m128 complex1, complex2, iValue, qValue;
-  __m128 mask;
-  __m128 keepMask;
-
-  for (; number < quarterPoints; number++) {
-    // Load IQ data:
-    complex1 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-    complex2 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-    // Deinterleave IQ data:
-    iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0));
-    qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1));
-    // Arctan to get phase:
-    phase = atan2f4(qValue, iValue);
-    // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi.
-    // Compare to 2pi:
-    keepMask = _mm_cmpneq_ps(phase,testVector);
-    phase = _mm_and_ps(phase, keepMask);
-    mask = _mm_andnot_ps(keepMask, correctVector);
-    phase = _mm_or_ps(phase, mask);
-    // done with above correction.
-    phase = _mm_mul_ps(phase, vNormalizeFactor);
-    _mm_store_ps((float*)outPtr, phase);
-    outPtr += 4;
-  }
-  number = quarterPoints * 4;
+    const unsigned int quarterPoints = num_points / 4;
+    __m128 testVector = _mm_set_ps1(2 * M_PI);
+    __m128 correctVector = _mm_set_ps1(M_PI);
+    __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor);
+    __m128 phase;
+    __m128 complex1, complex2, iValue, qValue;
+    __m128 mask;
+    __m128 keepMask;
+
+    for (; number < quarterPoints; number++) {
+        // Load IQ data:
+        complex1 = _mm_load_ps(complexVectorPtr);
+        complexVectorPtr += 4;
+        complex2 = _mm_load_ps(complexVectorPtr);
+        complexVectorPtr += 4;
+        // Deinterleave IQ data:
+        iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2, 0, 2, 0));
+        qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3, 1, 3, 1));
+        // Arctan to get phase:
+        phase = atan2f4(qValue, iValue);
+        // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi.
+        // Compare to 2pi:
+        keepMask = _mm_cmpneq_ps(phase, testVector);
+        phase = _mm_and_ps(phase, keepMask);
+        mask = _mm_andnot_ps(keepMask, correctVector);
+        phase = _mm_or_ps(phase, mask);
+        // done with above correction.
+        phase = _mm_mul_ps(phase, vNormalizeFactor);
+        _mm_store_ps((float*)outPtr, phase);
+        outPtr += 4;
+    }
+    number = quarterPoints * 4;
  #endif /* LV_HAVE_SIMDMATH_H */
  
-  for (; number < num_points; number++) {
-    const float real = *complexVectorPtr++;
-    const float imag = *complexVectorPtr++;
-    *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
-  }
+    for (; number < num_points; number++) {
+        const float real = *complexVectorPtr++;
+        const float imag = *complexVectorPtr++;
+        *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
+    }
  }
  #endif /* LV_HAVE_SSE */
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){
-  float* outPtr = outputVector;
-  const float* inPtr = (float*)inputVector;
-  const float invNormalizeFactor = 1.0 / normalizeFactor;
-  unsigned int number;
-  for ( number = 0; number < num_points; number++) {
-    const float real = *inPtr++;
-    const float imag = *inPtr++;
-    *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
-  }
+static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector,
+                                                    const lv_32fc_t* inputVector,
+                                                    const float normalizeFactor,
+                                                    unsigned int num_points)
+{
+    float* outPtr = outputVector;
+    const float* inPtr = (float*)inputVector;
+    const float invNormalizeFactor = 1.0 / normalizeFactor;
+    unsigned int number;
+    for (number = 0; number < num_points; number++) {
+        const float real = *inPtr++;
+        const float imag = *inPtr++;
+        *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  
-
-
  #endif /* INCLUDED_volk_32fc_s32f_atan2_32f_a_H */
diff --git a/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h b/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h

index 64c6a8b8f10753a99ac828a595aae2e43b0af472..f70f494f03e68256db0a6039e4f59d1ca76923a0 100644 (file)
--- a/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h
+++ b/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h
@@ -30,8 +30,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32fc_s32f_deinterleave_real_16i(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points)
- * \endcode
+ * void volk_32fc_s32f_deinterleave_real_16i(int16_t* iBuffer, const lv_32fc_t*
+ * complexVector, const float scalar, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li complexVector: The complex input vector.
@@ -73,61 +73,62 @@
  #ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H
  #define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H
  
-#include <volk/volk_common.h>
  #include <inttypes.h>
  #include <stdio.h>
+#include <volk/volk_common.h>
  
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
  static inline void
-volk_32fc_s32f_deinterleave_real_16i_a_avx2(int16_t* iBuffer, const lv_32fc_t* complexVector,
-                                           const float scalar, unsigned int num_points)
+volk_32fc_s32f_deinterleave_real_16i_a_avx2(int16_t* iBuffer,
+                                            const lv_32fc_t* complexVector,
+                                            const float scalar,
+                                            unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-
-  const float* complexVectorPtr = (float*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  __m256 vScalar = _mm256_set1_ps(scalar);
+    const float* complexVectorPtr = (float*)complexVector;
+    int16_t* iBufferPtr = iBuffer;
  
-  __m256 cplxValue1, cplxValue2, iValue;
-  __m256i a;
-  __m128i b;
+    __m256 vScalar = _mm256_set1_ps(scalar);
  
-  __m256i idx = _mm256_set_epi32(3,3,3,3,5,1,4,0);
+    __m256 cplxValue1, cplxValue2, iValue;
+    __m256i a;
+    __m128i b;
  
-  for(;number < eighthPoints; number++){
-    cplxValue1 = _mm256_load_ps(complexVectorPtr);
-    complexVectorPtr += 8;
+    __m256i idx = _mm256_set_epi32(3, 3, 3, 3, 5, 1, 4, 0);
  
-    cplxValue2 = _mm256_load_ps(complexVectorPtr);
-    complexVectorPtr += 8;
+    for (; number < eighthPoints; number++) {
+        cplxValue1 = _mm256_load_ps(complexVectorPtr);
+        complexVectorPtr += 8;
  
-    // Arrange in i1i2i3i4 format
-    iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+        cplxValue2 = _mm256_load_ps(complexVectorPtr);
+        complexVectorPtr += 8;
  
-    iValue = _mm256_mul_ps(iValue, vScalar);
+        // Arrange in i1i2i3i4 format
+        iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
  
-    iValue = _mm256_round_ps(iValue, _MM_FROUND_TO_ZERO);
-    a = _mm256_cvtps_epi32(iValue);
-    a = _mm256_packs_epi32(a,a);
-    a = _mm256_permutevar8x32_epi32(a,idx);
-    b = _mm256_extracti128_si256(a,0);
+        iValue = _mm256_mul_ps(iValue, vScalar);
  
-    _mm_store_si128((__m128i*)iBufferPtr,b); 
-    iBufferPtr += 8;
+        iValue = _mm256_round_ps(iValue, _MM_FROUND_TO_ZERO);
+        a = _mm256_cvtps_epi32(iValue);
+        a = _mm256_packs_epi32(a, a);
+        a = _mm256_permutevar8x32_epi32(a, idx);
+        b = _mm256_extracti128_si256(a, 0);
  
-  }
+        _mm_store_si128((__m128i*)iBufferPtr, b);
+        iBufferPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  iBufferPtr = &iBuffer[number];
-  for(; number < num_points; number++){
-    *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
-    complexVectorPtr++;
-  }
+    number = eighthPoints * 8;
+    iBufferPtr = &iBuffer[number];
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
+        complexVectorPtr++;
+    }
  }
  
  
@@ -137,46 +138,48 @@ volk_32fc_s32f_deinterleave_real_16i_a_avx2(int16_t* iBuffer, const lv_32fc_t* c
  #include <xmmintrin.h>
  
  static inline void
-volk_32fc_s32f_deinterleave_real_16i_a_sse(int16_t* iBuffer, const lv_32fc_t* complexVector,
-                                           const float scalar, unsigned int num_points)
+volk_32fc_s32f_deinterleave_real_16i_a_sse(int16_t* iBuffer,
+                                           const lv_32fc_t* complexVector,
+                                           const float scalar,
+                                           unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  const float* complexVectorPtr = (float*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
+    const float* complexVectorPtr = (float*)complexVector;
+    int16_t* iBufferPtr = iBuffer;
  
-  __m128 vScalar = _mm_set_ps1(scalar);
+    __m128 vScalar = _mm_set_ps1(scalar);
  
-  __m128 cplxValue1, cplxValue2, iValue;
+    __m128 cplxValue1, cplxValue2, iValue;
  
-  __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+    __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
  
-  for(;number < quarterPoints; number++){
-    cplxValue1 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
+    for (; number < quarterPoints; number++) {
+        cplxValue1 = _mm_load_ps(complexVectorPtr);
+        complexVectorPtr += 4;
  
-    cplxValue2 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
+        cplxValue2 = _mm_load_ps(complexVectorPtr);
+        complexVectorPtr += 4;
  
-    // Arrange in i1i2i3i4 format
-    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+        // Arrange in i1i2i3i4 format
+        iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
  
-    iValue = _mm_mul_ps(iValue, vScalar);
+        iValue = _mm_mul_ps(iValue, vScalar);
  
-    _mm_store_ps(floatBuffer, iValue);
-    *iBufferPtr++ = (int16_t)(floatBuffer[0]);
-    *iBufferPtr++ = (int16_t)(floatBuffer[1]);
-    *iBufferPtr++ = (int16_t)(floatBuffer[2]);
-    *iBufferPtr++ = (int16_t)(floatBuffer[3]);
-  }
+        _mm_store_ps(floatBuffer, iValue);
+        *iBufferPtr++ = (int16_t)(floatBuffer[0]);
+        *iBufferPtr++ = (int16_t)(floatBuffer[1]);
+        *iBufferPtr++ = (int16_t)(floatBuffer[2]);
+        *iBufferPtr++ = (int16_t)(floatBuffer[3]);
+    }
  
-  number = quarterPoints * 4;
-  iBufferPtr = &iBuffer[number];
-  for(; number < num_points; number++){
-    *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
-    complexVectorPtr++;
-  }
+    number = quarterPoints * 4;
+    iBufferPtr = &iBuffer[number];
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
+        complexVectorPtr++;
+    }
  }
  
  #endif /* LV_HAVE_SSE */
@@ -185,16 +188,18 @@ volk_32fc_s32f_deinterleave_real_16i_a_sse(int16_t* iBuffer, const lv_32fc_t* co
  #ifdef LV_HAVE_GENERIC
  
  static inline void
-volk_32fc_s32f_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_32fc_t* complexVector,
-                                             const float scalar, unsigned int num_points)
+volk_32fc_s32f_deinterleave_real_16i_generic(int16_t* iBuffer,
+                                             const lv_32fc_t* complexVector,
+                                             const float scalar,
+                                             unsigned int num_points)
  {
-  const float* complexVectorPtr = (float*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  unsigned int number = 0;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
-    complexVectorPtr++;
-  }
+    const float* complexVectorPtr = (float*)complexVector;
+    int16_t* iBufferPtr = iBuffer;
+    unsigned int number = 0;
+    for (number = 0; number < num_points; number++) {
+        *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
+        complexVectorPtr++;
+    }
  }
  
  #endif /* LV_HAVE_GENERIC */
@@ -204,60 +209,61 @@ volk_32fc_s32f_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_32fc_t*
  #ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_u_H
  #define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_u_H
  
-#include <volk/volk_common.h>
  #include <inttypes.h>
  #include <stdio.h>
+#include <volk/volk_common.h>
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
  static inline void
-volk_32fc_s32f_deinterleave_real_16i_u_avx2(int16_t* iBuffer, const lv_32fc_t* complexVector,
-                                           const float scalar, unsigned int num_points)
+volk_32fc_s32f_deinterleave_real_16i_u_avx2(int16_t* iBuffer,
+                                            const lv_32fc_t* complexVector,
+                                            const float scalar,
+                                            unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-
-  const float* complexVectorPtr = (float*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  __m256 vScalar = _mm256_set1_ps(scalar);
+    const float* complexVectorPtr = (float*)complexVector;
+    int16_t* iBufferPtr = iBuffer;
  
-  __m256 cplxValue1, cplxValue2, iValue;
-  __m256i a;
-  __m128i b;
+    __m256 vScalar = _mm256_set1_ps(scalar);
  
-  __m256i idx = _mm256_set_epi32(3,3,3,3,5,1,4,0);
+    __m256 cplxValue1, cplxValue2, iValue;
+    __m256i a;
+    __m128i b;
  
-  for(;number < eighthPoints; number++){
-    cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
-    complexVectorPtr += 8;
+    __m256i idx = _mm256_set_epi32(3, 3, 3, 3, 5, 1, 4, 0);
  
-    cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
-    complexVectorPtr += 8;
+    for (; number < eighthPoints; number++) {
+        cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+        complexVectorPtr += 8;
  
-    // Arrange in i1i2i3i4 format
-    iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+        cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
+        complexVectorPtr += 8;
  
-    iValue = _mm256_mul_ps(iValue, vScalar);
+        // Arrange in i1i2i3i4 format
+        iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
  
-    iValue = _mm256_round_ps(iValue, _MM_FROUND_TO_ZERO);
-    a = _mm256_cvtps_epi32(iValue);
-    a = _mm256_packs_epi32(a,a);
-    a = _mm256_permutevar8x32_epi32(a,idx);
-    b = _mm256_extracti128_si256(a,0);
+        iValue = _mm256_mul_ps(iValue, vScalar);
  
-    _mm_storeu_si128((__m128i*)iBufferPtr,b); 
-    iBufferPtr += 8;
+        iValue = _mm256_round_ps(iValue, _MM_FROUND_TO_ZERO);
+        a = _mm256_cvtps_epi32(iValue);
+        a = _mm256_packs_epi32(a, a);
+        a = _mm256_permutevar8x32_epi32(a, idx);
+        b = _mm256_extracti128_si256(a, 0);
  
-  }
+        _mm_storeu_si128((__m128i*)iBufferPtr, b);
+        iBufferPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  iBufferPtr = &iBuffer[number];
-  for(; number < num_points; number++){
-    *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
-    complexVectorPtr++;
-  }
+    number = eighthPoints * 8;
+    iBufferPtr = &iBuffer[number];
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
+        complexVectorPtr++;
+    }
  }
  
  #endif /* LV_HAVE_AVX2 */
diff --git a/kernels/volk/volk_32fc_s32f_magnitude_16i.h b/kernels/volk/volk_32fc_s32f_magnitude_16i.h

index 6e7e7cbcc762cf5105fb0fb80cc5a4cf354a6348..91a5b8e4cdf844b0d0f31d742f89dc7c4033b5ea 100644 (file)
--- a/kernels/volk/volk_32fc_s32f_magnitude_16i.h
+++ b/kernels/volk/volk_32fc_s32f_magnitude_16i.h
@@ -31,8 +31,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32fc_s32f_magnitude_16i(int16_t* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points)
- * \endcode
+ * void volk_32fc_s32f_magnitude_16i(int16_t* magnitudeVector, const lv_32fc_t*
+ * complexVector, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li complexVector: The complex input vector.
@@ -73,123 +73,129 @@
  #ifdef LV_HAVE_GENERIC
  #include <volk/volk_common.h>
  
-static inline void
-volk_32fc_s32f_magnitude_16i_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector,
-                                     const float scalar, unsigned int num_points)
+static inline void volk_32fc_s32f_magnitude_16i_generic(int16_t* magnitudeVector,
+                                                        const lv_32fc_t* complexVector,
+                                                        const float scalar,
+                                                        unsigned int num_points)
  {
-  const float* complexVectorPtr = (float*)complexVector;
-  int16_t* magnitudeVectorPtr = magnitudeVector;
-  unsigned int number = 0;
-  for(number = 0; number < num_points; number++){
-    __VOLK_VOLATILE float real = *complexVectorPtr++;
-    __VOLK_VOLATILE float imag = *complexVectorPtr++;
-    real *= real;
-    imag *= imag;
-    *magnitudeVectorPtr++ = (int16_t)rintf(scalar*sqrtf(real + imag));
-  }
+    const float* complexVectorPtr = (float*)complexVector;
+    int16_t* magnitudeVectorPtr = magnitudeVector;
+    unsigned int number = 0;
+    for (number = 0; number < num_points; number++) {
+        __VOLK_VOLATILE float real = *complexVectorPtr++;
+        __VOLK_VOLATILE float imag = *complexVectorPtr++;
+        real *= real;
+        imag *= imag;
+        *magnitudeVectorPtr++ = (int16_t)rintf(scalar * sqrtf(real + imag));
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  #ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a_H
  #define INCLUDED_volk_32fc_s32f_magnitude_16i_a_H
  
-#include <volk/volk_common.h>
  #include <inttypes.h>
-#include <stdio.h>
  #include <math.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_32fc_s32f_magnitude_16i_a_avx2(int16_t* magnitudeVector, const lv_32fc_t* complexVector,
-                                    const float scalar, unsigned int num_points)
+static inline void volk_32fc_s32f_magnitude_16i_a_avx2(int16_t* magnitudeVector,
+                                                       const lv_32fc_t* complexVector,
+                                                       const float scalar,
+                                                       unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  const float* complexVectorPtr = (const float*)complexVector;
-  int16_t* magnitudeVectorPtr = magnitudeVector;
+    const float* complexVectorPtr = (const float*)complexVector;
+    int16_t* magnitudeVectorPtr = magnitudeVector;
  
-  __m256 vScalar = _mm256_set1_ps(scalar);
-  __m256i idx = _mm256_set_epi32(0,0,0,0,5,1,4,0);
-  __m256 cplxValue1, cplxValue2, result;
-  __m256i resultInt;
-  __m128i resultShort;
+    __m256 vScalar = _mm256_set1_ps(scalar);
+    __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
+    __m256 cplxValue1, cplxValue2, result;
+    __m256i resultInt;
+    __m128i resultShort;
  
-  for(;number < eighthPoints; number++){
-    cplxValue1 = _mm256_load_ps(complexVectorPtr);
-    complexVectorPtr += 8;
+    for (; number < eighthPoints; number++) {
+        cplxValue1 = _mm256_load_ps(complexVectorPtr);
+        complexVectorPtr += 8;
  
-    cplxValue2 = _mm256_load_ps(complexVectorPtr);
-    complexVectorPtr += 8;
+        cplxValue2 = _mm256_load_ps(complexVectorPtr);
+        complexVectorPtr += 8;
  
-    cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
-    cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+        cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+        cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
  
-    result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+        result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
  
-    result = _mm256_sqrt_ps(result);
+        result = _mm256_sqrt_ps(result);
  
-    result = _mm256_mul_ps(result, vScalar);
+        result = _mm256_mul_ps(result, vScalar);
  
-    resultInt = _mm256_cvtps_epi32(result);
-    resultInt = _mm256_packs_epi32(resultInt, resultInt);
-    resultInt = _mm256_permutevar8x32_epi32(resultInt, idx); //permute to compensate for shuffling in hadd and packs
-    resultShort = _mm256_extracti128_si256(resultInt,0);
-    _mm_store_si128((__m128i*)magnitudeVectorPtr,resultShort);
-    magnitudeVectorPtr += 8;
-  }
+        resultInt = _mm256_cvtps_epi32(result);
+        resultInt = _mm256_packs_epi32(resultInt, resultInt);
+        resultInt = _mm256_permutevar8x32_epi32(
+            resultInt, idx); // permute to compensate for shuffling in hadd and packs
+        resultShort = _mm256_extracti128_si256(resultInt, 0);
+        _mm_store_si128((__m128i*)magnitudeVectorPtr, resultShort);
+        magnitudeVectorPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  volk_32fc_s32f_magnitude_16i_generic(magnitudeVector+number, complexVector+number, scalar, num_points-number);
+    number = eighthPoints * 8;
+    volk_32fc_s32f_magnitude_16i_generic(
+        magnitudeVector + number, complexVector + number, scalar, num_points - number);
  }
  #endif /* LV_HAVE_AVX2 */
  
  #ifdef LV_HAVE_SSE3
  #include <pmmintrin.h>
  
-static inline void
-volk_32fc_s32f_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector,
-                                    const float scalar, unsigned int num_points)
+static inline void volk_32fc_s32f_magnitude_16i_a_sse3(int16_t* magnitudeVector,
+                                                       const lv_32fc_t* complexVector,
+                                                       const float scalar,
+                                                       unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  const float* complexVectorPtr = (const float*)complexVector;
-  int16_t* magnitudeVectorPtr = magnitudeVector;
+    const float* complexVectorPtr = (const float*)complexVector;
+    int16_t* magnitudeVectorPtr = magnitudeVector;
  
-  __m128 vScalar = _mm_set_ps1(scalar);
+    __m128 vScalar = _mm_set_ps1(scalar);
  
-  __m128 cplxValue1, cplxValue2, result;
+    __m128 cplxValue1, cplxValue2, result;
  
-  __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+    __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
  
-  for(;number < quarterPoints; number++){
-    cplxValue1 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
+    for (; number < quarterPoints; number++) {
+        cplxValue1 = _mm_load_ps(complexVectorPtr);
+        complexVectorPtr += 4;
  
-    cplxValue2 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
+        cplxValue2 = _mm_load_ps(complexVectorPtr);
+        complexVectorPtr += 4;
  
-    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
-    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+        cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+        cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
  
-    result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+        result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
  
-    result = _mm_sqrt_ps(result);
+        result = _mm_sqrt_ps(result);
  
-    result = _mm_mul_ps(result, vScalar);
+        result = _mm_mul_ps(result, vScalar);
  
-    _mm_store_ps(floatBuffer, result);
-    *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
-    *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
-    *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
-    *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
-  }
+        _mm_store_ps(floatBuffer, result);
+        *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
+        *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
+        *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
+        *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
+    }
  
-  number = quarterPoints * 4;
-  volk_32fc_s32f_magnitude_16i_generic(magnitudeVector+number, complexVector+number, scalar, num_points-number);
+    number = quarterPoints * 4;
+    volk_32fc_s32f_magnitude_16i_generic(
+        magnitudeVector + number, complexVector + number, scalar, num_points - number);
  }
  #endif /* LV_HAVE_SSE3 */
  
@@ -197,53 +203,57 @@ volk_32fc_s32f_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_32fc_t* c
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector,
-                                   const float scalar, unsigned int num_points)
+static inline void volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector,
+                                                      const lv_32fc_t* complexVector,
+                                                      const float scalar,
+                                                      unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  const float* complexVectorPtr = (const float*)complexVector;
-  int16_t* magnitudeVectorPtr = magnitudeVector;
+    const float* complexVectorPtr = (const float*)complexVector;
+    int16_t* magnitudeVectorPtr = magnitudeVector;
  
-  __m128 vScalar = _mm_set_ps1(scalar);
+    __m128 vScalar = _mm_set_ps1(scalar);
  
-  __m128 cplxValue1, cplxValue2, result;
-  __m128 iValue, qValue;
+    __m128 cplxValue1, cplxValue2, result;
+    __m128 iValue, qValue;
  
-  __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+    __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
  
-  for(;number < quarterPoints; number++){
-    cplxValue1 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
+    for (; number < quarterPoints; number++) {
+        cplxValue1 = _mm_load_ps(complexVectorPtr);
+        complexVectorPtr += 4;
  
-    cplxValue2 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
+        cplxValue2 = _mm_load_ps(complexVectorPtr);
+        complexVectorPtr += 4;
  
-    // Arrange in i1i2i3i4 format
-    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-    // Arrange in q1q2q3q4 format
-    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+        // Arrange in i1i2i3i4 format
+        iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+        // Arrange in q1q2q3q4 format
+        qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
  
-    __VOLK_VOLATILE __m128 iValue2 = _mm_mul_ps(iValue, iValue); // Square the I values
-    __VOLK_VOLATILE __m128 qValue2 = _mm_mul_ps(qValue, qValue); // Square the Q Values
+        __VOLK_VOLATILE __m128 iValue2 =
+            _mm_mul_ps(iValue, iValue); // Square the I values
+        __VOLK_VOLATILE __m128 qValue2 =
+            _mm_mul_ps(qValue, qValue); // Square the Q Values
  
-    result = _mm_add_ps(iValue2, qValue2); // Add the I2 and Q2 values
+        result = _mm_add_ps(iValue2, qValue2); // Add the I2 and Q2 values
  
-    result = _mm_sqrt_ps(result);
+        result = _mm_sqrt_ps(result);
  
-    result = _mm_mul_ps(result, vScalar);
+        result = _mm_mul_ps(result, vScalar);
  
-    _mm_store_ps(floatBuffer, result);
-    *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
-    *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
-    *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
-    *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
-  }
+        _mm_store_ps(floatBuffer, result);
+        *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
+        *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
+        *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
+        *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
+    }
  
-  number = quarterPoints * 4;
-  volk_32fc_s32f_magnitude_16i_generic(magnitudeVector+number, complexVector+number, scalar, num_points-number);
+    number = quarterPoints * 4;
+    volk_32fc_s32f_magnitude_16i_generic(
+        magnitudeVector + number, complexVector + number, scalar, num_points - number);
  }
  #endif /* LV_HAVE_SSE */
  
@@ -253,56 +263,59 @@ volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector, const lv_32fc_t* co
  #ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_u_H
  #define INCLUDED_volk_32fc_s32f_magnitude_16i_u_H
  
-#include <volk/volk_common.h>
  #include <inttypes.h>
-#include <stdio.h>
  #include <math.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_32fc_s32f_magnitude_16i_u_avx2(int16_t* magnitudeVector, const lv_32fc_t* complexVector,
-                                    const float scalar, unsigned int num_points)
+static inline void volk_32fc_s32f_magnitude_16i_u_avx2(int16_t* magnitudeVector,
+                                                       const lv_32fc_t* complexVector,
+                                                       const float scalar,
+                                                       unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
  
-  const float* complexVectorPtr = (const float*)complexVector;
-  int16_t* magnitudeVectorPtr = magnitudeVector;
+    const float* complexVectorPtr = (const float*)complexVector;
+    int16_t* magnitudeVectorPtr = magnitudeVector;
  
-  __m256 vScalar = _mm256_set1_ps(scalar);
-  __m256i idx = _mm256_set_epi32(0,0,0,0,5,1,4,0);
-  __m256 cplxValue1, cplxValue2, result;
-  __m256i resultInt;
-  __m128i resultShort;
+    __m256 vScalar = _mm256_set1_ps(scalar);
+    __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
+    __m256 cplxValue1, cplxValue2, result;
+    __m256i resultInt;
+    __m128i resultShort;
  
-  for(;number < eighthPoints; number++){
-    cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
-    complexVectorPtr += 8;
+    for (; number < eighthPoints; number++) {
+        cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+        complexVectorPtr += 8;
  
-    cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
-    complexVectorPtr += 8;
+        cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
+        complexVectorPtr += 8;
  
-    cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
-    cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+        cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+        cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
  
-    result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+        result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
  
-    result = _mm256_sqrt_ps(result);
+        result = _mm256_sqrt_ps(result);
  
-    result = _mm256_mul_ps(result, vScalar);
+        result = _mm256_mul_ps(result, vScalar);
  
-    resultInt = _mm256_cvtps_epi32(result);
-    resultInt = _mm256_packs_epi32(resultInt, resultInt);
-    resultInt = _mm256_permutevar8x32_epi32(resultInt, idx); //permute to compensate for shuffling in hadd and packs
-    resultShort = _mm256_extracti128_si256(resultInt,0);
-    _mm_storeu_si128((__m128i*)magnitudeVectorPtr,resultShort);
-    magnitudeVectorPtr += 8;
-  }
+        resultInt = _mm256_cvtps_epi32(result);
+        resultInt = _mm256_packs_epi32(resultInt, resultInt);
+        resultInt = _mm256_permutevar8x32_epi32(
+            resultInt, idx); // permute to compensate for shuffling in hadd and packs
+        resultShort = _mm256_extracti128_si256(resultInt, 0);
+        _mm_storeu_si128((__m128i*)magnitudeVectorPtr, resultShort);
+        magnitudeVectorPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  volk_32fc_s32f_magnitude_16i_generic(magnitudeVector+number, complexVector+number, scalar, num_points-number);
+    number = eighthPoints * 8;
+    volk_32fc_s32f_magnitude_16i_generic(
+        magnitudeVector + number, complexVector + number, scalar, num_points - number);
  }
  #endif /* LV_HAVE_AVX2 */
  
diff --git a/kernels/volk/volk_32fc_s32f_power_32fc.h b/kernels/volk/volk_32fc_s32f_power_32fc.h

index d2803f2cca03ebab92c68c496621328b527d41d6..b31179c6213377564f785d82670215f312301f83 100644 (file)
--- a/kernels/volk/volk_32fc_s32f_power_32fc.h
+++ b/kernels/volk/volk_32fc_s32f_power_32fc.h
@@ -31,8 +31,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32fc_s32f_power_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points)
- * \endcode
+ * void volk_32fc_s32f_power_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
+ * float power, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li aVector: The complex input vector.
@@ -56,15 +56,17 @@
  #define INCLUDED_volk_32fc_s32f_power_32fc_a_H
  
  #include <inttypes.h>
-#include <stdio.h>
  #include <math.h>
+#include <stdio.h>
  
  //! raise a complex float to a real float power
-static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a(const lv_32fc_t exp, const float power)
+static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a(const lv_32fc_t exp,
+                                                        const float power)
  {
-  const float arg = power*atan2f(lv_creal(exp), lv_cimag(exp));
-  const float mag = powf(lv_creal(exp)*lv_creal(exp) + lv_cimag(exp)*lv_cimag(exp), power/2);
-  return mag*lv_cmake(-cosf(arg), sinf(arg));
+    const float arg = power * atan2f(lv_creal(exp), lv_cimag(exp));
+    const float mag =
+        powf(lv_creal(exp) * lv_creal(exp) + lv_cimag(exp) * lv_cimag(exp), power / 2);
+    return mag * lv_cmake(-cosf(arg), sinf(arg));
  }
  
  #ifdef LV_HAVE_SSE
@@ -74,83 +76,94 @@ static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a(const lv_32fc_t exp, con
  #include <simdmath.h>
  #endif /* LV_HAVE_LIB_SIMDMATH */
  
-static inline void
-volk_32fc_s32f_power_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                                const float power, unsigned int num_points)
+static inline void volk_32fc_s32f_power_32fc_a_sse(lv_32fc_t* cVector,
+                                                   const lv_32fc_t* aVector,
+                                                   const float power,
+                                                   unsigned int num_points)
  {
-  unsigned int number = 0;
+    unsigned int number = 0;
  
-  lv_32fc_t* cPtr = cVector;
-  const lv_32fc_t* aPtr = aVector;
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
  
  #ifdef LV_HAVE_LIB_SIMDMATH
-  const unsigned int quarterPoints = num_points / 4;
-  __m128 vPower = _mm_set_ps1(power);
+    const unsigned int quarterPoints = num_points / 4;
+    __m128 vPower = _mm_set_ps1(power);
  
-  __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue;
-  for(;number < quarterPoints; number++){
+    __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue;
+    for (; number < quarterPoints; number++) {
  
-    cplxValue1 = _mm_load_ps((float*)aPtr);
-    aPtr += 2;
+        cplxValue1 = _mm_load_ps((float*)aPtr);
+        aPtr += 2;
  
-    cplxValue2 = _mm_load_ps((float*)aPtr);
-    aPtr += 2;
+        cplxValue2 = _mm_load_ps((float*)aPtr);
+        aPtr += 2;
  
-    // Convert to polar coordinates
+        // Convert to polar coordinates
  
-    // Arrange in i1i2i3i4 format
-    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-    // Arrange in q1q2q3q4 format
-    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+        // Arrange in i1i2i3i4 format
+        iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+        // Arrange in q1q2q3q4 format
+        qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
  
-    phase = atan2f4(qValue, iValue); // Calculate the Phase
+        phase = atan2f4(qValue, iValue); // Calculate the Phase
  
-    magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iValue, iValue), _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square rooting the added I2 and Q2 values
+        magnitude = _mm_sqrt_ps(
+            _mm_add_ps(_mm_mul_ps(iValue, iValue),
+                       _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square
+                                                     // rooting the added I2 and Q2 values
  
-    // Now calculate the power of the polar coordinate data
-    magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power
+        // Now calculate the power of the polar coordinate data
+        magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power
  
-    phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power
+        phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power
  
-    // Convert back to cartesian coordinates
-    iValue = _mm_mul_ps( cosf4(phase), magnitude); // Multiply the cos of the phase by the magnitude
-    qValue = _mm_mul_ps( sinf4(phase), magnitude); // Multiply the sin of the phase by the magnitude
+        // Convert back to cartesian coordinates
+        iValue = _mm_mul_ps(cosf4(phase),
+                            magnitude); // Multiply the cos of the phase by the magnitude
+        qValue = _mm_mul_ps(sinf4(phase),
+                            magnitude); // Multiply the sin of the phase by the magnitude
  
-    cplxValue1 = _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values
-    cplxValue2 = _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values
+        cplxValue1 =
+            _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values
+        cplxValue2 =
+            _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values
  
-    _mm_store_ps((float*)cPtr,cplxValue1); // Store the results back into the C container
+        _mm_store_ps((float*)cPtr,
+                     cplxValue1); // Store the results back into the C container
  
-    cPtr += 2;
+        cPtr += 2;
  
-    _mm_store_ps((float*)cPtr,cplxValue2); // Store the results back into the C container
+        _mm_store_ps((float*)cPtr,
+                     cplxValue2); // Store the results back into the C container
  
-    cPtr += 2;
-  }
+        cPtr += 2;
+    }
  
-  number = quarterPoints * 4;
+    number = quarterPoints * 4;
  #endif /* LV_HAVE_LIB_SIMDMATH */
  
-  for(;number < num_points; number++){
-    *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power);
-  }
+    for (; number < num_points; number++) {
+        *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power);
+    }
  }
  #endif /* LV_HAVE_SSE */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32fc_s32f_power_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                                  const float power, unsigned int num_points)
+static inline void volk_32fc_s32f_power_32fc_generic(lv_32fc_t* cVector,
+                                                     const lv_32fc_t* aVector,
+                                                     const float power,
+                                                     unsigned int num_points)
  {
-  lv_32fc_t* cPtr = cVector;
-  const lv_32fc_t* aPtr = aVector;
-  unsigned int number = 0;
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    unsigned int number = 0;
  
-  for(number = 0; number < num_points; number++){
-    *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power);
-  }
+    for (number = 0; number < num_points; number++) {
+        *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power);
+    }
  }
  
  #endif /* LV_HAVE_GENERIC */
diff --git a/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h b/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h

index abe4662bf226ec0bdaf0bb9ec205f5f38cad8d84..a1a036d59bd7a99890cb3e337d4563acf88e02fc 100644 (file)
--- a/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h
+++ b/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h
@@ -29,13 +29,13 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32fc_s32f_power_spectrum_32f(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points)
- * \endcode
+ * void volk_32fc_s32f_power_spectrum_32f(float* logPowerOutput, const lv_32fc_t*
+ * complexFFTInput, const float normalizationFactor, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li complexFFTInput The complex data output from the FFT point.
- * \li normalizationFactor: This value is divided against all the input values before the power is calculated.
- * \li num_points: The number of fft data points.
+ * \li normalizationFactor: This value is divided against all the input values before the
+ * power is calculated. \li num_points: The number of fft data points.
   *
   * \b Outputs
   * \li logPowerOutput: The 10.0 * log10(r*r + i*i) for each data point.
@@ -54,8 +54,8 @@
  #define INCLUDED_volk_32fc_s32f_power_spectrum_32f_a_H
  
  #include <inttypes.h>
-#include <stdio.h>
  #include <math.h>
+#include <stdio.h>
  
  #ifdef LV_HAVE_SSE3
  #include <pmmintrin.h>
@@ -65,74 +65,75 @@
  #endif /* LV_HAVE_LIB_SIMDMATH */
  
  static inline void
-volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput,
-                                         const float normalizationFactor, unsigned int num_points)
+volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutput,
+                                         const lv_32fc_t* complexFFTInput,
+                                         const float normalizationFactor,
+                                         unsigned int num_points)
  {
-  const float* inputPtr = (const float*)complexFFTInput;
-  float* destPtr = logPowerOutput;
-  uint64_t number = 0;
-  const float iNormalizationFactor = 1.0 / normalizationFactor;
+    const float* inputPtr = (const float*)complexFFTInput;
+    float* destPtr = logPowerOutput;
+    uint64_t number = 0;
+    const float iNormalizationFactor = 1.0 / normalizationFactor;
  #ifdef LV_HAVE_LIB_SIMDMATH
-  __m128 magScalar = _mm_set_ps1(10.0);
-  magScalar = _mm_div_ps(magScalar, logf4(magScalar));
+    __m128 magScalar = _mm_set_ps1(10.0);
+    magScalar = _mm_div_ps(magScalar, logf4(magScalar));
  
-  __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor);
+    __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor);
  
-  __m128 power;
-  __m128 input1, input2;
-  const uint64_t quarterPoints = num_points / 4;
-  for(;number < quarterPoints; number++){
-    // Load the complex values
-    input1 =_mm_load_ps(inputPtr);
-    inputPtr += 4;
-    input2 =_mm_load_ps(inputPtr);
-    inputPtr += 4;
+    __m128 power;
+    __m128 input1, input2;
+    const uint64_t quarterPoints = num_points / 4;
+    for (; number < quarterPoints; number++) {
+        // Load the complex values
+        input1 = _mm_load_ps(inputPtr);
+        inputPtr += 4;
+        input2 = _mm_load_ps(inputPtr);
+        inputPtr += 4;
  
-    // Apply the normalization factor
-    input1 = _mm_mul_ps(input1, invNormalizationFactor);
-    input2 = _mm_mul_ps(input2, invNormalizationFactor);
+        // Apply the normalization factor
+        input1 = _mm_mul_ps(input1, invNormalizationFactor);
+        input2 = _mm_mul_ps(input2, invNormalizationFactor);
  
-    // Multiply each value by itself
-    // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
-    input1 = _mm_mul_ps(input1, input1);
-    // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
-    input2 = _mm_mul_ps(input2, input2);
+        // Multiply each value by itself
+        // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
+        input1 = _mm_mul_ps(input1, input1);
+        // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
+        input2 = _mm_mul_ps(input2, input2);
  
-    // Horizontal add, to add (r*r) + (i*i) for each complex value
-    // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
-    power = _mm_hadd_ps(input1, input2);
+        // Horizontal add, to add (r*r) + (i*i) for each complex value
+        // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
+        power = _mm_hadd_ps(input1, input2);
  
-    // Calculate the natural log power
-    power = logf4(power);
+        // Calculate the natural log power
+        power = logf4(power);
  
-    // Convert to log10 and multiply by 10.0
-    power = _mm_mul_ps(power, magScalar);
+        // Convert to log10 and multiply by 10.0
+        power = _mm_mul_ps(power, magScalar);
  
-    // Store the floating point results
-    _mm_store_ps(destPtr, power);
+        // Store the floating point results
+        _mm_store_ps(destPtr, power);
  
-    destPtr += 4;
-  }
+        destPtr += 4;
+    }
  
-  number = quarterPoints*4;
+    number = quarterPoints * 4;
  #endif /* LV_HAVE_LIB_SIMDMATH */
-  // Calculate the FFT for any remaining points
-
-  for(; number < num_points; number++){
-    // Calculate dBm
-    // 50 ohm load assumption
-    // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
-    // 75 ohm load assumption
-    // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+    // Calculate the FFT for any remaining points
  
-    const float real = *inputPtr++ * iNormalizationFactor;
-    const float imag = *inputPtr++ * iNormalizationFactor;
+    for (; number < num_points; number++) {
+        // Calculate dBm
+        // 50 ohm load assumption
+        // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+        // 75 ohm load assumption
+        // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
  
-    *destPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20);
+        const float real = *inputPtr++ * iNormalizationFactor;
+        const float imag = *inputPtr++ * iNormalizationFactor;
  
-    destPtr++;
-  }
+        *destPtr = 10.0 * log10f(((real * real) + (imag * imag)) + 1e-20);
  
+        destPtr++;
+    }
  }
  #endif /* LV_HAVE_SSE3 */
  
@@ -141,7 +142,10 @@ volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutput, const lv_32fc_t*
  #include <volk/volk_neon_intrinsics.h>
  
  static inline void
-volk_32fc_s32f_power_spectrum_32f_neon(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points)
+volk_32fc_s32f_power_spectrum_32f_neon(float* logPowerOutput,
+                                       const lv_32fc_t* complexFFTInput,
+                                       const float normalizationFactor,
+                                       unsigned int num_points)
  {
      float* logPowerOutputPtr = logPowerOutput;
      const lv_32fc_t* complexFFTInputPtr = complexFFTInput;
@@ -151,14 +155,14 @@ volk_32fc_s32f_power_spectrum_32f_neon(float* logPowerOutput, const lv_32fc_t* c
      float32x4x2_t fft_vec;
      float32x4_t log_pwr_vec;
      float32x4_t mag_squared_vec;
-    
+
      const float inv_ln10_10 = 4.34294481903f; // 10.0/ln(10.)
-    
-    for(number = 0; number < quarter_points; number++) {
+
+    for (number = 0; number < quarter_points; number++) {
          // Load
          fft_vec = vld2q_f32((float*)complexFFTInputPtr);
          // Prefetch next 4
-        __VOLK_PREFETCH(complexFFTInputPtr+4);
+        __VOLK_PREFETCH(complexFFTInputPtr + 4);
          // Normalize
          fft_vec.val[0] = vmulq_n_f32(fft_vec.val[0], iNormalizationFactor);
          fft_vec.val[1] = vmulq_n_f32(fft_vec.val[1], iNormalizationFactor);
@@ -167,12 +171,12 @@ volk_32fc_s32f_power_spectrum_32f_neon(float* logPowerOutput, const lv_32fc_t* c
          // Store
          vst1q_f32(logPowerOutputPtr, log_pwr_vec);
          // Move pointers ahead
-        complexFFTInputPtr+=4;
-        logPowerOutputPtr+=4;
+        complexFFTInputPtr += 4;
+        logPowerOutputPtr += 4;
      }
-    
+
      // deal with the rest
-    for(number = quarter_points * 4; number < num_points; number++) {
+    for (number = quarter_points * 4; number < num_points; number++) {
          const float real = lv_creal(*complexFFTInputPtr) * iNormalizationFactor;
          const float imag = lv_cimag(*complexFFTInputPtr) * iNormalizationFactor;
          *logPowerOutputPtr = 10.0 * log10f(((real * real) + (imag * imag)) + 1e-20);
@@ -186,27 +190,29 @@ volk_32fc_s32f_power_spectrum_32f_neon(float* logPowerOutput, const lv_32fc_t* c
  #ifdef LV_HAVE_GENERIC
  
  static inline void
-volk_32fc_s32f_power_spectrum_32f_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput,
-                                          const float normalizationFactor, unsigned int num_points)
+volk_32fc_s32f_power_spectrum_32f_generic(float* logPowerOutput,
+                                          const lv_32fc_t* complexFFTInput,
+                                          const float normalizationFactor,
+                                          unsigned int num_points)
  {
-  // Calculate the Power of the complex point
-  const float* inputPtr = (float*)complexFFTInput;
-  float* realFFTDataPointsPtr = logPowerOutput;
-  const float iNormalizationFactor = 1.0 / normalizationFactor;
-  unsigned int point;
-  for(point = 0; point < num_points; point++){
-    // Calculate dBm
-    // 50 ohm load assumption
-    // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
-    // 75 ohm load assumption
-    // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
-
-    const float real = *inputPtr++ * iNormalizationFactor;
-    const float imag = *inputPtr++ * iNormalizationFactor;
-
-    *realFFTDataPointsPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20);
-    realFFTDataPointsPtr++;
-  }
+    // Calculate the Power of the complex point
+    const float* inputPtr = (float*)complexFFTInput;
+    float* realFFTDataPointsPtr = logPowerOutput;
+    const float iNormalizationFactor = 1.0 / normalizationFactor;
+    unsigned int point;
+    for (point = 0; point < num_points; point++) {
+        // Calculate dBm
+        // 50 ohm load assumption
+        // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+        // 75 ohm load assumption
+        // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+
+        const float real = *inputPtr++ * iNormalizationFactor;
+        const float imag = *inputPtr++ * iNormalizationFactor;
+
+        *realFFTDataPointsPtr = 10.0 * log10f(((real * real) + (imag * imag)) + 1e-20);
+        realFFTDataPointsPtr++;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
diff --git a/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h b/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h

index 3260b08072c488ef8ac98900d7a6f9a195347786..37ca43cde8b7c672c0165d419b165dba97ddf809 100644 (file)
--- a/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h
+++ b/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h
@@ -29,14 +29,15 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32fc_s32f_x2_power_spectral_density_32f(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points)
- * \endcode
+ * void volk_32fc_s32f_x2_power_spectral_density_32f(float* logPowerOutput, const
+ * lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned
+ * int num_points) \endcode
   *
   * \b Inputs
   * \li complexFFTInput The complex data output from the FFT point.
- * \li normalizationFactor: This value is divided against all the input values before the power is calculated.
- * \li rbw: The resolution bandwidth of the fft spectrum
- * \li num_points: The number of fft data points.
+ * \li normalizationFactor: This value is divided against all the input values before the
+ * power is calculated. \li rbw: The resolution bandwidth of the fft spectrum \li
+ * num_points: The number of fft data points.
   *
   * \b Outputs
   * \li logPowerOutput: The 10.0 * log10((r*r + i*i)/RBW) for each data point.
@@ -55,8 +56,8 @@
  #define INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a_H
  
  #include <inttypes.h>
-#include <stdio.h>
  #include <math.h>
+#include <stdio.h>
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
@@ -66,83 +67,84 @@
  #endif /* LV_HAVE_LIB_SIMDMATH */
  
  static inline void
-volk_32fc_s32f_x2_power_spectral_density_32f_a_avx(float* logPowerOutput, const lv_32fc_t* complexFFTInput,
-                                                   const float normalizationFactor, const float rbw,
+volk_32fc_s32f_x2_power_spectral_density_32f_a_avx(float* logPowerOutput,
+                                                   const lv_32fc_t* complexFFTInput,
+                                                   const float normalizationFactor,
+                                                   const float rbw,
                                                     unsigned int num_points)
  {
-  const float* inputPtr = (const float*)complexFFTInput;
-  float* destPtr = logPowerOutput;
-  uint64_t number = 0;
-  const float iRBW = 1.0 / rbw;
-  const float iNormalizationFactor = 1.0 / normalizationFactor;
+    const float* inputPtr = (const float*)complexFFTInput;
+    float* destPtr = logPowerOutput;
+    uint64_t number = 0;
+    const float iRBW = 1.0 / rbw;
+    const float iNormalizationFactor = 1.0 / normalizationFactor;
  
  #ifdef LV_HAVE_LIB_SIMDMATH
-  __m256 magScalar = _mm256_set1_ps(10.0);
-  magScalar = _mm256_div_ps(magScalar, logf4(magScalar));
+    __m256 magScalar = _mm256_set1_ps(10.0);
+    magScalar = _mm256_div_ps(magScalar, logf4(magScalar));
  
-  __m256 invRBW = _mm256_set1_ps(iRBW);
+    __m256 invRBW = _mm256_set1_ps(iRBW);
  
-  __m256 invNormalizationFactor = _mm256_set1_ps(iNormalizationFactor);
+    __m256 invNormalizationFactor = _mm256_set1_ps(iNormalizationFactor);
  
-  __m256 power;
-  __m256 input1, input2;
-  const uint64_t eighthPoints = num_points / 8;
-  for(;number < eighthPoints; number++){
-    // Load the complex values
-    input1 =_mm256_load_ps(inputPtr);
-    inputPtr += 8;
-    input2 =_mm256_load_ps(inputPtr);
-    inputPtr += 8;
+    __m256 power;
+    __m256 input1, input2;
+    const uint64_t eighthPoints = num_points / 8;
+    for (; number < eighthPoints; number++) {
+        // Load the complex values
+        input1 = _mm256_load_ps(inputPtr);
+        inputPtr += 8;
+        input2 = _mm256_load_ps(inputPtr);
+        inputPtr += 8;
  
-    // Apply the normalization factor
-    input1 = _mm256_mul_ps(input1, invNormalizationFactor);
-    input2 = _mm256_mul_ps(input2, invNormalizationFactor);
+        // Apply the normalization factor
+        input1 = _mm256_mul_ps(input1, invNormalizationFactor);
+        input2 = _mm256_mul_ps(input2, invNormalizationFactor);
  
-    // Multiply each value by itself
-    // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
-    input1 = _mm256_mul_ps(input1, input1);
-    // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
-    input2 = _mm256_mul_ps(input2, input2);
+        // Multiply each value by itself
+        // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
+        input1 = _mm256_mul_ps(input1, input1);
+        // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
+        input2 = _mm256_mul_ps(input2, input2);
  
-    // Horizontal add, to add (r*r) + (i*i) for each complex value
-    // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
-    inputVal1 = _mm256_permute2f128_ps(input1, input2, 0x20);
-    inputVal2 = _mm256_permute2f128_ps(input1, input2, 0x31);
+        // Horizontal add, to add (r*r) + (i*i) for each complex value
+        // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
+        inputVal1 = _mm256_permute2f128_ps(input1, input2, 0x20);
+        inputVal2 = _mm256_permute2f128_ps(input1, input2, 0x31);
  
-    power = _mm256_hadd_ps(inputVal1, inputVal2);
+        power = _mm256_hadd_ps(inputVal1, inputVal2);
  
-    // Divide by the rbw
-    power = _mm256_mul_ps(power, invRBW);
+        // Divide by the rbw
+        power = _mm256_mul_ps(power, invRBW);
  
-    // Calculate the natural log power
-    power = logf4(power);
+        // Calculate the natural log power
+        power = logf4(power);
  
-    // Convert to log10 and multiply by 10.0
-    power = _mm256_mul_ps(power, magScalar);
+        // Convert to log10 and multiply by 10.0
+        power = _mm256_mul_ps(power, magScalar);
  
-    // Store the floating point results
-    _mm256_store_ps(destPtr, power);
+        // Store the floating point results
+        _mm256_store_ps(destPtr, power);
  
-    destPtr += 8;
-  }
+        destPtr += 8;
+    }
  
-  number = eighthPoints*8;
+    number = eighthPoints * 8;
  #endif /* LV_HAVE_LIB_SIMDMATH */
-  // Calculate the FFT for any remaining points
-  for(; number < num_points; number++){
-    // Calculate dBm
-    // 50 ohm load assumption
-    // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
-    // 75 ohm load assumption
-    // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
-
-    const float real = *inputPtr++ * iNormalizationFactor;
-    const float imag = *inputPtr++ * iNormalizationFactor;
-
-    *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW);
-    destPtr++;
-  }
-
+    // Calculate the FFT for any remaining points
+    for (; number < num_points; number++) {
+        // Calculate dBm
+        // 50 ohm load assumption
+        // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+        // 75 ohm load assumption
+        // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+
+        const float real = *inputPtr++ * iNormalizationFactor;
+        const float imag = *inputPtr++ * iNormalizationFactor;
+
+        *destPtr = 10.0 * log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW);
+        destPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX */
  
@@ -150,86 +152,86 @@ volk_32fc_s32f_x2_power_spectral_density_32f_a_avx(float* logPowerOutput, const
  #include <pmmintrin.h>
  
  
-
  #ifdef LV_HAVE_LIB_SIMDMATH
  #include <simdmath.h>
  #endif /* LV_HAVE_LIB_SIMDMATH */
  
  static inline void
-volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput,
-                                                    const float normalizationFactor, const float rbw,
+volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* logPowerOutput,
+                                                    const lv_32fc_t* complexFFTInput,
+                                                    const float normalizationFactor,
+                                                    const float rbw,
                                                      unsigned int num_points)
  {
-  const float* inputPtr = (const float*)complexFFTInput;
-  float* destPtr = logPowerOutput;
-  uint64_t number = 0;
-  const float iRBW = 1.0 / rbw;
-  const float iNormalizationFactor = 1.0 / normalizationFactor;
+    const float* inputPtr = (const float*)complexFFTInput;
+    float* destPtr = logPowerOutput;
+    uint64_t number = 0;
+    const float iRBW = 1.0 / rbw;
+    const float iNormalizationFactor = 1.0 / normalizationFactor;
  
  #ifdef LV_HAVE_LIB_SIMDMATH
-  __m128 magScalar = _mm_set_ps1(10.0);
-  magScalar = _mm_div_ps(magScalar, logf4(magScalar));
+    __m128 magScalar = _mm_set_ps1(10.0);
+    magScalar = _mm_div_ps(magScalar, logf4(magScalar));
  
-  __m128 invRBW = _mm_set_ps1(iRBW);
+    __m128 invRBW = _mm_set_ps1(iRBW);
  
-  __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor);
+    __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor);
  
-  __m128 power;
-  __m128 input1, input2;
-  const uint64_t quarterPoints = num_points / 4;
-  for(;number < quarterPoints; number++){
-    // Load the complex values
-    input1 =_mm_load_ps(inputPtr);
-    inputPtr += 4;
-    input2 =_mm_load_ps(inputPtr);
-    inputPtr += 4;
+    __m128 power;
+    __m128 input1, input2;
+    const uint64_t quarterPoints = num_points / 4;
+    for (; number < quarterPoints; number++) {
+        // Load the complex values
+        input1 = _mm_load_ps(inputPtr);
+        inputPtr += 4;
+        input2 = _mm_load_ps(inputPtr);
+        inputPtr += 4;
  
-    // Apply the normalization factor
-    input1 = _mm_mul_ps(input1, invNormalizationFactor);
-    input2 = _mm_mul_ps(input2, invNormalizationFactor);
+        // Apply the normalization factor
+        input1 = _mm_mul_ps(input1, invNormalizationFactor);
+        input2 = _mm_mul_ps(input2, invNormalizationFactor);
  
-    // Multiply each value by itself
-    // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
-    input1 = _mm_mul_ps(input1, input1);
-    // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
-    input2 = _mm_mul_ps(input2, input2);
+        // Multiply each value by itself
+        // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
+        input1 = _mm_mul_ps(input1, input1);
+        // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
+        input2 = _mm_mul_ps(input2, input2);
  
-    // Horizontal add, to add (r*r) + (i*i) for each complex value
-    // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
-    power = _mm_hadd_ps(input1, input2);
+        // Horizontal add, to add (r*r) + (i*i) for each complex value
+        // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
+        power = _mm_hadd_ps(input1, input2);
  
-    // Divide by the rbw
-    power = _mm_mul_ps(power, invRBW);
+        // Divide by the rbw
+        power = _mm_mul_ps(power, invRBW);
  
-    // Calculate the natural log power
-    power = logf4(power);
+        // Calculate the natural log power
+        power = logf4(power);
  
-    // Convert to log10 and multiply by 10.0
-    power = _mm_mul_ps(power, magScalar);
+        // Convert to log10 and multiply by 10.0
+        power = _mm_mul_ps(power, magScalar);
  
-    // Store the floating point results
-    _mm_store_ps(destPtr, power);
+        // Store the floating point results
+        _mm_store_ps(destPtr, power);
  
-    destPtr += 4;
-  }
+        destPtr += 4;
+    }
  
-  number = quarterPoints*4;
+    number = quarterPoints * 4;
  #endif /* LV_HAVE_LIB_SIMDMATH */
-  // Calculate the FFT for any remaining points
-  for(; number < num_points; number++){
-    // Calculate dBm
-    // 50 ohm load assumption
-    // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
-    // 75 ohm load assumption
-    // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
-
-    const float real = *inputPtr++ * iNormalizationFactor;
-    const float imag = *inputPtr++ * iNormalizationFactor;
-
-    *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW);
-    destPtr++;
-  }
-
+    // Calculate the FFT for any remaining points
+    for (; number < num_points; number++) {
+        // Calculate dBm
+        // 50 ohm load assumption
+        // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+        // 75 ohm load assumption
+        // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+
+        const float real = *inputPtr++ * iNormalizationFactor;
+        const float imag = *inputPtr++ * iNormalizationFactor;
+
+        *destPtr = 10.0 * log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW);
+        destPtr++;
+    }
  }
  #endif /* LV_HAVE_SSE3 */
  
@@ -237,31 +239,34 @@ volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* logPowerOutput, const
  #ifdef LV_HAVE_GENERIC
  
  static inline void
-volk_32fc_s32f_x2_power_spectral_density_32f_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput,
-                                                     const float normalizationFactor, const float rbw,
+volk_32fc_s32f_x2_power_spectral_density_32f_generic(float* logPowerOutput,
+                                                     const lv_32fc_t* complexFFTInput,
+                                                     const float normalizationFactor,
+                                                     const float rbw,
                                                       unsigned int num_points)
  {
-  // Calculate the Power of the complex point
-  const float* inputPtr = (float*)complexFFTInput;
-  float* realFFTDataPointsPtr = logPowerOutput;
-  unsigned int point;
-  const float invRBW = 1.0 / rbw;
-  const float iNormalizationFactor = 1.0 / normalizationFactor;
-
-  for(point = 0; point < num_points; point++){
-    // Calculate dBm
-    // 50 ohm load assumption
-    // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
-    // 75 ohm load assumption
-    // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
-
-    const float real = *inputPtr++ * iNormalizationFactor;
-    const float imag = *inputPtr++ * iNormalizationFactor;
-
-    *realFFTDataPointsPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW);
-
-    realFFTDataPointsPtr++;
-  }
+    // Calculate the Power of the complex point
+    const float* inputPtr = (float*)complexFFTInput;
+    float* realFFTDataPointsPtr = logPowerOutput;
+    unsigned int point;
+    const float invRBW = 1.0 / rbw;
+    const float iNormalizationFactor = 1.0 / normalizationFactor;
+
+    for (point = 0; point < num_points; point++) {
+        // Calculate dBm
+        // 50 ohm load assumption
+        // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+        // 75 ohm load assumption
+        // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+
+        const float real = *inputPtr++ * iNormalizationFactor;
+        const float imag = *inputPtr++ * iNormalizationFactor;
+
+        *realFFTDataPointsPtr =
+            10.0 * log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW);
+
+        realFFTDataPointsPtr++;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
diff --git a/kernels/volk/volk_32fc_s32fc_multiply_32fc.h b/kernels/volk/volk_32fc_s32fc_multiply_32fc.h

index fe416b41e07a889278196bb12a52ce6f27a5884d..840008abaab257683c7eca8fc571cbf0774b1eab 100644 (file)
--- a/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
+++ b/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
@@ -30,8 +30,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32fc_s32fc_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points);
- * \endcode
+ * void volk_32fc_s32fc_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
+ * lv_32fc_t scalar, unsigned int num_points); \endcode
   *
   * \b Inputs
   * \li aVector: The input vector to be multiplied.
@@ -76,15 +76,19 @@
  #ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
  #define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
  
+#include <float.h>
  #include <inttypes.h>
  #include <stdio.h>
  #include <volk/volk_complex.h>
-#include <float.h>
  
  #if LV_HAVE_AVX && LV_HAVE_FMA
  #include <immintrin.h>
  
-static inline void volk_32fc_s32fc_multiply_32fc_u_avx_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+static inline void volk_32fc_s32fc_multiply_32fc_u_avx_fma(lv_32fc_t* cVector,
+                                                           const lv_32fc_t* aVector,
+                                                           const lv_32fc_t scalar,
+                                                           unsigned int num_points)
+{
      unsigned int number = 0;
      unsigned int i = 0;
      const unsigned int quarterPoints = num_points / 4;
@@ -97,34 +101,38 @@ static inline void volk_32fc_s32fc_multiply_32fc_u_avx_fma(lv_32fc_t* cVector, c
      yl = _mm256_set1_ps(lv_creal(scalar));
      yh = _mm256_set1_ps(lv_cimag(scalar));
  
-    for(;number < quarterPoints; number++){
-      x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+    for (; number < quarterPoints; number++) {
+        x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
  
-      tmp1 = x;
+        tmp1 = x;
  
-      x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
  
-      tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
  
-      z = _mm256_fmaddsub_ps(tmp1, yl,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+        z = _mm256_fmaddsub_ps(
+            tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
  
-      _mm256_storeu_ps((float*)c,z); // Store the results back into the C container
+        _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
  
-      a += 4;
-      c += 4;
+        a += 4;
+        c += 4;
      }
  
-    for(i = num_points-isodd; i < num_points; i++) {
+    for (i = num_points - isodd; i < num_points; i++) {
          *c++ = (*a++) * scalar;
      }
-
  }
  #endif /* LV_HAVE_AVX && LV_HAVE_FMA */
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector,
+                                                       const lv_32fc_t* aVector,
+                                                       const lv_32fc_t scalar,
+                                                       unsigned int num_points)
+{
      unsigned int number = 0;
      unsigned int i = 0;
      const unsigned int quarterPoints = num_points / 4;
@@ -137,35 +145,39 @@ static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector, const
      yl = _mm256_set1_ps(lv_creal(scalar));
      yh = _mm256_set1_ps(lv_cimag(scalar));
  
-    for(;number < quarterPoints; number++){
-      x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+    for (; number < quarterPoints; number++) {
+        x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
  
-      tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+        tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
  
-      x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
  
-      tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
  
-      z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+        z = _mm256_addsub_ps(tmp1,
+                             tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
  
-      _mm256_storeu_ps((float*)c,z); // Store the results back into the C container
+        _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
  
-      a += 4;
-      c += 4;
+        a += 4;
+        c += 4;
      }
  
-    for(i = num_points-isodd; i < num_points; i++) {
+    for (i = num_points - isodd; i < num_points; i++) {
          *c++ = (*a++) * scalar;
      }
-
  }
  #endif /* LV_HAVE_AVX */
  
  #ifdef LV_HAVE_SSE3
  #include <pmmintrin.h>
  
-static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
-  unsigned int number = 0;
+static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector,
+                                                        const lv_32fc_t* aVector,
+                                                        const lv_32fc_t scalar,
+                                                        unsigned int num_points)
+{
+    unsigned int number = 0;
      const unsigned int halfPoints = num_points / 2;
  
      __m128 x, yl, yh, z, tmp1, tmp2;
@@ -176,53 +188,58 @@ static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, cons
      yl = _mm_set_ps1(lv_creal(scalar));
      yh = _mm_set_ps1(lv_cimag(scalar));
  
-    for(;number < halfPoints; number++){
+    for (; number < halfPoints; number++) {
  
-      x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+        x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
  
-      tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+        tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
  
-      x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+        x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
  
-      tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+        tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
  
-      z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+        z = _mm_addsub_ps(tmp1,
+                          tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
  
-      _mm_storeu_ps((float*)c,z); // Store the results back into the C container
+        _mm_storeu_ps((float*)c, z); // Store the results back into the C container
  
-      a += 2;
-      c += 2;
+        a += 2;
+        c += 2;
      }
  
-    if((num_points % 2) != 0) {
-      *c = (*a) * scalar;
+    if ((num_points % 2) != 0) {
+        *c = (*a) * scalar;
      }
  }
  #endif /* LV_HAVE_SSE */
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector,
+                                                         const lv_32fc_t* aVector,
+                                                         const lv_32fc_t scalar,
+                                                         unsigned int num_points)
+{
      lv_32fc_t* cPtr = cVector;
      const lv_32fc_t* aPtr = aVector;
      unsigned int number = num_points;
  
      // unwrap loop
-    while (number >= 8){
-      *cPtr++ = (*aPtr++) * scalar;
-      *cPtr++ = (*aPtr++) * scalar;
-      *cPtr++ = (*aPtr++) * scalar;
-      *cPtr++ = (*aPtr++) * scalar;
-      *cPtr++ = (*aPtr++) * scalar;
-      *cPtr++ = (*aPtr++) * scalar;
-      *cPtr++ = (*aPtr++) * scalar;
-      *cPtr++ = (*aPtr++) * scalar;
-      number -= 8;
+    while (number >= 8) {
+        *cPtr++ = (*aPtr++) * scalar;
+        *cPtr++ = (*aPtr++) * scalar;
+        *cPtr++ = (*aPtr++) * scalar;
+        *cPtr++ = (*aPtr++) * scalar;
+        *cPtr++ = (*aPtr++) * scalar;
+        *cPtr++ = (*aPtr++) * scalar;
+        *cPtr++ = (*aPtr++) * scalar;
+        *cPtr++ = (*aPtr++) * scalar;
+        number -= 8;
      }
  
      // clean up any remaining
      while (number-- > 0)
-      *cPtr++ = *aPtr++ * scalar;
+        *cPtr++ = *aPtr++ * scalar;
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -231,15 +248,19 @@ static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, con
  #ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
  #define INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
  
+#include <float.h>
  #include <inttypes.h>
  #include <stdio.h>
  #include <volk/volk_complex.h>
-#include <float.h>
  
  #if LV_HAVE_AVX && LV_HAVE_FMA
  #include <immintrin.h>
  
-static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector,
+                                                           const lv_32fc_t* aVector,
+                                                           const lv_32fc_t scalar,
+                                                           unsigned int num_points)
+{
      unsigned int number = 0;
      unsigned int i = 0;
      const unsigned int quarterPoints = num_points / 4;
@@ -252,27 +273,27 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector, c
      yl = _mm256_set1_ps(lv_creal(scalar));
      yh = _mm256_set1_ps(lv_cimag(scalar));
  
-    for(;number < quarterPoints; number++){
-      x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+    for (; number < quarterPoints; number++) {
+        x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
  
-      tmp1 = x;
+        tmp1 = x;
  
-      x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
  
-      tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
  
-      z = _mm256_fmaddsub_ps(tmp1, yl,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+        z = _mm256_fmaddsub_ps(
+            tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
  
-      _mm256_store_ps((float*)c,z); // Store the results back into the C container
+        _mm256_store_ps((float*)c, z); // Store the results back into the C container
  
-      a += 4;
-      c += 4;
+        a += 4;
+        c += 4;
      }
  
-    for(i = num_points-isodd; i < num_points; i++) {
+    for (i = num_points - isodd; i < num_points; i++) {
          *c++ = (*a++) * scalar;
      }
-
  }
  #endif /* LV_HAVE_AVX && LV_HAVE_FMA */
  
@@ -280,7 +301,11 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector, c
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector,
+                                                       const lv_32fc_t* aVector,
+                                                       const lv_32fc_t scalar,
+                                                       unsigned int num_points)
+{
      unsigned int number = 0;
      unsigned int i = 0;
      const unsigned int quarterPoints = num_points / 4;
@@ -293,35 +318,39 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector, const
      yl = _mm256_set1_ps(lv_creal(scalar));
      yh = _mm256_set1_ps(lv_cimag(scalar));
  
-    for(;number < quarterPoints; number++){
-      x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+    for (; number < quarterPoints; number++) {
+        x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
  
-      tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+        tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
  
-      x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
  
-      tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
  
-      z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+        z = _mm256_addsub_ps(tmp1,
+                             tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
  
-      _mm256_store_ps((float*)c,z); // Store the results back into the C container
+        _mm256_store_ps((float*)c, z); // Store the results back into the C container
  
-      a += 4;
-      c += 4;
+        a += 4;
+        c += 4;
      }
  
-    for(i = num_points-isodd; i < num_points; i++) {
+    for (i = num_points - isodd; i < num_points; i++) {
          *c++ = (*a++) * scalar;
      }
-
  }
  #endif /* LV_HAVE_AVX */
  
  #ifdef LV_HAVE_SSE3
  #include <pmmintrin.h>
  
-static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
-  unsigned int number = 0;
+static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector,
+                                                        const lv_32fc_t* aVector,
+                                                        const lv_32fc_t scalar,
+                                                        unsigned int num_points)
+{
+    unsigned int number = 0;
      const unsigned int halfPoints = num_points / 2;
  
      __m128 x, yl, yh, z, tmp1, tmp2;
@@ -332,26 +361,27 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, cons
      yl = _mm_set_ps1(lv_creal(scalar));
      yh = _mm_set_ps1(lv_cimag(scalar));
  
-    for(;number < halfPoints; number++){
+    for (; number < halfPoints; number++) {
  
-      x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+        x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
  
-      tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+        tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
  
-      x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+        x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
  
-      tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+        tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
  
-      z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+        z = _mm_addsub_ps(tmp1,
+                          tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
  
-      _mm_store_ps((float*)c,z); // Store the results back into the C container
+        _mm_store_ps((float*)c, z); // Store the results back into the C container
  
-      a += 2;
-      c += 2;
+        a += 2;
+        c += 2;
      }
  
-    if((num_points % 2) != 0) {
-      *c = (*a) * scalar;
+    if ((num_points % 2) != 0) {
+        *c = (*a) * scalar;
      }
  }
  #endif /* LV_HAVE_SSE */
@@ -359,7 +389,11 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, cons
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector,
+                                                      const lv_32fc_t* aVector,
+                                                      const lv_32fc_t scalar,
+                                                      unsigned int num_points)
+{
      lv_32fc_t* cPtr = cVector;
      const lv_32fc_t* aPtr = aVector;
      unsigned int number = num_points;
@@ -370,7 +404,7 @@ static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector, const
  
      scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar);
      scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1);
-    for(number = 0; number < quarter_points; ++number) {
+    for (number = 0; number < quarter_points; ++number) {
          a_val = vld2q_f32((float*)aPtr);
          tmp_imag.val[1] = vmulq_f32(a_val.val[1], scalar_val.val[0]);
          tmp_imag.val[0] = vmulq_f32(a_val.val[0], scalar_val.val[0]);
@@ -383,35 +417,39 @@ static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector, const
          cPtr += 4;
      }
  
-    for(number = quarter_points*4; number < num_points; number++){
-      *cPtr++ = *aPtr++ * scalar;
+    for (number = quarter_points * 4; number < num_points; number++) {
+        *cPtr++ = *aPtr++ * scalar;
      }
  }
  #endif /* LV_HAVE_NEON */
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector,
+                                                           const lv_32fc_t* aVector,
+                                                           const lv_32fc_t scalar,
+                                                           unsigned int num_points)
+{
      lv_32fc_t* cPtr = cVector;
      const lv_32fc_t* aPtr = aVector;
      unsigned int number = num_points;
  
      // unwrap loop
-    while (number >= 8){
-      *cPtr++ = (*aPtr++) * scalar;
-      *cPtr++ = (*aPtr++) * scalar;
-      *cPtr++ = (*aPtr++) * scalar;
-      *cPtr++ = (*aPtr++) * scalar;
-      *cPtr++ = (*aPtr++) * scalar;
-      *cPtr++ = (*aPtr++) * scalar;
-      *cPtr++ = (*aPtr++) * scalar;
-      *cPtr++ = (*aPtr++) * scalar;
-      number -= 8;
+    while (number >= 8) {
+        *cPtr++ = (*aPtr++) * scalar;
+        *cPtr++ = (*aPtr++) * scalar;
+        *cPtr++ = (*aPtr++) * scalar;
+        *cPtr++ = (*aPtr++) * scalar;
+        *cPtr++ = (*aPtr++) * scalar;
+        *cPtr++ = (*aPtr++) * scalar;
+        *cPtr++ = (*aPtr++) * scalar;
+        *cPtr++ = (*aPtr++) * scalar;
+        number -= 8;
      }
  
      // clean up any remaining
      while (number-- > 0)
-      *cPtr++ = *aPtr++ * scalar;
+        *cPtr++ = *aPtr++ * scalar;
  }
  #endif /* LV_HAVE_GENERIC */
  
diff --git a/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h b/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h

index 181abc52141e714d5c666bedcfa5c4ab26dbb9e1..eba98fecfe5304cbded3d8d96dc9bf843f280e11 100644 (file)
--- a/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h
+++ b/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h
@@ -25,19 +25,24 @@
  #define INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H
  
  
-#include <volk/volk_complex.h>
  #include <stdio.h>
  #include <volk/volk_32fc_s32fc_x2_rotator_32fc.h>
+#include <volk/volk_complex.h>
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
-    lv_32fc_t phase[1] = {lv_cmake(.3, 0.95393)};
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVector,
+                                                              const lv_32fc_t* inVector,
+                                                              const lv_32fc_t phase_inc,
+                                                              unsigned int num_points)
+{
+    lv_32fc_t phase[1] = { lv_cmake(.3, 0.95393) };
      (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
-    const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
-    volk_32fc_s32fc_x2_rotator_32fc_generic(outVector, inVector, phase_inc_n, phase, num_points);
-
+    const lv_32fc_t phase_inc_n =
+        phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+    volk_32fc_s32fc_x2_rotator_32fc_generic(
+        outVector, inVector, phase_inc_n, phase, num_points);
  }
  
  #endif /* LV_HAVE_GENERIC */
@@ -47,12 +52,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVect
  #include <arm_neon.h>
  #include <volk/volk_neon_intrinsics.h>
  
-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_neon(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
-    lv_32fc_t phase[1] = {lv_cmake(.3, 0.95393)};
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_neon(lv_32fc_t* outVector,
+                                                           const lv_32fc_t* inVector,
+                                                           const lv_32fc_t phase_inc,
+                                                           unsigned int num_points)
+{
+    lv_32fc_t phase[1] = { lv_cmake(.3, 0.95393) };
      (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
-    const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
-    volk_32fc_s32fc_x2_rotator_32fc_neon(outVector, inVector, phase_inc_n, phase, num_points);
-    
+    const lv_32fc_t phase_inc_n =
+        phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+    volk_32fc_s32fc_x2_rotator_32fc_neon(
+        outVector, inVector, phase_inc_n, phase, num_points);
  }
  
  #endif /* LV_HAVE_NEON */
@@ -61,12 +71,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_neon(lv_32fc_t* outVector,
  #ifdef LV_HAVE_SSE4_1
  #include <smmintrin.h>
  
-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
-    lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVector,
+                                                               const lv_32fc_t* inVector,
+                                                               const lv_32fc_t phase_inc,
+                                                               unsigned int num_points)
+{
+    lv_32fc_t phase[1] = { lv_cmake(.3, .95393) };
      (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
-    const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
-    volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(outVector, inVector, phase_inc_n, phase, num_points);
-
+    const lv_32fc_t phase_inc_n =
+        phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+    volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(
+        outVector, inVector, phase_inc_n, phase, num_points);
  }
  
  #endif /* LV_HAVE_SSE4_1 */
@@ -74,12 +89,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVec
  
  #ifdef LV_HAVE_SSE4_1
  #include <smmintrin.h>
-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
-    lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_sse4_1(lv_32fc_t* outVector,
+                                                               const lv_32fc_t* inVector,
+                                                               const lv_32fc_t phase_inc,
+                                                               unsigned int num_points)
+{
+    lv_32fc_t phase[1] = { lv_cmake(.3, .95393) };
      (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
-    const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
-    volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(outVector, inVector, phase_inc_n, phase, num_points);
-
+    const lv_32fc_t phase_inc_n =
+        phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+    volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(
+        outVector, inVector, phase_inc_n, phase, num_points);
  }
  
  #endif /* LV_HAVE_SSE4_1 */
@@ -88,11 +108,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_sse4_1(lv_32fc_t* outVec
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
-    lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector,
+                                                            const lv_32fc_t* inVector,
+                                                            const lv_32fc_t phase_inc,
+                                                            unsigned int num_points)
+{
+    lv_32fc_t phase[1] = { lv_cmake(.3, .95393) };
      (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
-    const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
-    volk_32fc_s32fc_x2_rotator_32fc_a_avx(outVector, inVector, phase_inc_n, phase, num_points);
+    const lv_32fc_t phase_inc_n =
+        phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+    volk_32fc_s32fc_x2_rotator_32fc_a_avx(
+        outVector, inVector, phase_inc_n, phase, num_points);
  }
  
  #endif /* LV_HAVE_AVX */
@@ -101,11 +127,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
-    lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx(lv_32fc_t* outVector,
+                                                            const lv_32fc_t* inVector,
+                                                            const lv_32fc_t phase_inc,
+                                                            unsigned int num_points)
+{
+    lv_32fc_t phase[1] = { lv_cmake(.3, .95393) };
      (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
-    const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
-    volk_32fc_s32fc_x2_rotator_32fc_u_avx(outVector, inVector, phase_inc_n, phase, num_points);
+    const lv_32fc_t phase_inc_n =
+        phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+    volk_32fc_s32fc_x2_rotator_32fc_u_avx(
+        outVector, inVector, phase_inc_n, phase, num_points);
  }
  
  #endif /* LV_HAVE_AVX */
@@ -113,11 +145,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx(lv_32fc_t* outVector
  #if LV_HAVE_AVX && LV_HAVE_FMA
  #include <immintrin.h>
  
-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
-    lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx_fma(lv_32fc_t* outVector,
+                                                                const lv_32fc_t* inVector,
+                                                                const lv_32fc_t phase_inc,
+                                                                unsigned int num_points)
+{
+    lv_32fc_t phase[1] = { lv_cmake(.3, .95393) };
      (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
-    const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
-    volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(outVector, inVector, phase_inc_n, phase, num_points);
+    const lv_32fc_t phase_inc_n =
+        phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+    volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(
+        outVector, inVector, phase_inc_n, phase, num_points);
  }
  
  #endif /* LV_HAVE_AVX && LV_HAVE_FMA*/
@@ -126,11 +164,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx_fma(lv_32fc_t* outVe
  #if LV_HAVE_AVX && LV_HAVE_FMA
  #include <immintrin.h>
  
-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
-    lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx_fma(lv_32fc_t* outVector,
+                                                                const lv_32fc_t* inVector,
+                                                                const lv_32fc_t phase_inc,
+                                                                unsigned int num_points)
+{
+    lv_32fc_t phase[1] = { lv_cmake(.3, .95393) };
      (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
-    const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
-    volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(outVector, inVector, phase_inc_n, phase, num_points);
+    const lv_32fc_t phase_inc_n =
+        phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+    volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(
+        outVector, inVector, phase_inc_n, phase, num_points);
  }
  
  #endif /* LV_HAVE_AVX && LV_HAVE_FMA*/
diff --git a/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h b/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h

index a886458fdfe18fd768bf51bae44169f123d02e7f..c97b8cb159eecaa8d7fffab44304dbdf65ae9344 100644 (file)
--- a/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h
+++ b/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h
@@ -30,14 +30,15 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32fc_s32fc_x2_rotator_32fc(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
- * \endcode
+ * void volk_32fc_s32fc_x2_rotator_32fc(lv_32fc_t* outVector, const lv_32fc_t* inVector,
+ * const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li inVector: Vector to be rotated.
   * \li phase_inc: rotational velocity.
   * \li phase: initial phase offset.
- * \li num_points: The number of values in inVector to be rotated and stored into outVector.
+ * \li num_points: The number of values in inVector to be rotated and stored into
+ * outVector.
   *
   * \b Outputs
   * \li outVector: The vector where the results will be stored.
@@ -81,31 +82,36 @@
  #define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
  
  
-#include <volk/volk_complex.h>
+#include <math.h>
  #include <stdio.h>
  #include <stdlib.h>
-#include <math.h>
+#include <volk/volk_complex.h>
  #define ROTATOR_RELOAD 512
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector,
+                                                           const lv_32fc_t* inVector,
+                                                           const lv_32fc_t phase_inc,
+                                                           lv_32fc_t* phase,
+                                                           unsigned int num_points)
+{
      unsigned int i = 0;
      int j = 0;
-    for(i = 0; i < (unsigned int)(num_points/ROTATOR_RELOAD); ++i) {
-        for(j = 0; j < ROTATOR_RELOAD; ++j) {
+    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
+        for (j = 0; j < ROTATOR_RELOAD; ++j) {
              *outVector++ = *inVector++ * (*phase);
              (*phase) *= phase_inc;
          }
  
          (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
      }
-    for(i = 0; i < num_points%ROTATOR_RELOAD; ++i) {
+    for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) {
          *outVector++ = *inVector++ * (*phase);
          (*phase) *= phase_inc;
      }
-    if(i){
+    if (i) {
          // Make sure, we normalize phase on every call!
          (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
      }
@@ -118,43 +124,47 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector,
  #include <arm_neon.h>
  #include <volk/volk_neon_intrinsics.h>
  
-static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
+static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector,
+                                                        const lv_32fc_t* inVector,
+                                                        const lv_32fc_t phase_inc,
+                                                        lv_32fc_t* phase,
+                                                        unsigned int num_points)
  
  {
      lv_32fc_t* outputVectorPtr = outVector;
      const lv_32fc_t* inputVectorPtr = inVector;
      lv_32fc_t incr = 1;
-    lv_32fc_t phasePtr[4] = {(*phase), (*phase), (*phase), (*phase)};
+    lv_32fc_t phasePtr[4] = { (*phase), (*phase), (*phase), (*phase) };
      float32x4x2_t input_vec;
      float32x4x2_t output_vec;
-    
+
      unsigned int i = 0, j = 0;
      const unsigned int quarter_points = num_points / 4;
-    
-    for(i = 0; i < 4; ++i) {
+
+    for (i = 0; i < 4; ++i) {
          phasePtr[i] *= incr;
          incr *= (phase_inc);
      }
-    
+
      // Notice that incr has be incremented in the previous loop
-    const lv_32fc_t incrPtr[4] = {incr, incr, incr, incr};
-    const float32x4x2_t incr_vec = vld2q_f32((float*) incrPtr);
-    float32x4x2_t phase_vec = vld2q_f32((float*) phasePtr);
-    
-    for(i = 0; i < (unsigned int)(quarter_points/ROTATOR_RELOAD); i++) {
-        for(j = 0; j < ROTATOR_RELOAD; j++) {
-            input_vec = vld2q_f32((float*) inputVectorPtr);
+    const lv_32fc_t incrPtr[4] = { incr, incr, incr, incr };
+    const float32x4x2_t incr_vec = vld2q_f32((float*)incrPtr);
+    float32x4x2_t phase_vec = vld2q_f32((float*)phasePtr);
+
+    for (i = 0; i < (unsigned int)(quarter_points / ROTATOR_RELOAD); i++) {
+        for (j = 0; j < ROTATOR_RELOAD; j++) {
+            input_vec = vld2q_f32((float*)inputVectorPtr);
              // Prefetch next one, speeds things up
-            __VOLK_PREFETCH(inputVectorPtr+4);
+            __VOLK_PREFETCH(inputVectorPtr + 4);
              // Rotate
              output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
              // Increase phase
              phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
              // Store output
              vst2q_f32((float*)outputVectorPtr, output_vec);
-            
-            outputVectorPtr+=4;
-            inputVectorPtr+=4;
+
+            outputVectorPtr += 4;
+            inputVectorPtr += 4;
          }
          // normalize phase so magnitude doesn't grow because of
          // floating point rounding error
@@ -164,20 +174,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector, co
          phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
          phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
      }
-    
-    for(i = 0; i < quarter_points % ROTATOR_RELOAD; i++) {
-        input_vec = vld2q_f32((float*) inputVectorPtr);
+
+    for (i = 0; i < quarter_points % ROTATOR_RELOAD; i++) {
+        input_vec = vld2q_f32((float*)inputVectorPtr);
          // Prefetch next one, speeds things up
-        __VOLK_PREFETCH(inputVectorPtr+4);
+        __VOLK_PREFETCH(inputVectorPtr + 4);
          // Rotate
          output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
          // Increase phase
          phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
          // Store output
          vst2q_f32((float*)outputVectorPtr, output_vec);
-        
-        outputVectorPtr+=4;
-        inputVectorPtr+=4;
+
+        outputVectorPtr += 4;
+        inputVectorPtr += 4;
      }
      // if(i) == true means we looped above
      if (i) {
@@ -191,13 +201,13 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector, co
      }
      // Store current phase
      vst2q_f32((float*)phasePtr, phase_vec);
-    
+
      // Deal with the rest
-    for(i = 0; i < num_points % 4; i++) {
+    for (i = 0; i < num_points % 4; i++) {
          *outputVectorPtr++ = *inputVectorPtr++ * phasePtr[0];
          phasePtr[0] *= (phase_inc);
      }
-    
+
      // For continious phase next time we need to call this function
      (*phase) = phasePtr[0];
  }
@@ -208,15 +218,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector, co
  #ifdef LV_HAVE_SSE4_1
  #include <smmintrin.h>
  
-static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector,
+                                                            const lv_32fc_t* inVector,
+                                                            const lv_32fc_t phase_inc,
+                                                            lv_32fc_t* phase,
+                                                            unsigned int num_points)
+{
      lv_32fc_t* cPtr = outVector;
      const lv_32fc_t* aPtr = inVector;
      lv_32fc_t incr = 1;
-    lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)};
+    lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };
  
      unsigned int i, j = 0;
  
-    for(i = 0; i < 2; ++i) {
+    for (i = 0; i < 2; ++i) {
          phase_Ptr[i] *= incr;
          incr *= (phase_inc);
      }
@@ -227,13 +242,13 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector
      __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
  
      phase_Val = _mm_loadu_ps((float*)phase_Ptr);
-    inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
+    inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));
  
      const unsigned int halfPoints = num_points / 2;
  
  
-    for(i = 0; i < (unsigned int)(halfPoints/ROTATOR_RELOAD); i++) {
-        for(j = 0; j < ROTATOR_RELOAD; ++j) {
+    for (i = 0; i < (unsigned int)(halfPoints / ROTATOR_RELOAD); i++) {
+        for (j = 0; j < ROTATOR_RELOAD; ++j) {
  
              aVal = _mm_load_ps((float*)aPtr);
  
@@ -264,7 +279,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector
          tmp2 = _mm_sqrt_ps(tmp1);
          phase_Val = _mm_div_ps(phase_Val, tmp2);
      }
-    for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) {
+    for (i = 0; i < halfPoints % ROTATOR_RELOAD; ++i) {
          aVal = _mm_load_ps((float*)aPtr);
  
          yl = _mm_moveldup_ps(phase_Val);
@@ -304,7 +319,6 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector
      }
  
      (*phase) = phase_Ptr[0];
-
  }
  
  #endif /* LV_HAVE_SSE4_1 for aligned */
@@ -313,15 +327,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector
  #ifdef LV_HAVE_SSE4_1
  #include <smmintrin.h>
  
-static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector,
+                                                            const lv_32fc_t* inVector,
+                                                            const lv_32fc_t phase_inc,
+                                                            lv_32fc_t* phase,
+                                                            unsigned int num_points)
+{
      lv_32fc_t* cPtr = outVector;
      const lv_32fc_t* aPtr = inVector;
      lv_32fc_t incr = 1;
-    lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)};
+    lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };
  
      unsigned int i, j = 0;
  
-    for(i = 0; i < 2; ++i) {
+    for (i = 0; i < 2; ++i) {
          phase_Ptr[i] *= incr;
          incr *= (phase_inc);
      }
@@ -332,13 +351,13 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector
      __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
  
      phase_Val = _mm_loadu_ps((float*)phase_Ptr);
-    inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
+    inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));
  
      const unsigned int halfPoints = num_points / 2;
  
  
-    for(i = 0; i < (unsigned int)(halfPoints/ROTATOR_RELOAD); i++) {
-        for(j = 0; j < ROTATOR_RELOAD; ++j) {
+    for (i = 0; i < (unsigned int)(halfPoints / ROTATOR_RELOAD); i++) {
+        for (j = 0; j < ROTATOR_RELOAD; ++j) {
  
              aVal = _mm_loadu_ps((float*)aPtr);
  
@@ -369,7 +388,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector
          tmp2 = _mm_sqrt_ps(tmp1);
          phase_Val = _mm_div_ps(phase_Val, tmp2);
      }
-    for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) {
+    for (i = 0; i < halfPoints % ROTATOR_RELOAD; ++i) {
          aVal = _mm_loadu_ps((float*)aPtr);
  
          yl = _mm_moveldup_ps(phase_Val);
@@ -409,7 +428,6 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector
      }
  
      (*phase) = phase_Ptr[0];
-
  }
  
  #endif /* LV_HAVE_SSE4_1 */
@@ -419,15 +437,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector
  #include <immintrin.h>
  #include <volk/volk_avx_intrinsics.h>
  
-static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector,
+                                                         const lv_32fc_t* inVector,
+                                                         const lv_32fc_t phase_inc,
+                                                         lv_32fc_t* phase,
+                                                         unsigned int num_points)
+{
      lv_32fc_t* cPtr = outVector;
      const lv_32fc_t* aPtr = inVector;
      lv_32fc_t incr = lv_cmake(1.0, 0.0);
-    lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};
+    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
  
      unsigned int i, j = 0;
  
-    for(i = 0; i < 4; ++i) {
+    for (i = 0; i < 4; ++i) {
          phase_Ptr[i] *= incr;
          incr *= (phase_inc);
      }
@@ -435,16 +458,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c
      __m256 aVal, phase_Val, z;
  
      phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
-    
-    const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),
-                                         lv_cimag(incr), lv_creal(incr),
-                                         lv_cimag(incr), lv_creal(incr),
-                                         lv_cimag(incr), lv_creal(incr));
+
+    const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
+                                         lv_creal(incr),
+                                         lv_cimag(incr),
+                                         lv_creal(incr),
+                                         lv_cimag(incr),
+                                         lv_creal(incr),
+                                         lv_cimag(incr),
+                                         lv_creal(incr));
  
      const unsigned int fourthPoints = num_points / 4;
  
-    for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) {
-        for(j = 0; j < ROTATOR_RELOAD; ++j) {
+    for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); i++) {
+        for (j = 0; j < ROTATOR_RELOAD; ++j) {
  
              aVal = _mm256_load_ps((float*)aPtr);
  
@@ -458,8 +485,8 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c
          }
          phase_Val = _mm256_normalize_ps(phase_Val);
      }
-    
-    for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
+
+    for (i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) {
          aVal = _mm256_load_ps((float*)aPtr);
  
          z = _mm256_complexmul_ps(aVal, phase_Val);
@@ -473,10 +500,10 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c
      if (i) {
          phase_Val = _mm256_normalize_ps(phase_Val);
      }
-    
+
      _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
      (*phase) = phase_Ptr[0];
-    volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points%4);
+    volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4);
  }
  
  #endif /* LV_HAVE_AVX for aligned */
@@ -486,15 +513,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c
  #include <immintrin.h>
  #include <volk/volk_avx_intrinsics.h>
  
-static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector,
+                                                         const lv_32fc_t* inVector,
+                                                         const lv_32fc_t phase_inc,
+                                                         lv_32fc_t* phase,
+                                                         unsigned int num_points)
+{
      lv_32fc_t* cPtr = outVector;
      const lv_32fc_t* aPtr = inVector;
      lv_32fc_t incr = lv_cmake(1.0, 0.0);
-    lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};
+    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
  
      unsigned int i, j = 0;
  
-    for(i = 0; i < 4; ++i) {
+    for (i = 0; i < 4; ++i) {
          phase_Ptr[i] *= incr;
          incr *= (phase_inc);
      }
@@ -502,19 +534,23 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, c
      __m256 aVal, phase_Val, z;
  
      phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
-    
-    const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),
-                                         lv_cimag(incr), lv_creal(incr),
-                                         lv_cimag(incr), lv_creal(incr),
-                                         lv_cimag(incr), lv_creal(incr));
-    
+
+    const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
+                                         lv_creal(incr),
+                                         lv_cimag(incr),
+                                         lv_creal(incr),
+                                         lv_cimag(incr),
+                                         lv_creal(incr),
+                                         lv_cimag(incr),
+                                         lv_creal(incr));
+
      const unsigned int fourthPoints = num_points / 4;
  
-    for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); ++i) {
-        for(j = 0; j < ROTATOR_RELOAD; ++j) {
+    for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); ++i) {
+        for (j = 0; j < ROTATOR_RELOAD; ++j) {
  
              aVal = _mm256_loadu_ps((float*)aPtr);
-            
+
              z = _mm256_complexmul_ps(aVal, phase_Val);
              phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
  
@@ -524,10 +560,9 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, c
              cPtr += 4;
          }
          phase_Val = _mm256_normalize_ps(phase_Val);
-        
      }
-    
-    for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
+
+    for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) {
          aVal = _mm256_loadu_ps((float*)aPtr);
  
          z = _mm256_complexmul_ps(aVal, phase_Val);
@@ -544,7 +579,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, c
  
      _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
      (*phase) = phase_Ptr[0];
-    volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points%4);
+    volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4);
  }
  
  #endif /* LV_HAVE_AVX */
@@ -552,15 +587,21 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, c
  #if LV_HAVE_AVX && LV_HAVE_FMA
  #include <immintrin.h>
  
-static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVector,
+                                                             const lv_32fc_t* inVector,
+                                                             const lv_32fc_t phase_inc,
+                                                             lv_32fc_t* phase,
+                                                             unsigned int num_points)
+{
      lv_32fc_t* cPtr = outVector;
      const lv_32fc_t* aPtr = inVector;
      lv_32fc_t incr = 1;
-    __VOLK_ATTR_ALIGNED(32) lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};
+    __VOLK_ATTR_ALIGNED(32)
+    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
  
      unsigned int i, j = 0;
  
-    for(i = 0; i < 4; ++i) {
+    for (i = 0; i < 4; ++i) {
          phase_Ptr[i] *= incr;
          incr *= (phase_inc);
      }
@@ -568,11 +609,18 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVecto
      __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
  
      phase_Val = _mm256_load_ps((float*)phase_Ptr);
-    inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
+    inc_Val = _mm256_set_ps(lv_cimag(incr),
+                            lv_creal(incr),
+                            lv_cimag(incr),
+                            lv_creal(incr),
+                            lv_cimag(incr),
+                            lv_creal(incr),
+                            lv_cimag(incr),
+                            lv_creal(incr));
      const unsigned int fourthPoints = num_points / 4;
  
-    for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) {
-        for(j = 0; j < ROTATOR_RELOAD; ++j) {
+    for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); i++) {
+        for (j = 0; j < ROTATOR_RELOAD; ++j) {
  
              aVal = _mm256_load_ps((float*)aPtr);
  
@@ -603,7 +651,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVecto
          tmp2 = _mm256_sqrt_ps(tmp1);
          phase_Val = _mm256_div_ps(phase_Val, tmp2);
      }
-    for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
+    for (i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) {
          aVal = _mm256_load_ps((float*)aPtr);
  
          yl = _mm256_moveldup_ps(phase_Val);
@@ -636,13 +684,12 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVecto
      }
  
      _mm256_store_ps((float*)phase_Ptr, phase_Val);
-    for(i = 0; i < num_points%4; ++i) {
+    for (i = 0; i < num_points % 4; ++i) {
          *cPtr++ = *aPtr++ * phase_Ptr[0];
          phase_Ptr[0] *= (phase_inc);
      }
  
      (*phase) = phase_Ptr[0];
-
  }
  
  #endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned*/
@@ -650,15 +697,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVecto
  #if LV_HAVE_AVX && LV_HAVE_FMA
  #include <immintrin.h>
  
-static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVector,
+                                                             const lv_32fc_t* inVector,
+                                                             const lv_32fc_t phase_inc,
+                                                             lv_32fc_t* phase,
+                                                             unsigned int num_points)
+{
      lv_32fc_t* cPtr = outVector;
      const lv_32fc_t* aPtr = inVector;
      lv_32fc_t incr = 1;
-    lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};
+    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
  
      unsigned int i, j = 0;
  
-    for(i = 0; i < 4; ++i) {
+    for (i = 0; i < 4; ++i) {
          phase_Ptr[i] *= incr;
          incr *= (phase_inc);
      }
@@ -666,11 +718,18 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVecto
      __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
  
      phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
-    inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
+    inc_Val = _mm256_set_ps(lv_cimag(incr),
+                            lv_creal(incr),
+                            lv_cimag(incr),
+                            lv_creal(incr),
+                            lv_cimag(incr),
+                            lv_creal(incr),
+                            lv_cimag(incr),
+                            lv_creal(incr));
      const unsigned int fourthPoints = num_points / 4;
  
-    for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) {
-        for(j = 0; j < ROTATOR_RELOAD; ++j) {
+    for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); i++) {
+        for (j = 0; j < ROTATOR_RELOAD; ++j) {
  
              aVal = _mm256_loadu_ps((float*)aPtr);
  
@@ -701,7 +760,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVecto
          tmp2 = _mm256_sqrt_ps(tmp1);
          phase_Val = _mm256_div_ps(phase_Val, tmp2);
      }
-    for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
+    for (i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) {
          aVal = _mm256_loadu_ps((float*)aPtr);
  
          yl = _mm256_moveldup_ps(phase_Val);
@@ -734,13 +793,12 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVecto
      }
  
      _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
-    for(i = 0; i < num_points%4; ++i) {
+    for (i = 0; i < num_points % 4; ++i) {
          *cPtr++ = *aPtr++ * phase_Ptr[0];
          phase_Ptr[0] *= (phase_inc);
      }
  
      (*phase) = phase_Ptr[0];
-
  }
  
  #endif /* LV_HAVE_AVX && LV_HAVE_FMA*/
diff --git a/kernels/volk/volk_32fc_x2_add_32fc.h b/kernels/volk/volk_32fc_x2_add_32fc.h

index 90ff78731bde456fd1ab90efaab6161f0c99f1ff..e7356c390f9f430b3a645de6fca904dc8cba6909 100644 (file)
--- a/kernels/volk/volk_32fc_x2_add_32fc.h
+++ b/kernels/volk/volk_32fc_x2_add_32fc.h
@@ -31,8 +31,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32fc_x2_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points)
- * \endcode
+ * void volk_32fc_x2_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
+ * lv_32fc_t* bVector, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li aVector: First vector of input points.
@@ -44,7 +44,8 @@
   *
   * \b Example
   *
- * The follow example adds the increasing and decreasing vectors such that the result of every summation pair is 10
+ * The follow example adds the increasing and decreasing vectors such that the result of
+ * every summation pair is 10
   *
   * \code
   *   int N = 10;
@@ -76,36 +77,38 @@
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32fc_x2_add_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                          const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_add_32fc_u_avx(lv_32fc_t* cVector,
+                                               const lv_32fc_t* aVector,
+                                               const lv_32fc_t* bVector,
+                                               unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  lv_32fc_t* cPtr = cVector;
-  const lv_32fc_t* aPtr = aVector;
-  const lv_32fc_t* bPtr=  bVector;
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    const lv_32fc_t* bPtr = bVector;
  
-  __m256 aVal, bVal, cVal;
-  for(;number < quarterPoints; number++){
+    __m256 aVal, bVal, cVal;
+    for (; number < quarterPoints; number++) {
  
-    aVal = _mm256_loadu_ps((float *) aPtr);
-    bVal = _mm256_loadu_ps((float *) bPtr);
+        aVal = _mm256_loadu_ps((float*)aPtr);
+        bVal = _mm256_loadu_ps((float*)bPtr);
  
-    cVal = _mm256_add_ps(aVal, bVal);
+        cVal = _mm256_add_ps(aVal, bVal);
  
-    _mm256_storeu_ps((float *) cPtr,cVal); // Store the results back into the C container
+        _mm256_storeu_ps((float*)cPtr,
+                         cVal); // Store the results back into the C container
  
-    aPtr += 4;
-    bPtr += 4;
-    cPtr += 4;
-  }
+        aPtr += 4;
+        bPtr += 4;
+        cPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) + (*bPtr++);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
@@ -113,36 +116,38 @@ volk_32fc_x2_add_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_32fc_x2_add_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                          const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_add_32fc_a_avx(lv_32fc_t* cVector,
+                                               const lv_32fc_t* aVector,
+                                               const lv_32fc_t* bVector,
+                                               unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  lv_32fc_t* cPtr = cVector;
-  const lv_32fc_t* aPtr = aVector;
-  const lv_32fc_t* bPtr=  bVector;
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    const lv_32fc_t* bPtr = bVector;
  
-  __m256 aVal, bVal, cVal;
-  for(;number < quarterPoints; number++){
+    __m256 aVal, bVal, cVal;
+    for (; number < quarterPoints; number++) {
  
-    aVal = _mm256_load_ps((float*) aPtr);
-    bVal = _mm256_load_ps((float*) bPtr);
+        aVal = _mm256_load_ps((float*)aPtr);
+        bVal = _mm256_load_ps((float*)bPtr);
  
-    cVal = _mm256_add_ps(aVal, bVal);
+        cVal = _mm256_add_ps(aVal, bVal);
  
-    _mm256_store_ps((float*) cPtr,cVal); // Store the results back into the C container
+        _mm256_store_ps((float*)cPtr,
+                        cVal); // Store the results back into the C container
  
-    aPtr += 4;
-    bPtr += 4;
-    cPtr += 4;
-  }
+        aPtr += 4;
+        bPtr += 4;
+        cPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) + (*bPtr++);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
@@ -150,54 +155,56 @@ volk_32fc_x2_add_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_32fc_x2_add_32fc_u_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                          const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_add_32fc_u_sse(lv_32fc_t* cVector,
+                                               const lv_32fc_t* aVector,
+                                               const lv_32fc_t* bVector,
+                                               unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int halfPoints = num_points / 2;
+    unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 2;
  
-  lv_32fc_t* cPtr = cVector;
-  const lv_32fc_t* aPtr = aVector;
-  const lv_32fc_t* bPtr=  bVector;
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    const lv_32fc_t* bPtr = bVector;
  
-  __m128 aVal, bVal, cVal;
-  for(;number < halfPoints; number++){
+    __m128 aVal, bVal, cVal;
+    for (; number < halfPoints; number++) {
  
-    aVal = _mm_loadu_ps((float *) aPtr);
-    bVal = _mm_loadu_ps((float *) bPtr);
+        aVal = _mm_loadu_ps((float*)aPtr);
+        bVal = _mm_loadu_ps((float*)bPtr);
  
-    cVal = _mm_add_ps(aVal, bVal);
+        cVal = _mm_add_ps(aVal, bVal);
  
-    _mm_storeu_ps((float*) cPtr, cVal); // Store the results back into the C container
+        _mm_storeu_ps((float*)cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 2;
-    bPtr += 2;
-    cPtr += 2;
-  }
+        aPtr += 2;
+        bPtr += 2;
+        cPtr += 2;
+    }
  
-  number = halfPoints * 2;
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) + (*bPtr++);
-  }
+    number = halfPoints * 2;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_SSE */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32fc_x2_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                            const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_add_32fc_generic(lv_32fc_t* cVector,
+                                                 const lv_32fc_t* aVector,
+                                                 const lv_32fc_t* bVector,
+                                                 unsigned int num_points)
  {
-  lv_32fc_t* cPtr = cVector;
-  const lv_32fc_t* aPtr = aVector;
-  const lv_32fc_t* bPtr=  bVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *cPtr++ = (*aPtr++) + (*bPtr++);
-  }
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    const lv_32fc_t* bPtr = bVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -205,34 +212,36 @@ volk_32fc_x2_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_32fc_x2_add_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_add_32fc_a_sse(lv_32fc_t* cVector,
+                                               const lv_32fc_t* aVector,
+                                               const lv_32fc_t* bVector,
+                                               unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int halfPoints = num_points / 2;
+    unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 2;
  
-  lv_32fc_t* cPtr = cVector;
-  const lv_32fc_t* aPtr = aVector;
-  const lv_32fc_t* bPtr=  bVector;
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    const lv_32fc_t* bPtr = bVector;
  
-  __m128 aVal, bVal, cVal;
-  for(;number < halfPoints; number++){
-    aVal = _mm_load_ps((float *) aPtr);
-    bVal = _mm_load_ps((float *) bPtr);
+    __m128 aVal, bVal, cVal;
+    for (; number < halfPoints; number++) {
+        aVal = _mm_load_ps((float*)aPtr);
+        bVal = _mm_load_ps((float*)bPtr);
  
-    cVal = _mm_add_ps(aVal, bVal);
+        cVal = _mm_add_ps(aVal, bVal);
  
-    _mm_store_ps((float *) cPtr,cVal); // Store the results back into the C container
+        _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 2;
-    bPtr += 2;
-    cPtr += 2;
-  }
+        aPtr += 2;
+        bPtr += 2;
+        cPtr += 2;
+    }
  
-  number = halfPoints * 2;
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) + (*bPtr++);
-  }
+    number = halfPoints * 2;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_SSE */
  
@@ -240,38 +249,39 @@ volk_32fc_x2_add_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void
-volk_32fc_x2_add_32fc_u_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                           const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_add_32fc_u_neon(lv_32fc_t* cVector,
+                                                const lv_32fc_t* aVector,
+                                                const lv_32fc_t* bVector,
+                                                unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int halfPoints = num_points / 2;
-
-  lv_32fc_t* cPtr = cVector;
-  const lv_32fc_t* aPtr = aVector;
-  const lv_32fc_t* bPtr=  bVector;
-  float32x4_t aVal, bVal, cVal;
-  for(number=0; number < halfPoints; number++){
-    // Load in to NEON registers
-    aVal = vld1q_f32((const float32_t*)(aPtr));
-    bVal = vld1q_f32((const float32_t*)(bPtr));
-    __VOLK_PREFETCH(aPtr+2);
-    __VOLK_PREFETCH(bPtr+2);
-
-    // vector add
-    cVal = vaddq_f32(aVal, bVal);
-    // Store the results back into the C container
-    vst1q_f32((float*)(cPtr),cVal);
-
-    aPtr += 2; // q uses quadwords, 4 lv_32fc_ts per vadd
-    bPtr += 2;
-    cPtr += 2;
-  }
-
-  number = halfPoints * 2; // should be = num_points
-  for(;number < num_points; number++){
-    *cPtr++ = (*aPtr++) + (*bPtr++);
-  }
+    unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 2;
+
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    const lv_32fc_t* bPtr = bVector;
+    float32x4_t aVal, bVal, cVal;
+    for (number = 0; number < halfPoints; number++) {
+        // Load in to NEON registers
+        aVal = vld1q_f32((const float32_t*)(aPtr));
+        bVal = vld1q_f32((const float32_t*)(bPtr));
+        __VOLK_PREFETCH(aPtr + 2);
+        __VOLK_PREFETCH(bPtr + 2);
+
+        // vector add
+        cVal = vaddq_f32(aVal, bVal);
+        // Store the results back into the C container
+        vst1q_f32((float*)(cPtr), cVal);
+
+        aPtr += 2; // q uses quadwords, 4 lv_32fc_ts per vadd
+        bPtr += 2;
+        cPtr += 2;
+    }
+
+    number = halfPoints * 2; // should be = num_points
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
  }
  
  #endif /* LV_HAVE_NEON */
diff --git a/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h b/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h

index 77432ec9c7ed874d20c64875e5d382dbc53173f5..0f694994c981d70984d4f25586d5f09480c877c9 100644 (file)
--- a/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h
+++ b/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h
@@ -34,8 +34,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32fc_x2_conjugate_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points)
- * \endcode
+ * void volk_32fc_x2_conjugate_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input,
+ * const lv_32fc_t* taps, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li input: vector of complex floats.
@@ -60,40 +60,44 @@
  #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
  
  
-#include<volk/volk_complex.h>
+#include <volk/volk_complex.h>
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* result,
+                                                                const lv_32fc_t* input,
+                                                                const lv_32fc_t* taps,
+                                                                unsigned int num_points)
+{
  
-  const unsigned int num_bytes = num_points*8;
+    const unsigned int num_bytes = num_points * 8;
  
-  float * res = (float*) result;
-  float * in = (float*) input;
-  float * tp = (float*) taps;
-  unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
+    float* res = (float*)result;
+    float* in = (float*)input;
+    float* tp = (float*)taps;
+    unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
  
-  float sum0[2] = {0,0};
-  float sum1[2] = {0,0};
-  unsigned int i = 0;
+    float sum0[2] = { 0, 0 };
+    float sum1[2] = { 0, 0 };
+    unsigned int i = 0;
  
-  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
-    sum0[0] += in[0] * tp[0] + in[1] * tp[1];
-    sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
-    sum1[0] += in[2] * tp[2] + in[3] * tp[3];
-    sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
+    for (i = 0; i < n_2_ccomplex_blocks; ++i) {
+        sum0[0] += in[0] * tp[0] + in[1] * tp[1];
+        sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
+        sum1[0] += in[2] * tp[2] + in[3] * tp[3];
+        sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
  
-    in += 4;
-    tp += 4;
-  }
+        in += 4;
+        tp += 4;
+    }
  
-  res[0] = sum0[0] + sum1[0];
-  res[1] = sum0[1] + sum1[1];
+    res[0] = sum0[0] + sum1[0];
+    res[1] = sum0[1] + sum1[1];
  
-  if (num_bytes >> 3 & 1) {
-    *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
-  }
+    if (num_bytes >> 3 & 1) {
+        *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
+    }
  }
  
  #endif /*LV_HAVE_GENERIC*/
@@ -103,125 +107,134 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* resul
  #include <immintrin.h>
  
  static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_avx(lv_32fc_t* result,
-    const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points)
+                                                              const lv_32fc_t* input,
+                                                              const lv_32fc_t* taps,
+                                                              unsigned int num_points)
  {
-  // Partial sums for indices i, i+1, i+2 and i+3.
-  __m256 sum_a_mult_b_real = _mm256_setzero_ps();
-  __m256 sum_a_mult_b_imag = _mm256_setzero_ps();
-
-  for (long unsigned i = 0; i < (num_points & ~3u); i += 4) {
-    /* Four complex elements a time are processed.
-     * (ar + j⋅ai)*conj(br + j⋅bi) =
-     * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
-     */
+    // Partial sums for indices i, i+1, i+2 and i+3.
+    __m256 sum_a_mult_b_real = _mm256_setzero_ps();
+    __m256 sum_a_mult_b_imag = _mm256_setzero_ps();
+
+    for (long unsigned i = 0; i < (num_points & ~3u); i += 4) {
+        /* Four complex elements a time are processed.
+         * (ar + j⋅ai)*conj(br + j⋅bi) =
+         * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
+         */
+
+        /* Load input and taps, split and duplicate real und imaginary parts of taps.
+         * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
+         * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
+         * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
+         * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
+         */
+        __m256 a = _mm256_loadu_ps((const float*)&input[i]);
+        __m256 b = _mm256_loadu_ps((const float*)&taps[i]);
+        __m256 b_real = _mm256_moveldup_ps(b);
+        __m256 b_imag = _mm256_movehdup_ps(b);
+
+        // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
+        sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real));
+        // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
+        sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag));
+    }
  
-    /* Load input and taps, split and duplicate real und imaginary parts of taps.
-     * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
-     * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
-     * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
-     * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
+    // Swap position of −ar⋅bi and ai⋅bi.
+    sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1));
+    // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains four such partial sums.
+    __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
+    /* Sum the four partial sums: Add high half of vector sum to the low one, i.e.
+     * s1 + s3 and s0 + s2 …
       */
-    __m256 a = _mm256_loadu_ps((const float *) &input[i]);
-    __m256 b = _mm256_loadu_ps((const float *) &taps[i]);
-    __m256 b_real = _mm256_moveldup_ps(b);
-    __m256 b_imag = _mm256_movehdup_ps(b);
-
-    // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
-    sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real));
-    // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
-    sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag));
-  }
-
-  // Swap position of −ar⋅bi and ai⋅bi.
-  sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1));
-  // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains four such partial sums.
-  __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
-  /* Sum the four partial sums: Add high half of vector sum to the low one, i.e.
-   * s1 + s3 and s0 + s2 …
-   */
-  sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01));
-  // … and now (s0 + s2) + (s1 + s3)
-  sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2)));
-  // Store result.
-  __m128 lower = _mm256_extractf128_ps(sum, 0);
-  _mm_storel_pi((__m64 *) result, lower);
-
-  // Handle the last elements if num_points mod 4 is bigger than 0.
-  for (long unsigned i = num_points & ~3u; i < num_points; ++i) {
-    *result += lv_cmake(
-        lv_creal(input[i]) * lv_creal(taps[i]) + lv_cimag(input[i]) * lv_cimag(taps[i]),
-        lv_cimag(input[i]) * lv_creal(taps[i]) - lv_creal(input[i]) * lv_cimag(taps[i]));
-  }
+    sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01));
+    // … and now (s0 + s2) + (s1 + s3)
+    sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2)));
+    // Store result.
+    __m128 lower = _mm256_extractf128_ps(sum, 0);
+    _mm_storel_pi((__m64*)result, lower);
+
+    // Handle the last elements if num_points mod 4 is bigger than 0.
+    for (long unsigned i = num_points & ~3u; i < num_points; ++i) {
+        *result += lv_cmake(lv_creal(input[i]) * lv_creal(taps[i]) +
+                                lv_cimag(input[i]) * lv_cimag(taps[i]),
+                            lv_cimag(input[i]) * lv_creal(taps[i]) -
+                                lv_creal(input[i]) * lv_cimag(taps[i]));
+    }
  }
  
  #endif /* LV_HAVE_AVX */
  
  #ifdef LV_HAVE_SSE3
  
-#include <xmmintrin.h>
  #include <pmmintrin.h>
+#include <xmmintrin.h>
  
  static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result,
-    const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points)
+                                                               const lv_32fc_t* input,
+                                                               const lv_32fc_t* taps,
+                                                               unsigned int num_points)
  {
-  // Partial sums for indices i and i+1.
-  __m128 sum_a_mult_b_real = _mm_setzero_ps();
-  __m128 sum_a_mult_b_imag = _mm_setzero_ps();
-
-  for (long unsigned i = 0; i < (num_points & ~1u); i += 2) {
-    /* Two complex elements a time are processed.
-     * (ar + j⋅ai)*conj(br + j⋅bi) =
-     * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
-     */
+    // Partial sums for indices i and i+1.
+    __m128 sum_a_mult_b_real = _mm_setzero_ps();
+    __m128 sum_a_mult_b_imag = _mm_setzero_ps();
+
+    for (long unsigned i = 0; i < (num_points & ~1u); i += 2) {
+        /* Two complex elements a time are processed.
+         * (ar + j⋅ai)*conj(br + j⋅bi) =
+         * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
+         */
+
+        /* Load input and taps, split and duplicate real und imaginary parts of taps.
+         * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
+         * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
+         * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
+         * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
+         */
+        __m128 a = _mm_loadu_ps((const float*)&input[i]);
+        __m128 b = _mm_loadu_ps((const float*)&taps[i]);
+        __m128 b_real = _mm_moveldup_ps(b);
+        __m128 b_imag = _mm_movehdup_ps(b);
+
+        // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
+        sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real));
+        // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
+        sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag));
+    }
  
-    /* Load input and taps, split and duplicate real und imaginary parts of taps.
-     * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
-     * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
-     * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
-     * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
-     */
-    __m128 a = _mm_loadu_ps((const float *) &input[i]);
-    __m128 b = _mm_loadu_ps((const float *) &taps[i]);
-    __m128 b_real = _mm_moveldup_ps(b);
-    __m128 b_imag = _mm_movehdup_ps(b);
-
-    // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
-    sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real));
-    // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
-    sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag));
-  }
-
-  // Swap position of −ar⋅bi and ai⋅bi.
-  sum_a_mult_b_imag = _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag,
-      _MM_SHUFFLE(2, 3, 0, 1));
-  // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains two such partial sums.
-  __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
-  // Sum the two partial sums.
-  sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2)));
-  // Store result.
-  _mm_storel_pi((__m64 *) result, sum);
-
-  // Handle the last element if num_points mod 2 is 1.
-  if (num_points & 1u) {
-    *result += lv_cmake(
-        lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) +
-        lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]),
-        lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) -
-        lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1]));
-  }
+    // Swap position of −ar⋅bi and ai⋅bi.
+    sum_a_mult_b_imag =
+        _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1));
+    // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains two such partial sums.
+    __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
+    // Sum the two partial sums.
+    sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2)));
+    // Store result.
+    _mm_storel_pi((__m64*)result, sum);
+
+    // Handle the last element if num_points mod 2 is 1.
+    if (num_points & 1u) {
+        *result += lv_cmake(
+            lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) +
+                lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]),
+            lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) -
+                lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1]));
+    }
  }
  
  #endif /*LV_HAVE_SSE3*/
  
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result,
+                                                             const lv_32fc_t* input,
+                                                             const lv_32fc_t* taps,
+                                                             unsigned int num_points)
+{
  
      unsigned int quarter_points = num_points / 4;
      unsigned int number;
  
-    lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
-    lv_32fc_t* b_ptr = (lv_32fc_t*) input;
+    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
+    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
      // for 2-lane vectors, 1st lane holds the real part,
      // 2nd lane holds the imaginary part
      float32x4x2_t a_val, b_val, accumulator;
@@ -229,11 +242,11 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result,
      accumulator.val[0] = vdupq_n_f32(0);
      accumulator.val[1] = vdupq_n_f32(0);
  
-    for(number = 0; number < quarter_points; ++number) {
+    for (number = 0; number < quarter_points; ++number) {
          a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
          b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-        __VOLK_PREFETCH(a_ptr+8);
-        __VOLK_PREFETCH(b_ptr+8);
+        __VOLK_PREFETCH(a_ptr + 8);
+        __VOLK_PREFETCH(b_ptr + 8);
  
          // do the first multiply
          tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
@@ -255,11 +268,10 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result,
      *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
  
      // tail case
-    for(number = quarter_points*4; number < num_points; ++number) {
-      *result += (*a_ptr++) * lv_conj(*b_ptr++);
+    for (number = quarter_points * 4; number < num_points; ++number) {
+        *result += (*a_ptr++) * lv_conj(*b_ptr++);
      }
      *result = lv_conj(*result);
-
  }
  #endif /*LV_HAVE_NEON*/
  
@@ -268,120 +280,125 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result,
  #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
  #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
  
+#include <stdio.h>
  #include <volk/volk_common.h>
-#include<volk/volk_complex.h>
-#include<stdio.h>
+#include <volk/volk_complex.h>
  
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
  static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_avx(lv_32fc_t* result,
-    const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points)
+                                                              const lv_32fc_t* input,
+                                                              const lv_32fc_t* taps,
+                                                              unsigned int num_points)
  {
-  // Partial sums for indices i, i+1, i+2 and i+3.
-  __m256 sum_a_mult_b_real = _mm256_setzero_ps();
-  __m256 sum_a_mult_b_imag = _mm256_setzero_ps();
-
-  for (long unsigned i = 0; i < (num_points & ~3u); i += 4) {
-    /* Four complex elements a time are processed.
-     * (ar + j⋅ai)*conj(br + j⋅bi) =
-     * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
-     */
+    // Partial sums for indices i, i+1, i+2 and i+3.
+    __m256 sum_a_mult_b_real = _mm256_setzero_ps();
+    __m256 sum_a_mult_b_imag = _mm256_setzero_ps();
+
+    for (long unsigned i = 0; i < (num_points & ~3u); i += 4) {
+        /* Four complex elements a time are processed.
+         * (ar + j⋅ai)*conj(br + j⋅bi) =
+         * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
+         */
+
+        /* Load input and taps, split and duplicate real und imaginary parts of taps.
+         * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
+         * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
+         * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
+         * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
+         */
+        __m256 a = _mm256_load_ps((const float*)&input[i]);
+        __m256 b = _mm256_load_ps((const float*)&taps[i]);
+        __m256 b_real = _mm256_moveldup_ps(b);
+        __m256 b_imag = _mm256_movehdup_ps(b);
+
+        // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
+        sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real));
+        // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
+        sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag));
+    }
  
-    /* Load input and taps, split and duplicate real und imaginary parts of taps.
-     * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
-     * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
-     * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
-     * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
+    // Swap position of −ar⋅bi and ai⋅bi.
+    sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1));
+    // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains four such partial sums.
+    __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
+    /* Sum the four partial sums: Add high half of vector sum to the low one, i.e.
+     * s1 + s3 and s0 + s2 …
       */
-    __m256 a = _mm256_load_ps((const float *) &input[i]);
-    __m256 b = _mm256_load_ps((const float *) &taps[i]);
-    __m256 b_real = _mm256_moveldup_ps(b);
-    __m256 b_imag = _mm256_movehdup_ps(b);
-
-    // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
-    sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real));
-    // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
-    sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag));
-  }
-
-  // Swap position of −ar⋅bi and ai⋅bi.
-  sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1));
-  // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains four such partial sums.
-  __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
-  /* Sum the four partial sums: Add high half of vector sum to the low one, i.e.
-   * s1 + s3 and s0 + s2 …
-   */
-  sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01));
-  // … and now (s0 + s2) + (s1 + s3)
-  sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2)));
-  // Store result.
-  __m128 lower = _mm256_extractf128_ps(sum, 0);
-  _mm_storel_pi((__m64 *) result, lower);
-
-  // Handle the last elements if num_points mod 4 is bigger than 0.
-  for (long unsigned i = num_points & ~3u; i < num_points; ++i) {
-    *result += lv_cmake(
-        lv_creal(input[i]) * lv_creal(taps[i]) + lv_cimag(input[i]) * lv_cimag(taps[i]),
-        lv_cimag(input[i]) * lv_creal(taps[i]) - lv_creal(input[i]) * lv_cimag(taps[i]));
-  }
+    sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01));
+    // … and now (s0 + s2) + (s1 + s3)
+    sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2)));
+    // Store result.
+    __m128 lower = _mm256_extractf128_ps(sum, 0);
+    _mm_storel_pi((__m64*)result, lower);
+
+    // Handle the last elements if num_points mod 4 is bigger than 0.
+    for (long unsigned i = num_points & ~3u; i < num_points; ++i) {
+        *result += lv_cmake(lv_creal(input[i]) * lv_creal(taps[i]) +
+                                lv_cimag(input[i]) * lv_cimag(taps[i]),
+                            lv_cimag(input[i]) * lv_creal(taps[i]) -
+                                lv_creal(input[i]) * lv_cimag(taps[i]));
+    }
  }
  #endif /* LV_HAVE_AVX */
  
  #ifdef LV_HAVE_SSE3
  
-#include <xmmintrin.h>
  #include <pmmintrin.h>
+#include <xmmintrin.h>
  
  static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse3(lv_32fc_t* result,
-    const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points)
+                                                               const lv_32fc_t* input,
+                                                               const lv_32fc_t* taps,
+                                                               unsigned int num_points)
  {
-  // Partial sums for indices i and i+1.
-  __m128 sum_a_mult_b_real = _mm_setzero_ps();
-  __m128 sum_a_mult_b_imag = _mm_setzero_ps();
-
-  for (long unsigned i = 0; i < (num_points & ~1u); i += 2) {
-    /* Two complex elements a time are processed.
-     * (ar + j⋅ai)*conj(br + j⋅bi) =
-     * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
-     */
+    // Partial sums for indices i and i+1.
+    __m128 sum_a_mult_b_real = _mm_setzero_ps();
+    __m128 sum_a_mult_b_imag = _mm_setzero_ps();
+
+    for (long unsigned i = 0; i < (num_points & ~1u); i += 2) {
+        /* Two complex elements a time are processed.
+         * (ar + j⋅ai)*conj(br + j⋅bi) =
+         * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
+         */
+
+        /* Load input and taps, split and duplicate real und imaginary parts of taps.
+         * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
+         * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
+         * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
+         * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
+         */
+        __m128 a = _mm_load_ps((const float*)&input[i]);
+        __m128 b = _mm_load_ps((const float*)&taps[i]);
+        __m128 b_real = _mm_moveldup_ps(b);
+        __m128 b_imag = _mm_movehdup_ps(b);
+
+        // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
+        sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real));
+        // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
+        sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag));
+    }
  
-    /* Load input and taps, split and duplicate real und imaginary parts of taps.
-     * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
-     * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
-     * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
-     * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
-     */
-    __m128 a = _mm_load_ps((const float *) &input[i]);
-    __m128 b = _mm_load_ps((const float *) &taps[i]);
-    __m128 b_real = _mm_moveldup_ps(b);
-    __m128 b_imag = _mm_movehdup_ps(b);
-
-    // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
-    sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real));
-    // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
-    sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag));
-  }
-
-  // Swap position of −ar⋅bi and ai⋅bi.
-  sum_a_mult_b_imag = _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag,
-      _MM_SHUFFLE(2, 3, 0, 1));
-  // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains two such partial sums.
-  __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
-  // Sum the two partial sums.
-  sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2)));
-  // Store result.
-  _mm_storel_pi((__m64 *) result, sum);
-
-  // Handle the last element if num_points mod 2 is 1.
-  if (num_points & 1u) {
-    *result += lv_cmake(
-        lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) +
-        lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]),
-        lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) -
-        lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1]));
-  }
+    // Swap position of −ar⋅bi and ai⋅bi.
+    sum_a_mult_b_imag =
+        _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1));
+    // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains two such partial sums.
+    __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
+    // Sum the two partial sums.
+    sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2)));
+    // Store result.
+    _mm_storel_pi((__m64*)result, sum);
+
+    // Handle the last element if num_points mod 2 is 1.
+    if (num_points & 1u) {
+        *result += lv_cmake(
+            lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) +
+                lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]),
+            lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) -
+                lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1]));
+    }
  }
  
  #endif /*LV_HAVE_SSE3*/
@@ -390,35 +407,39 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse3(lv_32fc_t* result
  #ifdef LV_HAVE_GENERIC
  
  
-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result,
+                                                                  const lv_32fc_t* input,
+                                                                  const lv_32fc_t* taps,
+                                                                  unsigned int num_points)
+{
  
-  const unsigned int num_bytes = num_points*8;
+    const unsigned int num_bytes = num_points * 8;
  
-  float * res = (float*) result;
-  float * in = (float*) input;
-  float * tp = (float*) taps;
-  unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
+    float* res = (float*)result;
+    float* in = (float*)input;
+    float* tp = (float*)taps;
+    unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
  
-  float sum0[2] = {0,0};
-  float sum1[2] = {0,0};
-  unsigned int i = 0;
+    float sum0[2] = { 0, 0 };
+    float sum1[2] = { 0, 0 };
+    unsigned int i = 0;
  
-  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
-    sum0[0] += in[0] * tp[0] + in[1] * tp[1];
-    sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
-    sum1[0] += in[2] * tp[2] + in[3] * tp[3];
-    sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
+    for (i = 0; i < n_2_ccomplex_blocks; ++i) {
+        sum0[0] += in[0] * tp[0] + in[1] * tp[1];
+        sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
+        sum1[0] += in[2] * tp[2] + in[3] * tp[3];
+        sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
  
-    in += 4;
-    tp += 4;
-  }
+        in += 4;
+        tp += 4;
+    }
  
-  res[0] = sum0[0] + sum1[0];
-  res[1] = sum0[1] + sum1[1];
+    res[0] = sum0[0] + sum1[0];
+    res[1] = sum0[1] + sum1[1];
  
-  if (num_bytes >> 3 & 1) {
-    *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
-  }
+    if (num_bytes >> 3 & 1) {
+        *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
+    }
  }
  
  #endif /*LV_HAVE_GENERIC*/
@@ -426,256 +447,276 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* res
  
  #if LV_HAVE_SSE && LV_HAVE_64
  
-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
-
-  const unsigned int num_bytes = num_points*8;
-
-  __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
-
-  __VOLK_ASM __VOLK_VOLATILE
-    (
-     "#  ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t"
-     "#                         const float *taps, unsigned num_bytes)\n\t"
-     "#    float sum0 = 0;\n\t"
-     "#    float sum1 = 0;\n\t"
-     "#    float sum2 = 0;\n\t"
-     "#    float sum3 = 0;\n\t"
-     "#    do {\n\t"
-     "#      sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
-     "#      sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
-     "#      sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
-     "#      sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
-     "#      input += 4;\n\t"
-     "#      taps += 4;  \n\t"
-     "#    } while (--n_2_ccomplex_blocks != 0);\n\t"
-     "#    result[0] = sum0 + sum2;\n\t"
-     "#    result[1] = sum1 + sum3;\n\t"
-     "# TODO: prefetch and better scheduling\n\t"
-     "  xor    %%r9,  %%r9\n\t"
-     "  xor    %%r10, %%r10\n\t"
-     "  movq   %[conjugator], %%r9\n\t"
-     "  movq   %%rcx, %%rax\n\t"
-     "  movaps 0(%%r9), %%xmm8\n\t"
-     "  movq   %%rcx, %%r8\n\t"
-     "  movq   %[rsi],  %%r9\n\t"
-     "  movq   %[rdx], %%r10\n\t"
-     " xorps   %%xmm6, %%xmm6          # zero accumulators\n\t"
-     " movaps  0(%%r9), %%xmm0\n\t"
-     " xorps   %%xmm7, %%xmm7          # zero accumulators\n\t"
-     " movups  0(%%r10), %%xmm2\n\t"
-     " shr     $5, %%rax               # rax = n_2_ccomplex_blocks / 2\n\t"
-     "  shr     $4, %%r8\n\t"
-     "  xorps  %%xmm8, %%xmm2\n\t"
-     " jmp     .%=L1_test\n\t"
-     " # 4 taps / loop\n\t"
-     " # something like ?? cycles / loop\n\t"
-     ".%=Loop1:        \n\t"
-     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
-     "#        movaps  (%%r9), %%xmmA\n\t"
-     "#        movaps  (%%r10), %%xmmB\n\t"
-     "#        movaps  %%xmmA, %%xmmZ\n\t"
-     "#        shufps  $0xb1, %%xmmZ, %%xmmZ   # swap internals\n\t"
-     "#        mulps   %%xmmB, %%xmmA\n\t"
-     "#        mulps   %%xmmZ, %%xmmB\n\t"
-     "#        # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
-     "#        xorps   %%xmmPN, %%xmmA\n\t"
-     "#        movaps  %%xmmA, %%xmmZ\n\t"
-     "#        unpcklps %%xmmB, %%xmmA\n\t"
-     "#        unpckhps %%xmmB, %%xmmZ\n\t"
-     "#        movaps  %%xmmZ, %%xmmY\n\t"
-     "#        shufps  $0x44, %%xmmA, %%xmmZ   # b01000100\n\t"
-     "#        shufps  $0xee, %%xmmY, %%xmmA   # b11101110\n\t"
-     "#        addps   %%xmmZ, %%xmmA\n\t"
-     "#        addps   %%xmmA, %%xmmC\n\t"
-     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
-     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
-     " movaps  16(%%r9), %%xmm1\n\t"
-     " movaps  %%xmm0, %%xmm4\n\t"
-     " mulps   %%xmm2, %%xmm0\n\t"
-     " shufps  $0xb1, %%xmm4, %%xmm4   # swap internals\n\t"
-     " movaps  16(%%r10), %%xmm3\n\t"
-     " movaps  %%xmm1, %%xmm5\n\t"
-     "  xorps   %%xmm8, %%xmm3\n\t"
-     " addps   %%xmm0, %%xmm6\n\t"
-     " mulps   %%xmm3, %%xmm1\n\t"
-     " shufps  $0xb1, %%xmm5, %%xmm5   # swap internals\n\t"
-     " addps   %%xmm1, %%xmm6\n\t"
-     " mulps   %%xmm4, %%xmm2\n\t"
-     " movaps  32(%%r9), %%xmm0\n\t"
-     " addps   %%xmm2, %%xmm7\n\t"
-     " mulps   %%xmm5, %%xmm3\n\t"
-     " add     $32, %%r9\n\t"
-     " movaps  32(%%r10), %%xmm2\n\t"
-     " addps   %%xmm3, %%xmm7\n\t"
-     " add     $32, %%r10\n\t"
-     "  xorps   %%xmm8, %%xmm2\n\t"
-     ".%=L1_test:\n\t"
-     " dec     %%rax\n\t"
-     " jge     .%=Loop1\n\t"
-     " # We've handled the bulk of multiplies up to here.\n\t"
-     " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
-     " # If so, we've got 2 more taps to do.\n\t"
-     " and     $1, %%r8\n\t"
-     " je      .%=Leven\n\t"
-     " # The count was odd, do 2 more taps.\n\t"
-     " # Note that we've already got mm0/mm2 preloaded\n\t"
-     " # from the main loop.\n\t"
-     " movaps  %%xmm0, %%xmm4\n\t"
-     " mulps   %%xmm2, %%xmm0\n\t"
-     " shufps  $0xb1, %%xmm4, %%xmm4   # swap internals\n\t"
-     " addps   %%xmm0, %%xmm6\n\t"
-     " mulps   %%xmm4, %%xmm2\n\t"
-     " addps   %%xmm2, %%xmm7\n\t"
-     ".%=Leven:\n\t"
-     " # neg inversor\n\t"
-     " xorps   %%xmm1, %%xmm1\n\t"
-     " mov     $0x80000000, %%r9\n\t"
-     " movd    %%r9, %%xmm1\n\t"
-     " shufps  $0x11, %%xmm1, %%xmm1   # b00010001 # 0 -0 0 -0\n\t"
-     " # pfpnacc\n\t"
-     " xorps   %%xmm1, %%xmm6\n\t"
-     " movaps  %%xmm6, %%xmm2\n\t"
-     " unpcklps %%xmm7, %%xmm6\n\t"
-     " unpckhps %%xmm7, %%xmm2\n\t"
-     " movaps  %%xmm2, %%xmm3\n\t"
-     " shufps  $0x44, %%xmm6, %%xmm2   # b01000100\n\t"
-     " shufps  $0xee, %%xmm3, %%xmm6   # b11101110\n\t"
-     " addps   %%xmm2, %%xmm6\n\t"
-     "                                 # xmm6 = r1 i2 r3 i4\n\t"
-     " movhlps %%xmm6, %%xmm4          # xmm4 = r3 i4 ?? ??\n\t"
-     " addps   %%xmm4, %%xmm6          # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
-     " movlps  %%xmm6, (%[rdi])                # store low 2x32 bits (complex) to memory\n\t"
-     :
-     :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator)
-     :"rax", "r8", "r9", "r10"
-     );
-
-  int getem = num_bytes % 16;
-
-  for(; getem > 0; getem -= 8) {
-    *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]));
-  }
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result,
+                                                              const lv_32fc_t* input,
+                                                              const lv_32fc_t* taps,
+                                                              unsigned int num_points)
+{
+
+    const unsigned int num_bytes = num_points * 8;
+
+    __VOLK_ATTR_ALIGNED(16)
+    static const uint32_t conjugator[4] = {
+        0x00000000, 0x80000000, 0x00000000, 0x80000000
+    };
+
+    __VOLK_ASM __VOLK_VOLATILE(
+        "#  ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t"
+        "#                         const float *taps, unsigned num_bytes)\n\t"
+        "#    float sum0 = 0;\n\t"
+        "#    float sum1 = 0;\n\t"
+        "#    float sum2 = 0;\n\t"
+        "#    float sum3 = 0;\n\t"
+        "#    do {\n\t"
+        "#      sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+        "#      sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+        "#      sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+        "#      sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+        "#      input += 4;\n\t"
+        "#      taps += 4;  \n\t"
+        "#    } while (--n_2_ccomplex_blocks != 0);\n\t"
+        "#    result[0] = sum0 + sum2;\n\t"
+        "#    result[1] = sum1 + sum3;\n\t"
+        "# TODO: prefetch and better scheduling\n\t"
+        "  xor    %%r9,  %%r9\n\t"
+        "  xor    %%r10, %%r10\n\t"
+        "  movq   %[conjugator], %%r9\n\t"
+        "  movq   %%rcx, %%rax\n\t"
+        "  movaps 0(%%r9), %%xmm8\n\t"
+        "  movq   %%rcx, %%r8\n\t"
+        "  movq   %[rsi],  %%r9\n\t"
+        "  movq   %[rdx], %%r10\n\t"
+        "      xorps   %%xmm6, %%xmm6          # zero accumulators\n\t"
+        "      movaps  0(%%r9), %%xmm0\n\t"
+        "      xorps   %%xmm7, %%xmm7          # zero accumulators\n\t"
+        "      movups  0(%%r10), %%xmm2\n\t"
+        "      shr     $5, %%rax               # rax = n_2_ccomplex_blocks / 2\n\t"
+        "  shr     $4, %%r8\n\t"
+        "  xorps  %%xmm8, %%xmm2\n\t"
+        "      jmp     .%=L1_test\n\t"
+        "      # 4 taps / loop\n\t"
+        "      # something like ?? cycles / loop\n\t"
+        ".%=Loop1:     \n\t"
+        "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+        "#     movaps  (%%r9), %%xmmA\n\t"
+        "#     movaps  (%%r10), %%xmmB\n\t"
+        "#     movaps  %%xmmA, %%xmmZ\n\t"
+        "#     shufps  $0xb1, %%xmmZ, %%xmmZ   # swap internals\n\t"
+        "#     mulps   %%xmmB, %%xmmA\n\t"
+        "#     mulps   %%xmmZ, %%xmmB\n\t"
+        "#     # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+        "#     xorps   %%xmmPN, %%xmmA\n\t"
+        "#     movaps  %%xmmA, %%xmmZ\n\t"
+        "#     unpcklps %%xmmB, %%xmmA\n\t"
+        "#     unpckhps %%xmmB, %%xmmZ\n\t"
+        "#     movaps  %%xmmZ, %%xmmY\n\t"
+        "#     shufps  $0x44, %%xmmA, %%xmmZ   # b01000100\n\t"
+        "#     shufps  $0xee, %%xmmY, %%xmmA   # b11101110\n\t"
+        "#     addps   %%xmmZ, %%xmmA\n\t"
+        "#     addps   %%xmmA, %%xmmC\n\t"
+        "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+        "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+        "      movaps  16(%%r9), %%xmm1\n\t"
+        "      movaps  %%xmm0, %%xmm4\n\t"
+        "      mulps   %%xmm2, %%xmm0\n\t"
+        "      shufps  $0xb1, %%xmm4, %%xmm4   # swap internals\n\t"
+        "      movaps  16(%%r10), %%xmm3\n\t"
+        "      movaps  %%xmm1, %%xmm5\n\t"
+        "  xorps   %%xmm8, %%xmm3\n\t"
+        "      addps   %%xmm0, %%xmm6\n\t"
+        "      mulps   %%xmm3, %%xmm1\n\t"
+        "      shufps  $0xb1, %%xmm5, %%xmm5   # swap internals\n\t"
+        "      addps   %%xmm1, %%xmm6\n\t"
+        "      mulps   %%xmm4, %%xmm2\n\t"
+        "      movaps  32(%%r9), %%xmm0\n\t"
+        "      addps   %%xmm2, %%xmm7\n\t"
+        "      mulps   %%xmm5, %%xmm3\n\t"
+        "      add     $32, %%r9\n\t"
+        "      movaps  32(%%r10), %%xmm2\n\t"
+        "      addps   %%xmm3, %%xmm7\n\t"
+        "      add     $32, %%r10\n\t"
+        "  xorps   %%xmm8, %%xmm2\n\t"
+        ".%=L1_test:\n\t"
+        "      dec     %%rax\n\t"
+        "      jge     .%=Loop1\n\t"
+        "      # We've handled the bulk of multiplies up to here.\n\t"
+        "      # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+        "      # If so, we've got 2 more taps to do.\n\t"
+        "      and     $1, %%r8\n\t"
+        "      je      .%=Leven\n\t"
+        "      # The count was odd, do 2 more taps.\n\t"
+        "      # Note that we've already got mm0/mm2 preloaded\n\t"
+        "      # from the main loop.\n\t"
+        "      movaps  %%xmm0, %%xmm4\n\t"
+        "      mulps   %%xmm2, %%xmm0\n\t"
+        "      shufps  $0xb1, %%xmm4, %%xmm4   # swap internals\n\t"
+        "      addps   %%xmm0, %%xmm6\n\t"
+        "      mulps   %%xmm4, %%xmm2\n\t"
+        "      addps   %%xmm2, %%xmm7\n\t"
+        ".%=Leven:\n\t"
+        "      # neg inversor\n\t"
+        "      xorps   %%xmm1, %%xmm1\n\t"
+        "      mov     $0x80000000, %%r9\n\t"
+        "      movd    %%r9, %%xmm1\n\t"
+        "      shufps  $0x11, %%xmm1, %%xmm1   # b00010001 # 0 -0 0 -0\n\t"
+        "      # pfpnacc\n\t"
+        "      xorps   %%xmm1, %%xmm6\n\t"
+        "      movaps  %%xmm6, %%xmm2\n\t"
+        "      unpcklps %%xmm7, %%xmm6\n\t"
+        "      unpckhps %%xmm7, %%xmm2\n\t"
+        "      movaps  %%xmm2, %%xmm3\n\t"
+        "      shufps  $0x44, %%xmm6, %%xmm2   # b01000100\n\t"
+        "      shufps  $0xee, %%xmm3, %%xmm6   # b11101110\n\t"
+        "      addps   %%xmm2, %%xmm6\n\t"
+        "                                      # xmm6 = r1 i2 r3 i4\n\t"
+        "      movhlps %%xmm6, %%xmm4          # xmm4 = r3 i4 ?? ??\n\t"
+        "      addps   %%xmm4, %%xmm6          # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+        "      movlps  %%xmm6, (%[rdi])                # store low 2x32 bits (complex) "
+        "to memory\n\t"
+        :
+        : [rsi] "r"(input),
+          [rdx] "r"(taps),
+          "c"(num_bytes),
+          [rdi] "r"(result),
+          [conjugator] "r"(conjugator)
+        : "rax", "r8", "r9", "r10");
+
+    int getem = num_bytes % 16;
+
+    for (; getem > 0; getem -= 8) {
+        *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]));
+    }
  }
  #endif
  
  #if LV_HAVE_SSE && LV_HAVE_32
-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
-
-  const unsigned int num_bytes = num_points*8;
-
-  __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
-
-  int bound = num_bytes >> 4;
-  int leftovers = num_bytes % 16;
-
-  __VOLK_ASM __VOLK_VOLATILE
-    (
-     " #pushl  %%ebp\n\t"
-     " #movl   %%esp, %%ebp\n\t"
-     " #movl   12(%%ebp), %%eax                # input\n\t"
-     " #movl   16(%%ebp), %%edx                # taps\n\t"
-     " #movl   20(%%ebp), %%ecx                # n_bytes\n\t"
-     "  movaps  0(%[conjugator]), %%xmm1\n\t"
-     " xorps   %%xmm6, %%xmm6          # zero accumulators\n\t"
-     " movaps  0(%[eax]), %%xmm0\n\t"
-     " xorps   %%xmm7, %%xmm7          # zero accumulators\n\t"
-     " movaps  0(%[edx]), %%xmm2\n\t"
-     "  movl    %[ecx], (%[out])\n\t"
-     " shrl    $5, %[ecx]              # ecx = n_2_ccomplex_blocks / 2\n\t"
-
-     "  xorps   %%xmm1, %%xmm2\n\t"
-     " jmp     .%=L1_test\n\t"
-     " # 4 taps / loop\n\t"
-     " # something like ?? cycles / loop\n\t"
-     ".%=Loop1:        \n\t"
-     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
-     "#        movaps  (%[eax]), %%xmmA\n\t"
-     "#        movaps  (%[edx]), %%xmmB\n\t"
-     "#        movaps  %%xmmA, %%xmmZ\n\t"
-     "#        shufps  $0xb1, %%xmmZ, %%xmmZ   # swap internals\n\t"
-     "#        mulps   %%xmmB, %%xmmA\n\t"
-     "#        mulps   %%xmmZ, %%xmmB\n\t"
-     "#        # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
-     "#        xorps   %%xmmPN, %%xmmA\n\t"
-     "#        movaps  %%xmmA, %%xmmZ\n\t"
-     "#        unpcklps %%xmmB, %%xmmA\n\t"
-     "#        unpckhps %%xmmB, %%xmmZ\n\t"
-     "#        movaps  %%xmmZ, %%xmmY\n\t"
-     "#        shufps  $0x44, %%xmmA, %%xmmZ   # b01000100\n\t"
-     "#        shufps  $0xee, %%xmmY, %%xmmA   # b11101110\n\t"
-     "#        addps   %%xmmZ, %%xmmA\n\t"
-     "#        addps   %%xmmA, %%xmmC\n\t"
-     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
-     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
-     " movaps  16(%[edx]), %%xmm3\n\t"
-     " movaps  %%xmm0, %%xmm4\n\t"
-     "  xorps   %%xmm1, %%xmm3\n\t"
-     " mulps   %%xmm2, %%xmm0\n\t"
-     " movaps  16(%[eax]), %%xmm1\n\t"
-     " shufps  $0xb1, %%xmm4, %%xmm4   # swap internals\n\t"
-     " movaps  %%xmm1, %%xmm5\n\t"
-     " addps   %%xmm0, %%xmm6\n\t"
-     " mulps   %%xmm3, %%xmm1\n\t"
-     " shufps  $0xb1, %%xmm5, %%xmm5   # swap internals\n\t"
-     " addps   %%xmm1, %%xmm6\n\t"
-     "  movaps  0(%[conjugator]), %%xmm1\n\t"
-     " mulps   %%xmm4, %%xmm2\n\t"
-     " movaps  32(%[eax]), %%xmm0\n\t"
-     " addps   %%xmm2, %%xmm7\n\t"
-     " mulps   %%xmm5, %%xmm3\n\t"
-     " addl    $32, %[eax]\n\t"
-     " movaps  32(%[edx]), %%xmm2\n\t"
-     " addps   %%xmm3, %%xmm7\n\t"
-     "  xorps   %%xmm1, %%xmm2\n\t"
-     " addl    $32, %[edx]\n\t"
-     ".%=L1_test:\n\t"
-     " decl    %[ecx]\n\t"
-     " jge     .%=Loop1\n\t"
-     " # We've handled the bulk of multiplies up to here.\n\t"
-     " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
-     " # If so, we've got 2 more taps to do.\n\t"
-     " movl    0(%[out]), %[ecx]               # n_2_ccomplex_blocks\n\t"
-     "  shrl    $4, %[ecx]\n\t"
-     " andl    $1, %[ecx]\n\t"
-     " je      .%=Leven\n\t"
-     " # The count was odd, do 2 more taps.\n\t"
-     " # Note that we've already got mm0/mm2 preloaded\n\t"
-     " # from the main loop.\n\t"
-     " movaps  %%xmm0, %%xmm4\n\t"
-     " mulps   %%xmm2, %%xmm0\n\t"
-     " shufps  $0xb1, %%xmm4, %%xmm4   # swap internals\n\t"
-     " addps   %%xmm0, %%xmm6\n\t"
-     " mulps   %%xmm4, %%xmm2\n\t"
-     " addps   %%xmm2, %%xmm7\n\t"
-     ".%=Leven:\n\t"
-     " # neg inversor\n\t"
-     "  #movl 8(%%ebp), %[eax] \n\t"
-     " xorps   %%xmm1, %%xmm1\n\t"
-     "  movl   $0x80000000, (%[out])\n\t"
-     " movss   (%[out]), %%xmm1\n\t"
-     " shufps  $0x11, %%xmm1, %%xmm1   # b00010001 # 0 -0 0 -0\n\t"
-     " # pfpnacc\n\t"
-     " xorps   %%xmm1, %%xmm6\n\t"
-     " movaps  %%xmm6, %%xmm2\n\t"
-     " unpcklps %%xmm7, %%xmm6\n\t"
-     " unpckhps %%xmm7, %%xmm2\n\t"
-     " movaps  %%xmm2, %%xmm3\n\t"
-     " shufps  $0x44, %%xmm6, %%xmm2   # b01000100\n\t"
-     " shufps  $0xee, %%xmm3, %%xmm6   # b11101110\n\t"
-     " addps   %%xmm2, %%xmm6\n\t"
-     "                                 # xmm6 = r1 i2 r3 i4\n\t"
-     " #movl   8(%%ebp), %[eax]                # @result\n\t"
-     " movhlps %%xmm6, %%xmm4          # xmm4 = r3 i4 ?? ??\n\t"
-     " addps   %%xmm4, %%xmm6          # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
-     " movlps  %%xmm6, (%[out])                # store low 2x32 bits (complex) to memory\n\t"
-     " #popl   %%ebp\n\t"
-     :
-     : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator)
-     );
-
-  for(; leftovers > 0; leftovers -= 8) {
-    *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)]));
-  }
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result,
+                                                                 const lv_32fc_t* input,
+                                                                 const lv_32fc_t* taps,
+                                                                 unsigned int num_points)
+{
+
+    const unsigned int num_bytes = num_points * 8;
+
+    __VOLK_ATTR_ALIGNED(16)
+    static const uint32_t conjugator[4] = {
+        0x00000000, 0x80000000, 0x00000000, 0x80000000
+    };
+
+    int bound = num_bytes >> 4;
+    int leftovers = num_bytes % 16;
+
+    __VOLK_ASM __VOLK_VOLATILE(
+        "      #pushl  %%ebp\n\t"
+        "      #movl   %%esp, %%ebp\n\t"
+        "      #movl   12(%%ebp), %%eax                # input\n\t"
+        "      #movl   16(%%ebp), %%edx                # taps\n\t"
+        "      #movl   20(%%ebp), %%ecx                # n_bytes\n\t"
+        "  movaps  0(%[conjugator]), %%xmm1\n\t"
+        "      xorps   %%xmm6, %%xmm6          # zero accumulators\n\t"
+        "      movaps  0(%[eax]), %%xmm0\n\t"
+        "      xorps   %%xmm7, %%xmm7          # zero accumulators\n\t"
+        "      movaps  0(%[edx]), %%xmm2\n\t"
+        "  movl    %[ecx], (%[out])\n\t"
+        "      shrl    $5, %[ecx]              # ecx = n_2_ccomplex_blocks / 2\n\t"
+
+        "  xorps   %%xmm1, %%xmm2\n\t"
+        "      jmp     .%=L1_test\n\t"
+        "      # 4 taps / loop\n\t"
+        "      # something like ?? cycles / loop\n\t"
+        ".%=Loop1:     \n\t"
+        "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+        "#     movaps  (%[eax]), %%xmmA\n\t"
+        "#     movaps  (%[edx]), %%xmmB\n\t"
+        "#     movaps  %%xmmA, %%xmmZ\n\t"
+        "#     shufps  $0xb1, %%xmmZ, %%xmmZ   # swap internals\n\t"
+        "#     mulps   %%xmmB, %%xmmA\n\t"
+        "#     mulps   %%xmmZ, %%xmmB\n\t"
+        "#     # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+        "#     xorps   %%xmmPN, %%xmmA\n\t"
+        "#     movaps  %%xmmA, %%xmmZ\n\t"
+        "#     unpcklps %%xmmB, %%xmmA\n\t"
+        "#     unpckhps %%xmmB, %%xmmZ\n\t"
+        "#     movaps  %%xmmZ, %%xmmY\n\t"
+        "#     shufps  $0x44, %%xmmA, %%xmmZ   # b01000100\n\t"
+        "#     shufps  $0xee, %%xmmY, %%xmmA   # b11101110\n\t"
+        "#     addps   %%xmmZ, %%xmmA\n\t"
+        "#     addps   %%xmmA, %%xmmC\n\t"
+        "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+        "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+        "      movaps  16(%[edx]), %%xmm3\n\t"
+        "      movaps  %%xmm0, %%xmm4\n\t"
+        "  xorps   %%xmm1, %%xmm3\n\t"
+        "      mulps   %%xmm2, %%xmm0\n\t"
+        "      movaps  16(%[eax]), %%xmm1\n\t"
+        "      shufps  $0xb1, %%xmm4, %%xmm4   # swap internals\n\t"
+        "      movaps  %%xmm1, %%xmm5\n\t"
+        "      addps   %%xmm0, %%xmm6\n\t"
+        "      mulps   %%xmm3, %%xmm1\n\t"
+        "      shufps  $0xb1, %%xmm5, %%xmm5   # swap internals\n\t"
+        "      addps   %%xmm1, %%xmm6\n\t"
+        "  movaps  0(%[conjugator]), %%xmm1\n\t"
+        "      mulps   %%xmm4, %%xmm2\n\t"
+        "      movaps  32(%[eax]), %%xmm0\n\t"
+        "      addps   %%xmm2, %%xmm7\n\t"
+        "      mulps   %%xmm5, %%xmm3\n\t"
+        "      addl    $32, %[eax]\n\t"
+        "      movaps  32(%[edx]), %%xmm2\n\t"
+        "      addps   %%xmm3, %%xmm7\n\t"
+        "  xorps   %%xmm1, %%xmm2\n\t"
+        "      addl    $32, %[edx]\n\t"
+        ".%=L1_test:\n\t"
+        "      decl    %[ecx]\n\t"
+        "      jge     .%=Loop1\n\t"
+        "      # We've handled the bulk of multiplies up to here.\n\t"
+        "      # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+        "      # If so, we've got 2 more taps to do.\n\t"
+        "      movl    0(%[out]), %[ecx]               # n_2_ccomplex_blocks\n\t"
+        "  shrl    $4, %[ecx]\n\t"
+        "      andl    $1, %[ecx]\n\t"
+        "      je      .%=Leven\n\t"
+        "      # The count was odd, do 2 more taps.\n\t"
+        "      # Note that we've already got mm0/mm2 preloaded\n\t"
+        "      # from the main loop.\n\t"
+        "      movaps  %%xmm0, %%xmm4\n\t"
+        "      mulps   %%xmm2, %%xmm0\n\t"
+        "      shufps  $0xb1, %%xmm4, %%xmm4   # swap internals\n\t"
+        "      addps   %%xmm0, %%xmm6\n\t"
+        "      mulps   %%xmm4, %%xmm2\n\t"
+        "      addps   %%xmm2, %%xmm7\n\t"
+        ".%=Leven:\n\t"
+        "      # neg inversor\n\t"
+        "  #movl 8(%%ebp), %[eax] \n\t"
+        "      xorps   %%xmm1, %%xmm1\n\t"
+        "  movl        $0x80000000, (%[out])\n\t"
+        "      movss   (%[out]), %%xmm1\n\t"
+        "      shufps  $0x11, %%xmm1, %%xmm1   # b00010001 # 0 -0 0 -0\n\t"
+        "      # pfpnacc\n\t"
+        "      xorps   %%xmm1, %%xmm6\n\t"
+        "      movaps  %%xmm6, %%xmm2\n\t"
+        "      unpcklps %%xmm7, %%xmm6\n\t"
+        "      unpckhps %%xmm7, %%xmm2\n\t"
+        "      movaps  %%xmm2, %%xmm3\n\t"
+        "      shufps  $0x44, %%xmm6, %%xmm2   # b01000100\n\t"
+        "      shufps  $0xee, %%xmm3, %%xmm6   # b11101110\n\t"
+        "      addps   %%xmm2, %%xmm6\n\t"
+        "                                      # xmm6 = r1 i2 r3 i4\n\t"
+        "      #movl   8(%%ebp), %[eax]                # @result\n\t"
+        "      movhlps %%xmm6, %%xmm4          # xmm4 = r3 i4 ?? ??\n\t"
+        "      addps   %%xmm4, %%xmm6          # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+        "      movlps  %%xmm6, (%[out])                # store low 2x32 bits (complex) "
+        "to memory\n\t"
+        "      #popl   %%ebp\n\t"
+        :
+        : [eax] "r"(input),
+          [edx] "r"(taps),
+          [ecx] "r"(num_bytes),
+          [out] "r"(result),
+          [conjugator] "r"(conjugator));
+
+    for (; leftovers > 0; leftovers -= 8) {
+        *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)]));
+    }
  }
  #endif /*LV_HAVE_SSE*/
  
diff --git a/kernels/volk/volk_32fc_x2_divide_32fc.h b/kernels/volk/volk_32fc_x2_divide_32fc.h

index 3ce6edea94ef8eabffa54cc34931ffbec73779e9..78c245aae16715195d8f377f7e6b20cbbbee07ae 100644 (file)
--- a/kernels/volk/volk_32fc_x2_divide_32fc.h
+++ b/kernels/volk/volk_32fc_x2_divide_32fc.h
@@ -29,8 +29,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32fc_x2_divide_32fc(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector, const lv_32fc_t* denumeratorVector, unsigned int num_points);
- * \endcode
+ * void volk_32fc_x2_divide_32fc(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
+ * const lv_32fc_t* denumeratorVector, unsigned int num_points); \endcode
   *
   * \b Inputs
   * \li numeratorVector: The numerator complex values.
@@ -41,7 +41,8 @@
   * \li outputVector: The output vector complex floats.
   *
   * \b Example
- * divide a complex vector by itself, demonstrating the result should be pretty close to 1+0j.
+ * divide a complex vector by itself, demonstrating the result should be pretty close to
+ * 1+0j.
   *
   * \code
   *   int N = 10;
@@ -71,17 +72,18 @@
  #ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H
  #define INCLUDED_volk_32fc_x2_divide_32fc_u_H
  
+#include <float.h>
  #include <inttypes.h>
  #include <volk/volk_complex.h>
-#include <float.h>
  
  #ifdef LV_HAVE_SSE3
  #include <pmmintrin.h>
  #include <volk/volk_sse3_intrinsics.h>
  
-static inline void
-volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
-                                            const lv_32fc_t* denumeratorVector, unsigned int num_points)
+static inline void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector,
+                                                   const lv_32fc_t* numeratorVector,
+                                                   const lv_32fc_t* denumeratorVector,
+                                                   unsigned int num_points)
  {
      /*
       * we'll do the "classical"
@@ -89,44 +91,46 @@ volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVe
       * --- = -------
       *  b     |b|^2
       * */
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  __m128 num01, num23, den01, den23, norm, result;
-  lv_32fc_t* c = cVector;
-  const lv_32fc_t* a = numeratorVector;
-  const lv_32fc_t* b = denumeratorVector;
-
-  for(; number < quarterPoints; number++){
-    num01 = _mm_loadu_ps((float*) a);    // first pair
-    den01 = _mm_loadu_ps((float*) b);    // first pair
-    num01 = _mm_complexconjugatemul_ps(num01, den01);   // a conj(b)
-    a += 2;
-    b += 2;
-
-    num23 = _mm_loadu_ps((float*) a);    // second pair
-    den23 = _mm_loadu_ps((float*) b);    // second pair
-    num23 = _mm_complexconjugatemul_ps(num23, den23);   // a conj(b)
-    a += 2;
-    b += 2;
-
-    norm = _mm_magnitudesquared_ps_sse3(den01, den23);
-    den01 = _mm_unpacklo_ps(norm,norm);
-    den23 = _mm_unpackhi_ps(norm,norm);
-
-    result = _mm_div_ps(num01, den01);
-    _mm_storeu_ps((float*) c, result); // Store the results back into the C container
-    c += 2;
-    result = _mm_div_ps(num23, den23);
-    _mm_storeu_ps((float*) c, result); // Store the results back into the C container
-    c += 2;
-  }
-
-  number *= 4;
-  for(;number < num_points; number++){
-    *c = (*a) / (*b);
-    a++; b++; c++;
-  }
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    __m128 num01, num23, den01, den23, norm, result;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = numeratorVector;
+    const lv_32fc_t* b = denumeratorVector;
+
+    for (; number < quarterPoints; number++) {
+        num01 = _mm_loadu_ps((float*)a);                  // first pair
+        den01 = _mm_loadu_ps((float*)b);                  // first pair
+        num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
+        a += 2;
+        b += 2;
+
+        num23 = _mm_loadu_ps((float*)a);                  // second pair
+        den23 = _mm_loadu_ps((float*)b);                  // second pair
+        num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
+        a += 2;
+        b += 2;
+
+        norm = _mm_magnitudesquared_ps_sse3(den01, den23);
+        den01 = _mm_unpacklo_ps(norm, norm);
+        den23 = _mm_unpackhi_ps(norm, norm);
+
+        result = _mm_div_ps(num01, den01);
+        _mm_storeu_ps((float*)c, result); // Store the results back into the C container
+        c += 2;
+        result = _mm_div_ps(num23, den23);
+        _mm_storeu_ps((float*)c, result); // Store the results back into the C container
+        c += 2;
+    }
+
+    number *= 4;
+    for (; number < num_points; number++) {
+        *c = (*a) / (*b);
+        a++;
+        b++;
+        c++;
+    }
  }
  #endif /* LV_HAVE_SSE3 */
  
@@ -135,9 +139,10 @@ volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVe
  #include <immintrin.h>
  #include <volk/volk_avx_intrinsics.h>
  
-static inline void
-volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
-                                            const lv_32fc_t* denumeratorVector, unsigned int num_points)
+static inline void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector,
+                                                  const lv_32fc_t* numeratorVector,
+                                                  const lv_32fc_t* denumeratorVector,
+                                                  unsigned int num_points)
  {
      /*
       * we'll do the "classical"
@@ -153,17 +158,21 @@ volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVec
      const lv_32fc_t* a = numeratorVector;
      const lv_32fc_t* b = denumeratorVector;
  
-    for(; number < quarterPoints; number++){
-        num = _mm256_loadu_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
-        denum = _mm256_loadu_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+    for (; number < quarterPoints; number++) {
+        num = _mm256_loadu_ps(
+            (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+        denum = _mm256_loadu_ps(
+            (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
          mul_conj = _mm256_complexconjugatemul_ps(num, denum);
          sq = _mm256_mul_ps(denum, denum); // Square the values
-        mag_sq_un = _mm256_hadd_ps(sq,sq); // obtain the actual squared magnitude, although out of order
+        mag_sq_un = _mm256_hadd_ps(
+            sq, sq); // obtain the actual squared magnitude, although out of order
          mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them
-        // best guide I found on using these functions: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
-        div = _mm256_div_ps(mul_conj,mag_sq);
+        // best guide I found on using these functions:
+        // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
+        div = _mm256_div_ps(mul_conj, mag_sq);
  
-        _mm256_storeu_ps((float*) c, div); // Store the results back into the C container
+        _mm256_storeu_ps((float*)c, div); // Store the results back into the C container
  
          a += 4;
          b += 4;
@@ -172,51 +181,51 @@ volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVec
  
      number = quarterPoints * 4;
  
-    for(; number < num_points; number++){
+    for (; number < num_points; number++) {
          *c++ = (*a++) / (*b++);
      }
-
  }
  #endif /* LV_HAVE_AVX */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                                             const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector,
+                                                    const lv_32fc_t* aVector,
+                                                    const lv_32fc_t* bVector,
+                                                    unsigned int num_points)
  {
-  lv_32fc_t* cPtr = cVector;
-  const lv_32fc_t* aPtr = aVector;
-  const lv_32fc_t* bPtr=  bVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *cPtr++ = (*aPtr++) / (*bPtr++);
-  }
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    const lv_32fc_t* bPtr = bVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) / (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  
-
  #endif /* INCLUDED_volk_32fc_x2_divide_32fc_u_H */
  
  
  #ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H
  #define INCLUDED_volk_32fc_x2_divide_32fc_a_H
  
+#include <float.h>
  #include <inttypes.h>
  #include <stdio.h>
  #include <volk/volk_complex.h>
-#include <float.h>
  
  #ifdef LV_HAVE_SSE3
  #include <pmmintrin.h>
  #include <volk/volk_sse3_intrinsics.h>
  
-static inline void
-volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
-                                            const lv_32fc_t* denumeratorVector, unsigned int num_points)
+static inline void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector,
+                                                   const lv_32fc_t* numeratorVector,
+                                                   const lv_32fc_t* denumeratorVector,
+                                                   unsigned int num_points)
  {
      /*
       * we'll do the "classical"
@@ -224,45 +233,47 @@ volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVe
       * --- = -------
       *  b     |b|^2
       * */
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  __m128 num01, num23, den01, den23, norm, result;
-  lv_32fc_t* c = cVector;
-  const lv_32fc_t* a = numeratorVector;
-  const lv_32fc_t* b = denumeratorVector;
-
-  for(; number < quarterPoints; number++){
-    num01 = _mm_load_ps((float*) a);    // first pair
-    den01 = _mm_load_ps((float*) b);    // first pair
-    num01 = _mm_complexconjugatemul_ps(num01, den01);   // a conj(b)
-    a += 2;
-    b += 2;
-
-    num23 = _mm_load_ps((float*) a);    // second pair
-    den23 = _mm_load_ps((float*) b);    // second pair
-    num23 = _mm_complexconjugatemul_ps(num23, den23);   // a conj(b)
-    a += 2;
-    b += 2;
-
-    norm = _mm_magnitudesquared_ps_sse3(den01, den23);
-
-    den01 = _mm_unpacklo_ps(norm,norm); // select the lower floats twice
-    den23 = _mm_unpackhi_ps(norm,norm); // select the upper floats twice
-
-    result = _mm_div_ps(num01, den01);
-    _mm_store_ps((float*) c, result); // Store the results back into the C container
-    c += 2;
-    result = _mm_div_ps(num23, den23);
-    _mm_store_ps((float*) c, result); // Store the results back into the C container
-    c += 2;
-  }
-
-  number *= 4;
-  for(;number < num_points; number++){
-    *c = (*a) / (*b);
-    a++; b++; c++;
-  }
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    __m128 num01, num23, den01, den23, norm, result;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = numeratorVector;
+    const lv_32fc_t* b = denumeratorVector;
+
+    for (; number < quarterPoints; number++) {
+        num01 = _mm_load_ps((float*)a);                   // first pair
+        den01 = _mm_load_ps((float*)b);                   // first pair
+        num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
+        a += 2;
+        b += 2;
+
+        num23 = _mm_load_ps((float*)a);                   // second pair
+        den23 = _mm_load_ps((float*)b);                   // second pair
+        num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
+        a += 2;
+        b += 2;
+
+        norm = _mm_magnitudesquared_ps_sse3(den01, den23);
+
+        den01 = _mm_unpacklo_ps(norm, norm); // select the lower floats twice
+        den23 = _mm_unpackhi_ps(norm, norm); // select the upper floats twice
+
+        result = _mm_div_ps(num01, den01);
+        _mm_store_ps((float*)c, result); // Store the results back into the C container
+        c += 2;
+        result = _mm_div_ps(num23, den23);
+        _mm_store_ps((float*)c, result); // Store the results back into the C container
+        c += 2;
+    }
+
+    number *= 4;
+    for (; number < num_points; number++) {
+        *c = (*a) / (*b);
+        a++;
+        b++;
+        c++;
+    }
  }
  #endif /* LV_HAVE_SSE */
  
@@ -270,9 +281,10 @@ volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVe
  #include <immintrin.h>
  #include <volk/volk_avx_intrinsics.h>
  
-static inline void
-volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
-                                            const lv_32fc_t* denumeratorVector, unsigned int num_points)
+static inline void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector,
+                                                  const lv_32fc_t* numeratorVector,
+                                                  const lv_32fc_t* denumeratorVector,
+                                                  unsigned int num_points)
  {
      /*
       * we'll do the "classical"
@@ -288,17 +300,21 @@ volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVec
      const lv_32fc_t* a = numeratorVector;
      const lv_32fc_t* b = denumeratorVector;
  
-    for(; number < quarterPoints; number++){
-        num = _mm256_load_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
-        denum = _mm256_load_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+    for (; number < quarterPoints; number++) {
+        num =
+            _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+        denum =
+            _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
          mul_conj = _mm256_complexconjugatemul_ps(num, denum);
          sq = _mm256_mul_ps(denum, denum); // Square the values
-        mag_sq_un = _mm256_hadd_ps(sq,sq); // obtain the actual squared magnitude, although out of order
+        mag_sq_un = _mm256_hadd_ps(
+            sq, sq); // obtain the actual squared magnitude, although out of order
          mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them
-        // best guide I found on using these functions: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
-        div = _mm256_div_ps(mul_conj,mag_sq);
+        // best guide I found on using these functions:
+        // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
+        div = _mm256_div_ps(mul_conj, mag_sq);
  
-        _mm256_store_ps((float*) c, div); // Store the results back into the C container
+        _mm256_store_ps((float*)c, div); // Store the results back into the C container
  
          a += 4;
          b += 4;
@@ -307,78 +323,78 @@ volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVec
  
      number = quarterPoints * 4;
  
-    for(; number < num_points; number++){
+    for (; number < num_points; number++) {
          *c++ = (*a++) / (*b++);
      }
-
-
  }
  #endif /* LV_HAVE_AVX */
  
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void
-volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                             const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector,
+                                                 const lv_32fc_t* aVector,
+                                                 const lv_32fc_t* bVector,
+                                                 unsigned int num_points)
  {
-  lv_32fc_t* cPtr = cVector;
-  const lv_32fc_t* aPtr = aVector;
-  const lv_32fc_t* bPtr = bVector;
-
-  float32x4x2_t aVal, bVal, cVal;
-  float32x4_t bAbs, bAbsInv;
-
-  const unsigned int quarterPoints = num_points / 4;
-  unsigned int number = 0;
-  for(; number < quarterPoints; number++){
-    aVal = vld2q_f32((const float*)(aPtr));
-    bVal = vld2q_f32((const float*)(bPtr));
-    aPtr += 4;
-    bPtr += 4;
-    __VOLK_PREFETCH(aPtr+4);
-    __VOLK_PREFETCH(bPtr+4);
-
-    bAbs = vmulq_f32(      bVal.val[0], bVal.val[0]);
-    bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]);
-
-    bAbsInv = vrecpeq_f32(bAbs);
-    bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
-    bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
-
-    cVal.val[0] = vmulq_f32(             aVal.val[0], bVal.val[0]);
-    cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]);
-    cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv);
-
-    cVal.val[1] = vmulq_f32(             aVal.val[1], bVal.val[0]);
-    cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]);
-    cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv);
-
-    vst2q_f32((float*)(cPtr), cVal);
-    cPtr += 4;
-  }
-
-  for(number = quarterPoints * 4; number < num_points; number++){
-    *cPtr++ = (*aPtr++) / (*bPtr++);
-  }
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    const lv_32fc_t* bPtr = bVector;
+
+    float32x4x2_t aVal, bVal, cVal;
+    float32x4_t bAbs, bAbsInv;
+
+    const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    for (; number < quarterPoints; number++) {
+        aVal = vld2q_f32((const float*)(aPtr));
+        bVal = vld2q_f32((const float*)(bPtr));
+        aPtr += 4;
+        bPtr += 4;
+        __VOLK_PREFETCH(aPtr + 4);
+        __VOLK_PREFETCH(bPtr + 4);
+
+        bAbs = vmulq_f32(bVal.val[0], bVal.val[0]);
+        bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]);
+
+        bAbsInv = vrecpeq_f32(bAbs);
+        bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
+        bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
+
+        cVal.val[0] = vmulq_f32(aVal.val[0], bVal.val[0]);
+        cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]);
+        cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv);
+
+        cVal.val[1] = vmulq_f32(aVal.val[1], bVal.val[0]);
+        cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]);
+        cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv);
+
+        vst2q_f32((float*)(cPtr), cVal);
+        cPtr += 4;
+    }
+
+    for (number = quarterPoints * 4; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) / (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_NEON */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32fc_x2_divide_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                                               const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_divide_32fc_a_generic(lv_32fc_t* cVector,
+                                                      const lv_32fc_t* aVector,
+                                                      const lv_32fc_t* bVector,
+                                                      unsigned int num_points)
  {
-  lv_32fc_t* cPtr = cVector;
-  const lv_32fc_t* aPtr = aVector;
-  const lv_32fc_t* bPtr=  bVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *cPtr++ = (*aPtr++)  / (*bPtr++);
-  }
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    const lv_32fc_t* bPtr = bVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) / (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
diff --git a/kernels/volk/volk_32fc_x2_dot_prod_32fc.h b/kernels/volk/volk_32fc_x2_dot_prod_32fc.h

index f4a4469a92e7d580ccc8d70912426cf51d4d06f0..b0b7fee3f5e044e1283013cb44129df381ebd4c6 100644 (file)
--- a/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
+++ b/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
@@ -33,8 +33,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32fc_x2_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points)
- * \endcode
+ * void volk_32fc_x2_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const
+ * lv_32fc_t* taps, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li input: vector of complex floats.
@@ -58,236 +58,246 @@
  #ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
  #define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
  
-#include <volk/volk_common.h>
-#include <volk/volk_complex.h>
  #include <stdio.h>
  #include <string.h>
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
  
  
  #ifdef LV_HAVE_GENERIC
  
  
-static inline void volk_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result,
+                                                      const lv_32fc_t* input,
+                                                      const lv_32fc_t* taps,
+                                                      unsigned int num_points)
+{
  
-  float * res = (float*) result;
-  float * in = (float*) input;
-  float * tp = (float*) taps;
-  unsigned int n_2_ccomplex_blocks = num_points/2;
+    float* res = (float*)result;
+    float* in = (float*)input;
+    float* tp = (float*)taps;
+    unsigned int n_2_ccomplex_blocks = num_points / 2;
  
-  float sum0[2] = {0,0};
-  float sum1[2] = {0,0};
-  unsigned int i = 0;
+    float sum0[2] = { 0, 0 };
+    float sum1[2] = { 0, 0 };
+    unsigned int i = 0;
  
-  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
-    sum0[0] += in[0] * tp[0] - in[1] * tp[1];
-    sum0[1] += in[0] * tp[1] + in[1] * tp[0];
-    sum1[0] += in[2] * tp[2] - in[3] * tp[3];
-    sum1[1] += in[2] * tp[3] + in[3] * tp[2];
+    for (i = 0; i < n_2_ccomplex_blocks; ++i) {
+        sum0[0] += in[0] * tp[0] - in[1] * tp[1];
+        sum0[1] += in[0] * tp[1] + in[1] * tp[0];
+        sum1[0] += in[2] * tp[2] - in[3] * tp[3];
+        sum1[1] += in[2] * tp[3] + in[3] * tp[2];
  
-    in += 4;
-    tp += 4;
-  }
+        in += 4;
+        tp += 4;
+    }
  
-  res[0] = sum0[0] + sum1[0];
-  res[1] = sum0[1] + sum1[1];
+    res[0] = sum0[0] + sum1[0];
+    res[1] = sum0[1] + sum1[1];
  
-  // Cleanup if we had an odd number of points
-  if (num_points & 1) {
-    *result += input[num_points - 1] * taps[num_points - 1];
-  }
+    // Cleanup if we had an odd number of points
+    if (num_points & 1) {
+        *result += input[num_points - 1] * taps[num_points - 1];
+    }
  }
  
  #endif /*LV_HAVE_GENERIC*/
  
  
-
  #if LV_HAVE_SSE && LV_HAVE_64
  
-static inline void volk_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
-
-  const unsigned int num_bytes = num_points*8;
-  unsigned int isodd = num_points & 1;
-
-  __VOLK_ASM
-    (
-     "#  ccomplex_dotprod_generic (float* result, const float *input,\n\t"
-     "#                         const float *taps, unsigned num_bytes)\n\t"
-     "#    float sum0 = 0;\n\t"
-     "#    float sum1 = 0;\n\t"
-     "#    float sum2 = 0;\n\t"
-     "#    float sum3 = 0;\n\t"
-     "#    do {\n\t"
-     "#      sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
-     "#      sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
-     "#      sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
-     "#      sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
-     "#      input += 4;\n\t"
-     "#      taps += 4;  \n\t"
-     "#    } while (--n_2_ccomplex_blocks != 0);\n\t"
-     "#    result[0] = sum0 + sum2;\n\t"
-     "#    result[1] = sum1 + sum3;\n\t"
-     "# TODO: prefetch and better scheduling\n\t"
-     "  xor    %%r9,  %%r9\n\t"
-     "  xor    %%r10, %%r10\n\t"
-     "  movq   %%rcx, %%rax\n\t"
-     "  movq   %%rcx, %%r8\n\t"
-     "  movq   %[rsi],  %%r9\n\t"
-     "  movq   %[rdx], %%r10\n\t"
-     " xorps   %%xmm6, %%xmm6          # zero accumulators\n\t"
-     " movups  0(%%r9), %%xmm0\n\t"
-     " xorps   %%xmm7, %%xmm7          # zero accumulators\n\t"
-     " movups  0(%%r10), %%xmm2\n\t"
-     " shr     $5, %%rax               # rax = n_2_ccomplex_blocks / 2\n\t"
-     "  shr     $4, %%r8\n\t"
-     " jmp     .%=L1_test\n\t"
-     " # 4 taps / loop\n\t"
-     " # something like ?? cycles / loop\n\t"
-     ".%=Loop1:        \n\t"
-     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
-     "#        movups  (%%r9), %%xmmA\n\t"
-     "#        movups  (%%r10), %%xmmB\n\t"
-     "#        movups  %%xmmA, %%xmmZ\n\t"
-     "#        shufps  $0xb1, %%xmmZ, %%xmmZ   # swap internals\n\t"
-     "#        mulps   %%xmmB, %%xmmA\n\t"
-     "#        mulps   %%xmmZ, %%xmmB\n\t"
-     "#        # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
-     "#        xorps   %%xmmPN, %%xmmA\n\t"
-     "#        movups  %%xmmA, %%xmmZ\n\t"
-     "#        unpcklps %%xmmB, %%xmmA\n\t"
-     "#        unpckhps %%xmmB, %%xmmZ\n\t"
-     "#        movups  %%xmmZ, %%xmmY\n\t"
-     "#        shufps  $0x44, %%xmmA, %%xmmZ   # b01000100\n\t"
-     "#        shufps  $0xee, %%xmmY, %%xmmA   # b11101110\n\t"
-     "#        addps   %%xmmZ, %%xmmA\n\t"
-     "#        addps   %%xmmA, %%xmmC\n\t"
-     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
-     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
-     " movups  16(%%r9), %%xmm1\n\t"
-     " movups  %%xmm0, %%xmm4\n\t"
-     " mulps   %%xmm2, %%xmm0\n\t"
-     " shufps  $0xb1, %%xmm4, %%xmm4   # swap internals\n\t"
-     " movups  16(%%r10), %%xmm3\n\t"
-     " movups  %%xmm1, %%xmm5\n\t"
-     " addps   %%xmm0, %%xmm6\n\t"
-     " mulps   %%xmm3, %%xmm1\n\t"
-     " shufps  $0xb1, %%xmm5, %%xmm5   # swap internals\n\t"
-     " addps   %%xmm1, %%xmm6\n\t"
-     " mulps   %%xmm4, %%xmm2\n\t"
-     " movups  32(%%r9), %%xmm0\n\t"
-     " addps   %%xmm2, %%xmm7\n\t"
-     " mulps   %%xmm5, %%xmm3\n\t"
-     " add     $32, %%r9\n\t"
-     " movups  32(%%r10), %%xmm2\n\t"
-     " addps   %%xmm3, %%xmm7\n\t"
-     " add     $32, %%r10\n\t"
-     ".%=L1_test:\n\t"
-     " dec     %%rax\n\t"
-     " jge     .%=Loop1\n\t"
-     " # We've handled the bulk of multiplies up to here.\n\t"
-     " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
-     " # If so, we've got 2 more taps to do.\n\t"
-     " and     $1, %%r8\n\t"
-     " je      .%=Leven\n\t"
-     " # The count was odd, do 2 more taps.\n\t"
-     " # Note that we've already got mm0/mm2 preloaded\n\t"
-     " # from the main loop.\n\t"
-     " movups  %%xmm0, %%xmm4\n\t"
-     " mulps   %%xmm2, %%xmm0\n\t"
-     " shufps  $0xb1, %%xmm4, %%xmm4   # swap internals\n\t"
-     " addps   %%xmm0, %%xmm6\n\t"
-     " mulps   %%xmm4, %%xmm2\n\t"
-     " addps   %%xmm2, %%xmm7\n\t"
-     ".%=Leven:\n\t"
-     " # neg inversor\n\t"
-     " xorps   %%xmm1, %%xmm1\n\t"
-     " mov     $0x80000000, %%r9\n\t"
-     " movd    %%r9, %%xmm1\n\t"
-     " shufps  $0x11, %%xmm1, %%xmm1   # b00010001 # 0 -0 0 -0\n\t"
-     " # pfpnacc\n\t"
-     " xorps   %%xmm1, %%xmm6\n\t"
-     " movups  %%xmm6, %%xmm2\n\t"
-     " unpcklps %%xmm7, %%xmm6\n\t"
-     " unpckhps %%xmm7, %%xmm2\n\t"
-     " movups  %%xmm2, %%xmm3\n\t"
-     " shufps  $0x44, %%xmm6, %%xmm2   # b01000100\n\t"
-     " shufps  $0xee, %%xmm3, %%xmm6   # b11101110\n\t"
-     " addps   %%xmm2, %%xmm6\n\t"
-     "                                 # xmm6 = r1 i2 r3 i4\n\t"
-     " movhlps %%xmm6, %%xmm4          # xmm4 = r3 i4 ?? ??\n\t"
-     " addps   %%xmm4, %%xmm6          # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
-     " movlps  %%xmm6, (%[rdi])                # store low 2x32 bits (complex) to memory\n\t"
-     :
-     :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
-     :"rax", "r8", "r9", "r10"
-     );
-
-
-  if(isodd) {
-    *result += input[num_points - 1] * taps[num_points - 1];
-  }
-
-  return;
+static inline void volk_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result,
+                                                       const lv_32fc_t* input,
+                                                       const lv_32fc_t* taps,
+                                                       unsigned int num_points)
+{
+
+    const unsigned int num_bytes = num_points * 8;
+    unsigned int isodd = num_points & 1;
+
+    __VOLK_ASM(
+        "#  ccomplex_dotprod_generic (float* result, const float *input,\n\t"
+        "#                         const float *taps, unsigned num_bytes)\n\t"
+        "#    float sum0 = 0;\n\t"
+        "#    float sum1 = 0;\n\t"
+        "#    float sum2 = 0;\n\t"
+        "#    float sum3 = 0;\n\t"
+        "#    do {\n\t"
+        "#      sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+        "#      sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+        "#      sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+        "#      sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+        "#      input += 4;\n\t"
+        "#      taps += 4;  \n\t"
+        "#    } while (--n_2_ccomplex_blocks != 0);\n\t"
+        "#    result[0] = sum0 + sum2;\n\t"
+        "#    result[1] = sum1 + sum3;\n\t"
+        "# TODO: prefetch and better scheduling\n\t"
+        "  xor    %%r9,  %%r9\n\t"
+        "  xor    %%r10, %%r10\n\t"
+        "  movq   %%rcx, %%rax\n\t"
+        "  movq   %%rcx, %%r8\n\t"
+        "  movq   %[rsi],  %%r9\n\t"
+        "  movq   %[rdx], %%r10\n\t"
+        "      xorps   %%xmm6, %%xmm6          # zero accumulators\n\t"
+        "      movups  0(%%r9), %%xmm0\n\t"
+        "      xorps   %%xmm7, %%xmm7          # zero accumulators\n\t"
+        "      movups  0(%%r10), %%xmm2\n\t"
+        "      shr     $5, %%rax               # rax = n_2_ccomplex_blocks / 2\n\t"
+        "  shr     $4, %%r8\n\t"
+        "      jmp     .%=L1_test\n\t"
+        "      # 4 taps / loop\n\t"
+        "      # something like ?? cycles / loop\n\t"
+        ".%=Loop1:     \n\t"
+        "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+        "#     movups  (%%r9), %%xmmA\n\t"
+        "#     movups  (%%r10), %%xmmB\n\t"
+        "#     movups  %%xmmA, %%xmmZ\n\t"
+        "#     shufps  $0xb1, %%xmmZ, %%xmmZ   # swap internals\n\t"
+        "#     mulps   %%xmmB, %%xmmA\n\t"
+        "#     mulps   %%xmmZ, %%xmmB\n\t"
+        "#     # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+        "#     xorps   %%xmmPN, %%xmmA\n\t"
+        "#     movups  %%xmmA, %%xmmZ\n\t"
+        "#     unpcklps %%xmmB, %%xmmA\n\t"
+        "#     unpckhps %%xmmB, %%xmmZ\n\t"
+        "#     movups  %%xmmZ, %%xmmY\n\t"
+        "#     shufps  $0x44, %%xmmA, %%xmmZ   # b01000100\n\t"
+        "#     shufps  $0xee, %%xmmY, %%xmmA   # b11101110\n\t"
+        "#     addps   %%xmmZ, %%xmmA\n\t"
+        "#     addps   %%xmmA, %%xmmC\n\t"
+        "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+        "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+        "      movups  16(%%r9), %%xmm1\n\t"
+        "      movups  %%xmm0, %%xmm4\n\t"
+        "      mulps   %%xmm2, %%xmm0\n\t"
+        "      shufps  $0xb1, %%xmm4, %%xmm4   # swap internals\n\t"
+        "      movups  16(%%r10), %%xmm3\n\t"
+        "      movups  %%xmm1, %%xmm5\n\t"
+        "      addps   %%xmm0, %%xmm6\n\t"
+        "      mulps   %%xmm3, %%xmm1\n\t"
+        "      shufps  $0xb1, %%xmm5, %%xmm5   # swap internals\n\t"
+        "      addps   %%xmm1, %%xmm6\n\t"
+        "      mulps   %%xmm4, %%xmm2\n\t"
+        "      movups  32(%%r9), %%xmm0\n\t"
+        "      addps   %%xmm2, %%xmm7\n\t"
+        "      mulps   %%xmm5, %%xmm3\n\t"
+        "      add     $32, %%r9\n\t"
+        "      movups  32(%%r10), %%xmm2\n\t"
+        "      addps   %%xmm3, %%xmm7\n\t"
+        "      add     $32, %%r10\n\t"
+        ".%=L1_test:\n\t"
+        "      dec     %%rax\n\t"
+        "      jge     .%=Loop1\n\t"
+        "      # We've handled the bulk of multiplies up to here.\n\t"
+        "      # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+        "      # If so, we've got 2 more taps to do.\n\t"
+        "      and     $1, %%r8\n\t"
+        "      je      .%=Leven\n\t"
+        "      # The count was odd, do 2 more taps.\n\t"
+        "      # Note that we've already got mm0/mm2 preloaded\n\t"
+        "      # from the main loop.\n\t"
+        "      movups  %%xmm0, %%xmm4\n\t"
+        "      mulps   %%xmm2, %%xmm0\n\t"
+        "      shufps  $0xb1, %%xmm4, %%xmm4   # swap internals\n\t"
+        "      addps   %%xmm0, %%xmm6\n\t"
+        "      mulps   %%xmm4, %%xmm2\n\t"
+        "      addps   %%xmm2, %%xmm7\n\t"
+        ".%=Leven:\n\t"
+        "      # neg inversor\n\t"
+        "      xorps   %%xmm1, %%xmm1\n\t"
+        "      mov     $0x80000000, %%r9\n\t"
+        "      movd    %%r9, %%xmm1\n\t"
+        "      shufps  $0x11, %%xmm1, %%xmm1   # b00010001 # 0 -0 0 -0\n\t"
+        "      # pfpnacc\n\t"
+        "      xorps   %%xmm1, %%xmm6\n\t"
+        "      movups  %%xmm6, %%xmm2\n\t"
+        "      unpcklps %%xmm7, %%xmm6\n\t"
+        "      unpckhps %%xmm7, %%xmm2\n\t"
+        "      movups  %%xmm2, %%xmm3\n\t"
+        "      shufps  $0x44, %%xmm6, %%xmm2   # b01000100\n\t"
+        "      shufps  $0xee, %%xmm3, %%xmm6   # b11101110\n\t"
+        "      addps   %%xmm2, %%xmm6\n\t"
+        "                                      # xmm6 = r1 i2 r3 i4\n\t"
+        "      movhlps %%xmm6, %%xmm4          # xmm4 = r3 i4 ?? ??\n\t"
+        "      addps   %%xmm4, %%xmm6          # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+        "      movlps  %%xmm6, (%[rdi])                # store low 2x32 bits (complex) "
+        "to memory\n\t"
+        :
+        : [rsi] "r"(input), [rdx] "r"(taps), "c"(num_bytes), [rdi] "r"(result)
+        : "rax", "r8", "r9", "r10");
+
+
+    if (isodd) {
+        *result += input[num_points - 1] * taps[num_points - 1];
+    }
  
+    return;
  }
  
  #endif /* LV_HAVE_SSE && LV_HAVE_64 */
  
  
-
-
  #ifdef LV_HAVE_SSE3
  
  #include <pmmintrin.h>
  
-static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result,
+                                                     const lv_32fc_t* input,
+                                                     const lv_32fc_t* taps,
+                                                     unsigned int num_points)
+{
  
-  lv_32fc_t dotProduct;
-  memset(&dotProduct, 0x0, 2*sizeof(float));
+    lv_32fc_t dotProduct;
+    memset(&dotProduct, 0x0, 2 * sizeof(float));
  
-  unsigned int number = 0;
-  const unsigned int halfPoints = num_points/2;
-  unsigned int isodd = num_points & 1;
+    unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 2;
+    unsigned int isodd = num_points & 1;
  
-  __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+    __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
  
-  const lv_32fc_t* a = input;
-  const lv_32fc_t* b = taps;
+    const lv_32fc_t* a = input;
+    const lv_32fc_t* b = taps;
  
-  dotProdVal = _mm_setzero_ps();
+    dotProdVal = _mm_setzero_ps();
  
-  for(;number < halfPoints; number++){
+    for (; number < halfPoints; number++) {
  
-    x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
-    y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+        x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+        y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
  
-    yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
-    yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+        yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+        yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
  
-    tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+        tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
  
-    x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+        x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
  
-    tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+        tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
  
-    z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+        z = _mm_addsub_ps(tmp1,
+                          tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
  
-    dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
+        dotProdVal =
+            _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
  
-    a += 2;
-    b += 2;
-  }
+        a += 2;
+        b += 2;
+    }
  
-  __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
+    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
  
-  _mm_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
+    _mm_storeu_ps((float*)dotProductVector,
+                  dotProdVal); // Store the results back into the dot product vector
  
-  dotProduct += ( dotProductVector[0] + dotProductVector[1] );
+    dotProduct += (dotProductVector[0] + dotProductVector[1]);
  
-  if(isodd) {
-    dotProduct += input[num_points - 1] * taps[num_points - 1];
-  }
+    if (isodd) {
+        dotProduct += input[num_points - 1] * taps[num_points - 1];
+    }
  
-  *result = dotProduct;
+    *result = dotProduct;
  }
  
  #endif /*LV_HAVE_SSE3*/
@@ -296,78 +306,82 @@ static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv
  
  #include <smmintrin.h>
  
-static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result,
+                                                       const lv_32fc_t* input,
+                                                       const lv_32fc_t* taps,
+                                                       unsigned int num_points)
+{
  
-  unsigned int i = 0;
-  const unsigned int qtr_points = num_points/4;
-  const unsigned int isodd = num_points & 3;
+    unsigned int i = 0;
+    const unsigned int qtr_points = num_points / 4;
+    const unsigned int isodd = num_points & 3;
  
-  __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
-  float *p_input, *p_taps;
-  __m64 *p_result;
+    __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
+    float *p_input, *p_taps;
+    __m64* p_result;
  
-  p_result = (__m64*)result;
-  p_input = (float*)input;
-  p_taps = (float*)taps;
+    p_result = (__m64*)result;
+    p_input = (float*)input;
+    p_taps = (float*)taps;
  
-  static const __m128i neg = {0x000000000000000080000000};
+    static const __m128i neg = { 0x000000000000000080000000 };
  
-  real0 = _mm_setzero_ps();
-  real1 = _mm_setzero_ps();
-  im0 = _mm_setzero_ps();
-  im1 = _mm_setzero_ps();
+    real0 = _mm_setzero_ps();
+    real1 = _mm_setzero_ps();
+    im0 = _mm_setzero_ps();
+    im1 = _mm_setzero_ps();
  
-  for(; i < qtr_points; ++i) {
-    xmm0 = _mm_loadu_ps(p_input);
-    xmm1 = _mm_loadu_ps(p_taps);
+    for (; i < qtr_points; ++i) {
+        xmm0 = _mm_loadu_ps(p_input);
+        xmm1 = _mm_loadu_ps(p_taps);
  
-    p_input += 4;
-    p_taps += 4;
+        p_input += 4;
+        p_taps += 4;
  
-    xmm2 = _mm_loadu_ps(p_input);
-    xmm3 = _mm_loadu_ps(p_taps);
+        xmm2 = _mm_loadu_ps(p_input);
+        xmm3 = _mm_loadu_ps(p_taps);
  
-    p_input += 4;
-    p_taps += 4;
+        p_input += 4;
+        p_taps += 4;
  
-    xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
-    xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
-    xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
-    xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
+        xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
+        xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
+        xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
+        xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
  
-    //imaginary vector from input
-    xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
-    //real vector from input
-    xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
-    //imaginary vector from taps
-    xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
-    //real vector from taps
-    xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
+        // imaginary vector from input
+        xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
+        // real vector from input
+        xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
+        // imaginary vector from taps
+        xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
+        // real vector from taps
+        xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
  
-    xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
-    xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
+        xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
+        xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
  
-    xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
-    xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
+        xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
+        xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
  
-    real0 = _mm_add_ps(xmm4, real0);
-    real1 = _mm_add_ps(xmm5, real1);
-    im0 = _mm_add_ps(xmm6, im0);
-    im1 = _mm_add_ps(xmm7, im1);
-  }
+        real0 = _mm_add_ps(xmm4, real0);
+        real1 = _mm_add_ps(xmm5, real1);
+        im0 = _mm_add_ps(xmm6, im0);
+        im1 = _mm_add_ps(xmm7, im1);
+    }
  
-  real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);
+    real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);
  
-  im0 = _mm_add_ps(im0, im1);
-  real0 = _mm_add_ps(real0, real1);
+    im0 = _mm_add_ps(im0, im1);
+    real0 = _mm_add_ps(real0, real1);
  
-  im0 = _mm_add_ps(im0, real0);
+    im0 = _mm_add_ps(im0, real0);
  
-  _mm_storel_pi(p_result, im0);
+    _mm_storel_pi(p_result, im0);
  
-  for(i = num_points-isodd; i < num_points; i++) {
-    *result += input[i] * taps[i];
-  }
+    for (i = num_points - isodd; i < num_points; i++) {
+        *result += input[i] * taps[i];
+    }
  }
  
  #endif /*LV_HAVE_SSE4_1*/
@@ -376,55 +390,63 @@ static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, const
  
  #include <immintrin.h>
  
-static inline void volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result,
+                                                    const lv_32fc_t* input,
+                                                    const lv_32fc_t* taps,
+                                                    unsigned int num_points)
+{
  
-  unsigned int isodd = num_points & 3;
-  unsigned int i = 0;
-  lv_32fc_t dotProduct;
-  memset(&dotProduct, 0x0, 2*sizeof(float));
+    unsigned int isodd = num_points & 3;
+    unsigned int i = 0;
+    lv_32fc_t dotProduct;
+    memset(&dotProduct, 0x0, 2 * sizeof(float));
  
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
  
-  const lv_32fc_t* a = input;
-  const lv_32fc_t* b = taps;
+    const lv_32fc_t* a = input;
+    const lv_32fc_t* b = taps;
  
-  dotProdVal = _mm256_setzero_ps();
+    dotProdVal = _mm256_setzero_ps();
  
-  for(;number < quarterPoints; number++){
-    x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
-    y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
+    for (; number < quarterPoints; number++) {
+        x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
+        y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
  
-    yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
-    yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
+        yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
+        yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
  
-    tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
+        tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
  
-    x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
+        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
  
-    tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
+        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
  
-    z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+        z = _mm256_addsub_ps(tmp1,
+                             tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
  
-    dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex multiplication results together
+        dotProdVal = _mm256_add_ps(dotProdVal,
+                                   z); // Add the complex multiplication results together
  
-    a += 4;
-    b += 4;
-  }
+        a += 4;
+        b += 4;
+    }
  
-  __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
  
-  _mm256_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
+    _mm256_storeu_ps((float*)dotProductVector,
+                     dotProdVal); // Store the results back into the dot product vector
  
-  dotProduct += ( dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + dotProductVector[3]);
+    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
+                   dotProductVector[3]);
  
-  for(i = num_points-isodd; i < num_points; i++) {
-    dotProduct += input[i] * taps[i];
-  }
+    for (i = num_points - isodd; i < num_points; i++) {
+        dotProduct += input[i] * taps[i];
+    }
  
-  *result = dotProduct;
+    *result = dotProduct;
  }
  
  #endif /*LV_HAVE_AVX*/
@@ -432,56 +454,64 @@ static inline void volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result, const lv_
  #if LV_HAVE_AVX && LV_HAVE_FMA
  #include <immintrin.h>
  
-static inline void volk_32fc_x2_dot_prod_32fc_u_avx_fma(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_dot_prod_32fc_u_avx_fma(lv_32fc_t* result,
+                                                        const lv_32fc_t* input,
+                                                        const lv_32fc_t* taps,
+                                                        unsigned int num_points)
+{
  
-  unsigned int isodd = num_points & 3;
-  unsigned int i = 0;
-  lv_32fc_t dotProduct;
-  memset(&dotProduct, 0x0, 2*sizeof(float));
+    unsigned int isodd = num_points & 3;
+    unsigned int i = 0;
+    lv_32fc_t dotProduct;
+    memset(&dotProduct, 0x0, 2 * sizeof(float));
  
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
  
-  const lv_32fc_t* a = input;
-  const lv_32fc_t* b = taps;
+    const lv_32fc_t* a = input;
+    const lv_32fc_t* b = taps;
  
-  dotProdVal = _mm256_setzero_ps();
+    dotProdVal = _mm256_setzero_ps();
  
-  for(;number < quarterPoints; number++){
+    for (; number < quarterPoints; number++) {
  
-    x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
-    y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
+        x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
+        y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
  
-    yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
-    yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
+        yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
+        yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
  
-    tmp1 = x;
+        tmp1 = x;
  
-    x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
+        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
  
-    tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
+        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
  
-    z = _mm256_fmaddsub_ps(tmp1, yl,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+        z = _mm256_fmaddsub_ps(
+            tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
  
-    dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex multiplication results together
+        dotProdVal = _mm256_add_ps(dotProdVal,
+                                   z); // Add the complex multiplication results together
  
-    a += 4;
-    b += 4;
-  }
+        a += 4;
+        b += 4;
+    }
  
-  __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
  
-  _mm256_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
+    _mm256_storeu_ps((float*)dotProductVector,
+                     dotProdVal); // Store the results back into the dot product vector
  
-  dotProduct += ( dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + dotProductVector[3]);
+    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
+                   dotProductVector[3]);
  
-  for(i = num_points-isodd; i < num_points; i++) {
-    dotProduct += input[i] * taps[i];
-  }
+    for (i = num_points - isodd; i < num_points; i++) {
+        dotProduct += input[i] * taps[i];
+    }
  
-  *result = dotProduct;
+    *result = dotProduct;
  }
  
  #endif /*LV_HAVE_AVX && LV_HAVE_FMA*/
@@ -491,44 +521,48 @@ static inline void volk_32fc_x2_dot_prod_32fc_u_avx_fma(lv_32fc_t* result, const
  #ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
  #define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
  
-#include <volk/volk_common.h>
-#include <volk/volk_complex.h>
  #include <stdio.h>
  #include <string.h>
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
  
  
  #ifdef LV_HAVE_GENERIC
  
  
-static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result,
+                                                        const lv_32fc_t* input,
+                                                        const lv_32fc_t* taps,
+                                                        unsigned int num_points)
+{
  
-  const unsigned int num_bytes = num_points*8;
+    const unsigned int num_bytes = num_points * 8;
  
-  float * res = (float*) result;
-  float * in = (float*) input;
-  float * tp = (float*) taps;
-  unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
+    float* res = (float*)result;
+    float* in = (float*)input;
+    float* tp = (float*)taps;
+    unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
  
-  float sum0[2] = {0,0};
-  float sum1[2] = {0,0};
-  unsigned int i = 0;
+    float sum0[2] = { 0, 0 };
+    float sum1[2] = { 0, 0 };
+    unsigned int i = 0;
  
-  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
-    sum0[0] += in[0] * tp[0] - in[1] * tp[1];
-    sum0[1] += in[0] * tp[1] + in[1] * tp[0];
-    sum1[0] += in[2] * tp[2] - in[3] * tp[3];
-    sum1[1] += in[2] * tp[3] + in[3] * tp[2];
+    for (i = 0; i < n_2_ccomplex_blocks; ++i) {
+        sum0[0] += in[0] * tp[0] - in[1] * tp[1];
+        sum0[1] += in[0] * tp[1] + in[1] * tp[0];
+        sum1[0] += in[2] * tp[2] - in[3] * tp[3];
+        sum1[1] += in[2] * tp[3] + in[3] * tp[2];
  
-    in += 4;
-    tp += 4;
-  }
+        in += 4;
+        tp += 4;
+    }
  
-  res[0] = sum0[0] + sum1[0];
-  res[1] = sum0[1] + sum1[1];
+    res[0] = sum0[0] + sum1[0];
+    res[1] = sum0[1] + sum1[1];
  
-  if (num_points & 1) {
-    *result += input[num_points - 1] * taps[num_points - 1];
-  }
+    if (num_points & 1) {
+        *result += input[num_points - 1] * taps[num_points - 1];
+    }
  }
  
  #endif /*LV_HAVE_GENERIC*/
@@ -537,140 +571,146 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const
  #if LV_HAVE_SSE && LV_HAVE_64
  
  
-static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
-
-  const unsigned int num_bytes = num_points*8;
-  unsigned int isodd = num_points & 1;
-
-  __VOLK_ASM
-    (
-     "#  ccomplex_dotprod_generic (float* result, const float *input,\n\t"
-     "#                         const float *taps, unsigned num_bytes)\n\t"
-     "#    float sum0 = 0;\n\t"
-     "#    float sum1 = 0;\n\t"
-     "#    float sum2 = 0;\n\t"
-     "#    float sum3 = 0;\n\t"
-     "#    do {\n\t"
-     "#      sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
-     "#      sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
-     "#      sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
-     "#      sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
-     "#      input += 4;\n\t"
-     "#      taps += 4;  \n\t"
-     "#    } while (--n_2_ccomplex_blocks != 0);\n\t"
-     "#    result[0] = sum0 + sum2;\n\t"
-     "#    result[1] = sum1 + sum3;\n\t"
-     "# TODO: prefetch and better scheduling\n\t"
-     "  xor    %%r9,  %%r9\n\t"
-     "  xor    %%r10, %%r10\n\t"
-     "  movq   %%rcx, %%rax\n\t"
-     "  movq   %%rcx, %%r8\n\t"
-     "  movq   %[rsi],  %%r9\n\t"
-     "  movq   %[rdx], %%r10\n\t"
-     " xorps   %%xmm6, %%xmm6          # zero accumulators\n\t"
-     " movaps  0(%%r9), %%xmm0\n\t"
-     " xorps   %%xmm7, %%xmm7          # zero accumulators\n\t"
-     " movaps  0(%%r10), %%xmm2\n\t"
-     " shr     $5, %%rax               # rax = n_2_ccomplex_blocks / 2\n\t"
-     "  shr     $4, %%r8\n\t"
-     " jmp     .%=L1_test\n\t"
-     " # 4 taps / loop\n\t"
-     " # something like ?? cycles / loop\n\t"
-     ".%=Loop1:        \n\t"
-     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
-     "#        movaps  (%%r9), %%xmmA\n\t"
-     "#        movaps  (%%r10), %%xmmB\n\t"
-     "#        movaps  %%xmmA, %%xmmZ\n\t"
-     "#        shufps  $0xb1, %%xmmZ, %%xmmZ   # swap internals\n\t"
-     "#        mulps   %%xmmB, %%xmmA\n\t"
-     "#        mulps   %%xmmZ, %%xmmB\n\t"
-     "#        # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
-     "#        xorps   %%xmmPN, %%xmmA\n\t"
-     "#        movaps  %%xmmA, %%xmmZ\n\t"
-     "#        unpcklps %%xmmB, %%xmmA\n\t"
-     "#        unpckhps %%xmmB, %%xmmZ\n\t"
-     "#        movaps  %%xmmZ, %%xmmY\n\t"
-     "#        shufps  $0x44, %%xmmA, %%xmmZ   # b01000100\n\t"
-     "#        shufps  $0xee, %%xmmY, %%xmmA   # b11101110\n\t"
-     "#        addps   %%xmmZ, %%xmmA\n\t"
-     "#        addps   %%xmmA, %%xmmC\n\t"
-     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
-     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
-     " movaps  16(%%r9), %%xmm1\n\t"
-     " movaps  %%xmm0, %%xmm4\n\t"
-     " mulps   %%xmm2, %%xmm0\n\t"
-     " shufps  $0xb1, %%xmm4, %%xmm4   # swap internals\n\t"
-     " movaps  16(%%r10), %%xmm3\n\t"
-     " movaps  %%xmm1, %%xmm5\n\t"
-     " addps   %%xmm0, %%xmm6\n\t"
-     " mulps   %%xmm3, %%xmm1\n\t"
-     " shufps  $0xb1, %%xmm5, %%xmm5   # swap internals\n\t"
-     " addps   %%xmm1, %%xmm6\n\t"
-     " mulps   %%xmm4, %%xmm2\n\t"
-     " movaps  32(%%r9), %%xmm0\n\t"
-     " addps   %%xmm2, %%xmm7\n\t"
-     " mulps   %%xmm5, %%xmm3\n\t"
-     " add     $32, %%r9\n\t"
-     " movaps  32(%%r10), %%xmm2\n\t"
-     " addps   %%xmm3, %%xmm7\n\t"
-     " add     $32, %%r10\n\t"
-     ".%=L1_test:\n\t"
-     " dec     %%rax\n\t"
-     " jge     .%=Loop1\n\t"
-     " # We've handled the bulk of multiplies up to here.\n\t"
-     " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
-     " # If so, we've got 2 more taps to do.\n\t"
-     " and     $1, %%r8\n\t"
-     " je      .%=Leven\n\t"
-     " # The count was odd, do 2 more taps.\n\t"
-     " # Note that we've already got mm0/mm2 preloaded\n\t"
-     " # from the main loop.\n\t"
-     " movaps  %%xmm0, %%xmm4\n\t"
-     " mulps   %%xmm2, %%xmm0\n\t"
-     " shufps  $0xb1, %%xmm4, %%xmm4   # swap internals\n\t"
-     " addps   %%xmm0, %%xmm6\n\t"
-     " mulps   %%xmm4, %%xmm2\n\t"
-     " addps   %%xmm2, %%xmm7\n\t"
-     ".%=Leven:\n\t"
-     " # neg inversor\n\t"
-     " xorps   %%xmm1, %%xmm1\n\t"
-     " mov     $0x80000000, %%r9\n\t"
-     " movd    %%r9, %%xmm1\n\t"
-     " shufps  $0x11, %%xmm1, %%xmm1   # b00010001 # 0 -0 0 -0\n\t"
-     " # pfpnacc\n\t"
-     " xorps   %%xmm1, %%xmm6\n\t"
-     " movaps  %%xmm6, %%xmm2\n\t"
-     " unpcklps %%xmm7, %%xmm6\n\t"
-     " unpckhps %%xmm7, %%xmm2\n\t"
-     " movaps  %%xmm2, %%xmm3\n\t"
-     " shufps  $0x44, %%xmm6, %%xmm2   # b01000100\n\t"
-     " shufps  $0xee, %%xmm3, %%xmm6   # b11101110\n\t"
-     " addps   %%xmm2, %%xmm6\n\t"
-     "                                 # xmm6 = r1 i2 r3 i4\n\t"
-     " movhlps %%xmm6, %%xmm4          # xmm4 = r3 i4 ?? ??\n\t"
-     " addps   %%xmm4, %%xmm6          # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
-     " movlps  %%xmm6, (%[rdi])                # store low 2x32 bits (complex) to memory\n\t"
-     :
-     :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
-     :"rax", "r8", "r9", "r10"
-     );
-
-
-  if(isodd) {
-    *result += input[num_points - 1] * taps[num_points - 1];
-  }
-
-  return;
+static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result,
+                                                       const lv_32fc_t* input,
+                                                       const lv_32fc_t* taps,
+                                                       unsigned int num_points)
+{
+
+    const unsigned int num_bytes = num_points * 8;
+    unsigned int isodd = num_points & 1;
+
+    __VOLK_ASM(
+        "#  ccomplex_dotprod_generic (float* result, const float *input,\n\t"
+        "#                         const float *taps, unsigned num_bytes)\n\t"
+        "#    float sum0 = 0;\n\t"
+        "#    float sum1 = 0;\n\t"
+        "#    float sum2 = 0;\n\t"
+        "#    float sum3 = 0;\n\t"
+        "#    do {\n\t"
+        "#      sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+        "#      sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+        "#      sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+        "#      sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+        "#      input += 4;\n\t"
+        "#      taps += 4;  \n\t"
+        "#    } while (--n_2_ccomplex_blocks != 0);\n\t"
+        "#    result[0] = sum0 + sum2;\n\t"
+        "#    result[1] = sum1 + sum3;\n\t"
+        "# TODO: prefetch and better scheduling\n\t"
+        "  xor    %%r9,  %%r9\n\t"
+        "  xor    %%r10, %%r10\n\t"
+        "  movq   %%rcx, %%rax\n\t"
+        "  movq   %%rcx, %%r8\n\t"
+        "  movq   %[rsi],  %%r9\n\t"
+        "  movq   %[rdx], %%r10\n\t"
+        "      xorps   %%xmm6, %%xmm6          # zero accumulators\n\t"
+        "      movaps  0(%%r9), %%xmm0\n\t"
+        "      xorps   %%xmm7, %%xmm7          # zero accumulators\n\t"
+        "      movaps  0(%%r10), %%xmm2\n\t"
+        "      shr     $5, %%rax               # rax = n_2_ccomplex_blocks / 2\n\t"
+        "  shr     $4, %%r8\n\t"
+        "      jmp     .%=L1_test\n\t"
+        "      # 4 taps / loop\n\t"
+        "      # something like ?? cycles / loop\n\t"
+        ".%=Loop1:     \n\t"
+        "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+        "#     movaps  (%%r9), %%xmmA\n\t"
+        "#     movaps  (%%r10), %%xmmB\n\t"
+        "#     movaps  %%xmmA, %%xmmZ\n\t"
+        "#     shufps  $0xb1, %%xmmZ, %%xmmZ   # swap internals\n\t"
+        "#     mulps   %%xmmB, %%xmmA\n\t"
+        "#     mulps   %%xmmZ, %%xmmB\n\t"
+        "#     # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+        "#     xorps   %%xmmPN, %%xmmA\n\t"
+        "#     movaps  %%xmmA, %%xmmZ\n\t"
+        "#     unpcklps %%xmmB, %%xmmA\n\t"
+        "#     unpckhps %%xmmB, %%xmmZ\n\t"
+        "#     movaps  %%xmmZ, %%xmmY\n\t"
+        "#     shufps  $0x44, %%xmmA, %%xmmZ   # b01000100\n\t"
+        "#     shufps  $0xee, %%xmmY, %%xmmA   # b11101110\n\t"
+        "#     addps   %%xmmZ, %%xmmA\n\t"
+        "#     addps   %%xmmA, %%xmmC\n\t"
+        "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+        "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+        "      movaps  16(%%r9), %%xmm1\n\t"
+        "      movaps  %%xmm0, %%xmm4\n\t"
+        "      mulps   %%xmm2, %%xmm0\n\t"
+        "      shufps  $0xb1, %%xmm4, %%xmm4   # swap internals\n\t"
+        "      movaps  16(%%r10), %%xmm3\n\t"
+        "      movaps  %%xmm1, %%xmm5\n\t"
+        "      addps   %%xmm0, %%xmm6\n\t"
+        "      mulps   %%xmm3, %%xmm1\n\t"
+        "      shufps  $0xb1, %%xmm5, %%xmm5   # swap internals\n\t"
+        "      addps   %%xmm1, %%xmm6\n\t"
+        "      mulps   %%xmm4, %%xmm2\n\t"
+        "      movaps  32(%%r9), %%xmm0\n\t"
+        "      addps   %%xmm2, %%xmm7\n\t"
+        "      mulps   %%xmm5, %%xmm3\n\t"
+        "      add     $32, %%r9\n\t"
+        "      movaps  32(%%r10), %%xmm2\n\t"
+        "      addps   %%xmm3, %%xmm7\n\t"
+        "      add     $32, %%r10\n\t"
+        ".%=L1_test:\n\t"
+        "      dec     %%rax\n\t"
+        "      jge     .%=Loop1\n\t"
+        "      # We've handled the bulk of multiplies up to here.\n\t"
+        "      # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+        "      # If so, we've got 2 more taps to do.\n\t"
+        "      and     $1, %%r8\n\t"
+        "      je      .%=Leven\n\t"
+        "      # The count was odd, do 2 more taps.\n\t"
+        "      # Note that we've already got mm0/mm2 preloaded\n\t"
+        "      # from the main loop.\n\t"
+        "      movaps  %%xmm0, %%xmm4\n\t"
+        "      mulps   %%xmm2, %%xmm0\n\t"
+        "      shufps  $0xb1, %%xmm4, %%xmm4   # swap internals\n\t"
+        "      addps   %%xmm0, %%xmm6\n\t"
+        "      mulps   %%xmm4, %%xmm2\n\t"
+        "      addps   %%xmm2, %%xmm7\n\t"
+        ".%=Leven:\n\t"
+        "      # neg inversor\n\t"
+        "      xorps   %%xmm1, %%xmm1\n\t"
+        "      mov     $0x80000000, %%r9\n\t"
+        "      movd    %%r9, %%xmm1\n\t"
+        "      shufps  $0x11, %%xmm1, %%xmm1   # b00010001 # 0 -0 0 -0\n\t"
+        "      # pfpnacc\n\t"
+        "      xorps   %%xmm1, %%xmm6\n\t"
+        "      movaps  %%xmm6, %%xmm2\n\t"
+        "      unpcklps %%xmm7, %%xmm6\n\t"
+        "      unpckhps %%xmm7, %%xmm2\n\t"
+        "      movaps  %%xmm2, %%xmm3\n\t"
+        "      shufps  $0x44, %%xmm6, %%xmm2   # b01000100\n\t"
+        "      shufps  $0xee, %%xmm3, %%xmm6   # b11101110\n\t"
+        "      addps   %%xmm2, %%xmm6\n\t"
+        "                                      # xmm6 = r1 i2 r3 i4\n\t"
+        "      movhlps %%xmm6, %%xmm4          # xmm4 = r3 i4 ?? ??\n\t"
+        "      addps   %%xmm4, %%xmm6          # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+        "      movlps  %%xmm6, (%[rdi])                # store low 2x32 bits (complex) "
+        "to memory\n\t"
+        :
+        : [rsi] "r"(input), [rdx] "r"(taps), "c"(num_bytes), [rdi] "r"(result)
+        : "rax", "r8", "r9", "r10");
+
+
+    if (isodd) {
+        *result += input[num_points - 1] * taps[num_points - 1];
+    }
  
+    return;
  }
  
  #endif
  
  #if LV_HAVE_SSE && LV_HAVE_32
  
-static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result,
+                                                       const lv_32fc_t* input,
+                                                       const lv_32fc_t* taps,
+                                                       unsigned int num_points)
+{
  
-  volk_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points);
+    volk_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points);
  
  #if 0
    const unsigned int num_bytes = num_points*8;
@@ -792,57 +832,64 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const
  
  #include <pmmintrin.h>
  
-static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result,
+                                                     const lv_32fc_t* input,
+                                                     const lv_32fc_t* taps,
+                                                     unsigned int num_points)
+{
  
-  const unsigned int num_bytes = num_points*8;
-  unsigned int isodd = num_points & 1;
+    const unsigned int num_bytes = num_points * 8;
+    unsigned int isodd = num_points & 1;
  
-  lv_32fc_t dotProduct;
-  memset(&dotProduct, 0x0, 2*sizeof(float));
+    lv_32fc_t dotProduct;
+    memset(&dotProduct, 0x0, 2 * sizeof(float));
  
-  unsigned int number = 0;
-  const unsigned int halfPoints = num_bytes >> 4;
+    unsigned int number = 0;
+    const unsigned int halfPoints = num_bytes >> 4;
  
-  __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+    __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
  
-  const lv_32fc_t* a = input;
-  const lv_32fc_t* b = taps;
+    const lv_32fc_t* a = input;
+    const lv_32fc_t* b = taps;
  
-  dotProdVal = _mm_setzero_ps();
+    dotProdVal = _mm_setzero_ps();
  
-  for(;number < halfPoints; number++){
+    for (; number < halfPoints; number++) {
  
-    x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
-    y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+        x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+        y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
  
-    yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
-    yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+        yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+        yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
  
-    tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+        tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
  
-    x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+        x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
  
-    tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+        tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
  
-    z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+        z = _mm_addsub_ps(tmp1,
+                          tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
  
-    dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
+        dotProdVal =
+            _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
  
-    a += 2;
-    b += 2;
-  }
+        a += 2;
+        b += 2;
+    }
  
-  __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
+    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
  
-  _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
+    _mm_store_ps((float*)dotProductVector,
+                 dotProdVal); // Store the results back into the dot product vector
  
-  dotProduct += ( dotProductVector[0] + dotProductVector[1] );
+    dotProduct += (dotProductVector[0] + dotProductVector[1]);
  
-  if(isodd) {
-    dotProduct += input[num_points - 1] * taps[num_points - 1];
-  }
+    if (isodd) {
+        dotProduct += input[num_points - 1] * taps[num_points - 1];
+    }
  
-  *result = dotProduct;
+    *result = dotProduct;
  }
  
  #endif /*LV_HAVE_SSE3*/
@@ -852,78 +899,82 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv
  
  #include <smmintrin.h>
  
-static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result,
+                                                       const lv_32fc_t* input,
+                                                       const lv_32fc_t* taps,
+                                                       unsigned int num_points)
+{
  
-  unsigned int i = 0;
-  const unsigned int qtr_points = num_points/4;
-  const unsigned int isodd = num_points & 3;
+    unsigned int i = 0;
+    const unsigned int qtr_points = num_points / 4;
+    const unsigned int isodd = num_points & 3;
  
-  __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
-  float *p_input, *p_taps;
-  __m64 *p_result;
+    __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
+    float *p_input, *p_taps;
+    __m64* p_result;
  
-  static const __m128i neg = {0x000000000000000080000000};
+    static const __m128i neg = { 0x000000000000000080000000 };
  
-  p_result = (__m64*)result;
-  p_input = (float*)input;
-  p_taps = (float*)taps;
+    p_result = (__m64*)result;
+    p_input = (float*)input;
+    p_taps = (float*)taps;
  
-  real0 = _mm_setzero_ps();
-  real1 = _mm_setzero_ps();
-  im0 = _mm_setzero_ps();
-  im1 = _mm_setzero_ps();
+    real0 = _mm_setzero_ps();
+    real1 = _mm_setzero_ps();
+    im0 = _mm_setzero_ps();
+    im1 = _mm_setzero_ps();
  
-  for(; i < qtr_points; ++i) {
-    xmm0 = _mm_load_ps(p_input);
-    xmm1 = _mm_load_ps(p_taps);
+    for (; i < qtr_points; ++i) {
+        xmm0 = _mm_load_ps(p_input);
+        xmm1 = _mm_load_ps(p_taps);
  
-    p_input += 4;
-    p_taps += 4;
+        p_input += 4;
+        p_taps += 4;
  
-    xmm2 = _mm_load_ps(p_input);
-    xmm3 = _mm_load_ps(p_taps);
+        xmm2 = _mm_load_ps(p_input);
+        xmm3 = _mm_load_ps(p_taps);
  
-    p_input += 4;
-    p_taps += 4;
+        p_input += 4;
+        p_taps += 4;
  
-    xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
-    xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
-    xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
-    xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
+        xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
+        xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
+        xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
+        xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
  
-    //imaginary vector from input
-    xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
-    //real vector from input
-    xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
-    //imaginary vector from taps
-    xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
-    //real vector from taps
-    xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
+        // imaginary vector from input
+        xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
+        // real vector from input
+        xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
+        // imaginary vector from taps
+        xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
+        // real vector from taps
+        xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
  
-    xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
-    xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
+        xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
+        xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
  
-    xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
-    xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
+        xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
+        xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
  
-    real0 = _mm_add_ps(xmm4, real0);
-    real1 = _mm_add_ps(xmm5, real1);
-    im0 = _mm_add_ps(xmm6, im0);
-    im1 = _mm_add_ps(xmm7, im1);
-  }
+        real0 = _mm_add_ps(xmm4, real0);
+        real1 = _mm_add_ps(xmm5, real1);
+        im0 = _mm_add_ps(xmm6, im0);
+        im1 = _mm_add_ps(xmm7, im1);
+    }
  
-  real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);
+    real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);
  
-  im0 = _mm_add_ps(im0, im1);
-  real0 = _mm_add_ps(real0, real1);
+    im0 = _mm_add_ps(im0, im1);
+    real0 = _mm_add_ps(real0, real1);
  
-  im0 = _mm_add_ps(im0, real0);
+    im0 = _mm_add_ps(im0, real0);
  
-  _mm_storel_pi(p_result, im0);
+    _mm_storel_pi(p_result, im0);
  
-  for(i = num_points-isodd; i < num_points; i++) {
-    *result += input[i] * taps[i];
-  }
+    for (i = num_points - isodd; i < num_points; i++) {
+        *result += input[i] * taps[i];
+    }
  }
  
  #endif /*LV_HAVE_SSE4_1*/
@@ -931,13 +982,17 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result,
+                                                   const lv_32fc_t* input,
+                                                   const lv_32fc_t* taps,
+                                                   unsigned int num_points)
+{
  
      unsigned int quarter_points = num_points / 4;
      unsigned int number;
  
-    lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
-    lv_32fc_t* b_ptr = (lv_32fc_t*) input;
+    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
+    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
      // for 2-lane vectors, 1st lane holds the real part,
      // 2nd lane holds the imaginary part
      float32x4x2_t a_val, b_val, c_val, accumulator;
@@ -945,11 +1000,11 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const lv_3
      accumulator.val[0] = vdupq_n_f32(0);
      accumulator.val[1] = vdupq_n_f32(0);
  
-    for(number = 0; number < quarter_points; ++number) {
+    for (number = 0; number < quarter_points; ++number) {
          a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
          b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-        __VOLK_PREFETCH(a_ptr+8);
-        __VOLK_PREFETCH(b_ptr+8);
+        __VOLK_PREFETCH(a_ptr + 8);
+        __VOLK_PREFETCH(b_ptr + 8);
  
          // multiply the real*real and imag*imag to get real result
          // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
@@ -977,22 +1032,25 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const lv_3
      *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
  
      // tail case
-    for(number = quarter_points*4; number < num_points; ++number) {
+    for (number = quarter_points * 4; number < num_points; ++number) {
          *result += (*a_ptr++) * (*b_ptr++);
      }
-
  }
  #endif /*LV_HAVE_NEON*/
  
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
-static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result,
+                                                            const lv_32fc_t* input,
+                                                            const lv_32fc_t* taps,
+                                                            unsigned int num_points)
+{
  
      unsigned int quarter_points = num_points / 4;
      unsigned int number;
  
-    lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
-    lv_32fc_t* b_ptr = (lv_32fc_t*) input;
+    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
+    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
      // for 2-lane vectors, 1st lane holds the real part,
      // 2nd lane holds the imaginary part
      float32x4x2_t a_val, b_val, accumulator;
@@ -1000,11 +1058,11 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, c
      accumulator.val[0] = vdupq_n_f32(0);
      accumulator.val[1] = vdupq_n_f32(0);
  
-    for(number = 0; number < quarter_points; ++number) {
+    for (number = 0; number < quarter_points; ++number) {
          a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
          b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-        __VOLK_PREFETCH(a_ptr+8);
-        __VOLK_PREFETCH(b_ptr+8);
+        __VOLK_PREFETCH(a_ptr + 8);
+        __VOLK_PREFETCH(b_ptr + 8);
  
          // do the first multiply
          tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
@@ -1026,21 +1084,24 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, c
      *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
  
      // tail case
-    for(number = quarter_points*4; number < num_points; ++number) {
+    for (number = quarter_points * 4; number < num_points; ++number) {
          *result += (*a_ptr++) * (*b_ptr++);
      }
-
  }
  #endif /*LV_HAVE_NEON*/
  
  #ifdef LV_HAVE_NEON
-static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result,
+                                                          const lv_32fc_t* input,
+                                                          const lv_32fc_t* taps,
+                                                          unsigned int num_points)
+{
  
      unsigned int quarter_points = num_points / 4;
      unsigned int number;
  
-    lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
-    lv_32fc_t* b_ptr = (lv_32fc_t*) input;
+    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
+    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
      // for 2-lane vectors, 1st lane holds the real part,
      // 2nd lane holds the imaginary part
      float32x4x2_t a_val, b_val, accumulator1, accumulator2;
@@ -1049,11 +1110,11 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, con
      accumulator2.val[0] = vdupq_n_f32(0);
      accumulator2.val[1] = vdupq_n_f32(0);
  
-    for(number = 0; number < quarter_points; ++number) {
+    for (number = 0; number < quarter_points; ++number) {
          a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
          b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-        __VOLK_PREFETCH(a_ptr+8);
-        __VOLK_PREFETCH(b_ptr+8);
+        __VOLK_PREFETCH(a_ptr + 8);
+        __VOLK_PREFETCH(b_ptr + 8);
  
          // use 2 accumulators to remove inter-instruction data dependencies
          accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
@@ -1071,22 +1132,26 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, con
      *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
  
      // tail case
-    for(number = quarter_points*4; number < num_points; ++number) {
+    for (number = quarter_points * 4; number < num_points; ++number) {
          *result += (*a_ptr++) * (*b_ptr++);
      }
-
  }
  #endif /*LV_HAVE_NEON*/
  
  #ifdef LV_HAVE_NEON
-static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
-// NOTE: GCC does a poor job with this kernel, but the equivalent ASM code is very fast
+static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* result,
+                                                                const lv_32fc_t* input,
+                                                                const lv_32fc_t* taps,
+                                                                unsigned int num_points)
+{
+    // NOTE: GCC does a poor job with this kernel, but the equivalent ASM code is very
+    // fast
  
      unsigned int quarter_points = num_points / 8;
      unsigned int number;
  
-    lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
-    lv_32fc_t* b_ptr = (lv_32fc_t*) input;
+    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
+    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
      // for 2-lane vectors, 1st lane holds the real part,
      // 2nd lane holds the imaginary part
      float32x4x4_t a_val, b_val, accumulator1, accumulator2;
@@ -1101,11 +1166,11 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* resul
      accumulator2.val[3] = vdupq_n_f32(0);
  
      // 8 input regs, 8 accumulators -> 16/16 neon regs are used
-    for(number = 0; number < quarter_points; ++number) {
+    for (number = 0; number < quarter_points; ++number) {
          a_val = vld4q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
          b_val = vld4q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-        __VOLK_PREFETCH(a_ptr+8);
-        __VOLK_PREFETCH(b_ptr+8);
+        __VOLK_PREFETCH(a_ptr + 8);
+        __VOLK_PREFETCH(b_ptr + 8);
  
          // use 2 accumulators to remove inter-instruction data dependencies
          accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
@@ -1136,10 +1201,9 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* resul
      *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
  
      // tail case
-    for(number = quarter_points*8; number < num_points; ++number) {
+    for (number = quarter_points * 8; number < num_points; ++number) {
          *result += (*a_ptr++) * (*b_ptr++);
      }
-
  }
  #endif /*LV_HAVE_NEON*/
  
@@ -1148,56 +1212,64 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* resul
  
  #include <immintrin.h>
  
-static inline void volk_32fc_x2_dot_prod_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_dot_prod_32fc_a_avx(lv_32fc_t* result,
+                                                    const lv_32fc_t* input,
+                                                    const lv_32fc_t* taps,
+                                                    unsigned int num_points)
+{
  
-  unsigned int isodd = num_points & 3;
-  unsigned int i = 0;
-  lv_32fc_t dotProduct;
-  memset(&dotProduct, 0x0, 2*sizeof(float));
+    unsigned int isodd = num_points & 3;
+    unsigned int i = 0;
+    lv_32fc_t dotProduct;
+    memset(&dotProduct, 0x0, 2 * sizeof(float));
  
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
  
-  const lv_32fc_t* a = input;
-  const lv_32fc_t* b = taps;
+    const lv_32fc_t* a = input;
+    const lv_32fc_t* b = taps;
  
-  dotProdVal = _mm256_setzero_ps();
+    dotProdVal = _mm256_setzero_ps();
  
-  for(;number < quarterPoints; number++){
+    for (; number < quarterPoints; number++) {
  
-    x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
-    y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
+        x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
+        y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
  
-    yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
-    yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
+        yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
+        yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
  
-    tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
+        tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
  
-    x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
+        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
  
-    tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
+        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
  
-    z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+        z = _mm256_addsub_ps(tmp1,
+                             tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
  
-    dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex multiplication results together
+        dotProdVal = _mm256_add_ps(dotProdVal,
+                                   z); // Add the complex multiplication results together
  
-    a += 4;
-    b += 4;
-  }
+        a += 4;
+        b += 4;
+    }
  
-  __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
  
-  _mm256_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
+    _mm256_store_ps((float*)dotProductVector,
+                    dotProdVal); // Store the results back into the dot product vector
  
-  dotProduct += ( dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + dotProductVector[3]);
+    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
+                   dotProductVector[3]);
  
-  for(i = num_points-isodd; i < num_points; i++) {
-    dotProduct += input[i] * taps[i];
-  }
+    for (i = num_points - isodd; i < num_points; i++) {
+        dotProduct += input[i] * taps[i];
+    }
  
-  *result = dotProduct;
+    *result = dotProduct;
  }
  
  #endif /*LV_HAVE_AVX*/
@@ -1205,56 +1277,64 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_avx(lv_32fc_t* result, const lv_
  #if LV_HAVE_AVX && LV_HAVE_FMA
  #include <immintrin.h>
  
-static inline void volk_32fc_x2_dot_prod_32fc_a_avx_fma(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_dot_prod_32fc_a_avx_fma(lv_32fc_t* result,
+                                                        const lv_32fc_t* input,
+                                                        const lv_32fc_t* taps,
+                                                        unsigned int num_points)
+{
  
-  unsigned int isodd = num_points & 3;
-  unsigned int i = 0;
-  lv_32fc_t dotProduct;
-  memset(&dotProduct, 0x0, 2*sizeof(float));
+    unsigned int isodd = num_points & 3;
+    unsigned int i = 0;
+    lv_32fc_t dotProduct;
+    memset(&dotProduct, 0x0, 2 * sizeof(float));
  
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
  
-  const lv_32fc_t* a = input;
-  const lv_32fc_t* b = taps;
+    const lv_32fc_t* a = input;
+    const lv_32fc_t* b = taps;
  
-  dotProdVal = _mm256_setzero_ps();
+    dotProdVal = _mm256_setzero_ps();
  
-  for(;number < quarterPoints; number++){
+    for (; number < quarterPoints; number++) {
  
-    x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
-    y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
+        x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
+        y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
  
-    yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
-    yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
+        yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
+        yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
  
-    tmp1 = x;
+        tmp1 = x;
  
-    x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
+        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
  
-    tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
+        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
  
-    z = _mm256_fmaddsub_ps(tmp1, yl,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+        z = _mm256_fmaddsub_ps(
+            tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
  
-    dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex multiplication results together
+        dotProdVal = _mm256_add_ps(dotProdVal,
+                                   z); // Add the complex multiplication results together
  
-    a += 4;
-    b += 4;
-  }
+        a += 4;
+        b += 4;
+    }
  
-  __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
  
-  _mm256_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
+    _mm256_store_ps((float*)dotProductVector,
+                    dotProdVal); // Store the results back into the dot product vector
  
-  dotProduct += ( dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + dotProductVector[3]);
+    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
+                   dotProductVector[3]);
  
-  for(i = num_points-isodd; i < num_points; i++) {
-    dotProduct += input[i] * taps[i];
-  }
+    for (i = num_points - isodd; i < num_points; i++) {
+        dotProduct += input[i] * taps[i];
+    }
  
-  *result = dotProduct;
+    *result = dotProduct;
  }
  
  #endif /*LV_HAVE_AVX && LV_HAVE_FMA*/
diff --git a/kernels/volk/volk_32fc_x2_multiply_32fc.h b/kernels/volk/volk_32fc_x2_multiply_32fc.h

index 6bf428b1ee701024a6a0350259e94522d0f72f57..6cb6907a81455adb9387d924ba8ee80e357995ba 100644 (file)
--- a/kernels/volk/volk_32fc_x2_multiply_32fc.h
+++ b/kernels/volk/volk_32fc_x2_multiply_32fc.h
@@ -29,8 +29,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32fc_x2_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points);
- * \endcode
+ * void volk_32fc_x2_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
+ * lv_32fc_t* bVector, unsigned int num_points); \endcode
   *
   * \b Inputs
   * \li aVector: The first input vector of complex floats.
@@ -70,55 +70,62 @@
  #ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H
  #define INCLUDED_volk_32fc_x2_multiply_32fc_u_H
  
+#include <float.h>
  #include <inttypes.h>
  #include <stdio.h>
  #include <volk/volk_complex.h>
-#include <float.h>
  
  #if LV_HAVE_AVX2 && LV_HAVE_FMA
  #include <immintrin.h>
  /*!
-  \brief Multiplies the two input complex vectors and stores their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be multiplied
-  \param bVector One of the vectors to be multiplied
-  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  \brief Multiplies the two input complex vectors and stores their results in the third
+  vector \param cVector The vector where the results will be stored \param aVector One of
+  the vectors to be multiplied \param bVector One of the vectors to be multiplied \param
+  num_points The number of complex values in aVector and bVector to be multiplied together
+  and stored into cVector
  */
-static inline void volk_32fc_x2_multiply_32fc_u_avx2_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+static inline void volk_32fc_x2_multiply_32fc_u_avx2_fma(lv_32fc_t* cVector,
+                                                         const lv_32fc_t* aVector,
+                                                         const lv_32fc_t* bVector,
+                                                         unsigned int num_points)
+{
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  lv_32fc_t* c = cVector;
-  const lv_32fc_t* a = aVector;
-  const lv_32fc_t* b = bVector;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
+    const lv_32fc_t* b = bVector;
  
-  for(;number < quarterPoints; number++){
+    for (; number < quarterPoints; number++) {
  
-    const __m256 x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
-    const __m256 y = _mm256_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+        const __m256 x =
+            _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+        const __m256 y =
+            _mm256_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
  
-    const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
-    const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
+        const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+        const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
  
-    const __m256 tmp2x = _mm256_permute_ps(x,0xB1); // Re-arrange x to be ai,ar,bi,br
+        const __m256 tmp2x = _mm256_permute_ps(x, 0xB1); // Re-arrange x to be ai,ar,bi,br
  
-    const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+        const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
  
-    const __m256 z = _mm256_fmaddsub_ps(x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+        const __m256 z = _mm256_fmaddsub_ps(
+            x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
  
-    _mm256_storeu_ps((float*)c,z); // Store the results back into the C container
+        _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
  
-    a += 4;
-    b += 4;
-    c += 4;
-  }
+        a += 4;
+        b += 4;
+        c += 4;
+    }
  
-  _mm256_zeroupper();
+    _mm256_zeroupper();
  
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    *c++ = (*a++) * (*b++);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *c++ = (*a++) * (*b++);
+    }
  }
  #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
  
@@ -127,34 +134,37 @@ static inline void volk_32fc_x2_multiply_32fc_u_avx2_fma(lv_32fc_t* cVector, con
  #include <immintrin.h>
  #include <volk/volk_avx_intrinsics.h>
  
-static inline void
-volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                                 const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector,
+                                                    const lv_32fc_t* aVector,
+                                                    const lv_32fc_t* bVector,
+                                                    unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  __m256 x, y, z;
-  lv_32fc_t* c = cVector;
-  const lv_32fc_t* a = aVector;
-  const lv_32fc_t* b = bVector;
-
-  for(; number < quarterPoints; number++){
-    x = _mm256_loadu_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
-    y = _mm256_loadu_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
-    z = _mm256_complexmul_ps(x, y);
-    _mm256_storeu_ps((float*) c, z); // Store the results back into the C container
-
-    a += 4;
-    b += 4;
-    c += 4;
-  }
-
-  number = quarterPoints * 4;
-
-  for(; number < num_points; number++){
-    *c++ = (*a++) * (*b++);
-  }
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    __m256 x, y, z;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
+    const lv_32fc_t* b = bVector;
+
+    for (; number < quarterPoints; number++) {
+        x = _mm256_loadu_ps(
+            (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+        y = _mm256_loadu_ps(
+            (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+        z = _mm256_complexmul_ps(x, y);
+        _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
+
+        a += 4;
+        b += 4;
+        c += 4;
+    }
+
+    number = quarterPoints * 4;
+
+    for (; number < num_points; number++) {
+        *c++ = (*a++) * (*b++);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
@@ -163,50 +173,52 @@ volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
  #include <pmmintrin.h>
  #include <volk/volk_sse3_intrinsics.h>
  
-static inline void
-volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                                  const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector,
+                                                     const lv_32fc_t* aVector,
+                                                     const lv_32fc_t* bVector,
+                                                     unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int halfPoints = num_points / 2;
-
-  __m128 x, y, z;
-  lv_32fc_t* c = cVector;
-  const lv_32fc_t* a = aVector;
-  const lv_32fc_t* b = bVector;
-
-  for(; number < halfPoints; number++){
-    x = _mm_loadu_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi
-    y = _mm_loadu_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di
-    z = _mm_complexmul_ps(x, y);
-    _mm_storeu_ps((float*) c, z); // Store the results back into the C container
-
-    a += 2;
-    b += 2;
-    c += 2;
-  }
-
-  if((num_points % 2) != 0){
-    *c = (*a) * (*b);
-  }
+    unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 2;
+
+    __m128 x, y, z;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
+    const lv_32fc_t* b = bVector;
+
+    for (; number < halfPoints; number++) {
+        x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+        y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+        z = _mm_complexmul_ps(x, y);
+        _mm_storeu_ps((float*)c, z); // Store the results back into the C container
+
+        a += 2;
+        b += 2;
+        c += 2;
+    }
+
+    if ((num_points % 2) != 0) {
+        *c = (*a) * (*b);
+    }
  }
  #endif /* LV_HAVE_SSE */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                                   const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector,
+                                                      const lv_32fc_t* aVector,
+                                                      const lv_32fc_t* bVector,
+                                                      unsigned int num_points)
  {
-  lv_32fc_t* cPtr = cVector;
-  const lv_32fc_t* aPtr = aVector;
-  const lv_32fc_t* bPtr=  bVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *cPtr++ = (*aPtr++) * (*bPtr++);
-  }
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    const lv_32fc_t* bPtr = bVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -215,55 +227,62 @@ volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
  #ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H
  #define INCLUDED_volk_32fc_x2_multiply_32fc_a_H
  
+#include <float.h>
  #include <inttypes.h>
  #include <stdio.h>
  #include <volk/volk_complex.h>
-#include <float.h>
  
  #if LV_HAVE_AVX2 && LV_HAVE_FMA
  #include <immintrin.h>
  /*!
-  \brief Multiplies the two input complex vectors and stores their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be multiplied
-  \param bVector One of the vectors to be multiplied
-  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  \brief Multiplies the two input complex vectors and stores their results in the third
+  vector \param cVector The vector where the results will be stored \param aVector One of
+  the vectors to be multiplied \param bVector One of the vectors to be multiplied \param
+  num_points The number of complex values in aVector and bVector to be multiplied together
+  and stored into cVector
  */
-static inline void volk_32fc_x2_multiply_32fc_a_avx2_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+static inline void volk_32fc_x2_multiply_32fc_a_avx2_fma(lv_32fc_t* cVector,
+                                                         const lv_32fc_t* aVector,
+                                                         const lv_32fc_t* bVector,
+                                                         unsigned int num_points)
+{
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  lv_32fc_t* c = cVector;
-  const lv_32fc_t* a = aVector;
-  const lv_32fc_t* b = bVector;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
+    const lv_32fc_t* b = bVector;
  
-  for(;number < quarterPoints; number++){
+    for (; number < quarterPoints; number++) {
  
-    const __m256 x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
-    const __m256 y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+        const __m256 x =
+            _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+        const __m256 y =
+            _mm256_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
  
-    const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
-    const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
+        const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+        const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
  
-    const __m256 tmp2x = _mm256_permute_ps(x,0xB1); // Re-arrange x to be ai,ar,bi,br
+        const __m256 tmp2x = _mm256_permute_ps(x, 0xB1); // Re-arrange x to be ai,ar,bi,br
  
-    const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+        const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
  
-    const __m256 z = _mm256_fmaddsub_ps(x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+        const __m256 z = _mm256_fmaddsub_ps(
+            x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
  
-    _mm256_store_ps((float*)c,z); // Store the results back into the C container
+        _mm256_store_ps((float*)c, z); // Store the results back into the C container
  
-    a += 4;
-    b += 4;
-    c += 4;
-  }
+        a += 4;
+        b += 4;
+        c += 4;
+    }
  
-  _mm256_zeroupper();
+    _mm256_zeroupper();
  
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    *c++ = (*a++) * (*b++);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *c++ = (*a++) * (*b++);
+    }
  }
  #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
  
@@ -272,34 +291,35 @@ static inline void volk_32fc_x2_multiply_32fc_a_avx2_fma(lv_32fc_t* cVector, con
  #include <immintrin.h>
  #include <volk/volk_avx_intrinsics.h>
  
-static inline void
-volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                                 const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector,
+                                                    const lv_32fc_t* aVector,
+                                                    const lv_32fc_t* bVector,
+                                                    unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  __m256 x, y, z;
-  lv_32fc_t* c = cVector;
-  const lv_32fc_t* a = aVector;
-  const lv_32fc_t* b = bVector;
-
-  for(; number < quarterPoints; number++){
-    x = _mm256_load_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
-    y = _mm256_load_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
-    z = _mm256_complexmul_ps(x, y);
-    _mm256_store_ps((float*) c, z); // Store the results back into the C container
-
-    a += 4;
-    b += 4;
-    c += 4;
-  }
-
-  number = quarterPoints * 4;
-
-  for(; number < num_points; number++){
-    *c++ = (*a++) * (*b++);
-  }
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    __m256 x, y, z;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
+    const lv_32fc_t* b = bVector;
+
+    for (; number < quarterPoints; number++) {
+        x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+        y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+        z = _mm256_complexmul_ps(x, y);
+        _mm256_store_ps((float*)c, z); // Store the results back into the C container
+
+        a += 4;
+        b += 4;
+        c += 4;
+    }
+
+    number = quarterPoints * 4;
+
+    for (; number < num_points; number++) {
+        *c++ = (*a++) * (*b++);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
@@ -307,50 +327,52 @@ volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
  #include <pmmintrin.h>
  #include <volk/volk_sse3_intrinsics.h>
  
-static inline void
-volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                                  const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector,
+                                                     const lv_32fc_t* aVector,
+                                                     const lv_32fc_t* bVector,
+                                                     unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int halfPoints = num_points / 2;
-
-  __m128 x, y, z;
-  lv_32fc_t* c = cVector;
-  const lv_32fc_t* a = aVector;
-  const lv_32fc_t* b = bVector;
-
-  for(; number < halfPoints; number++){
-    x = _mm_load_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi
-    y = _mm_load_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di
-    z = _mm_complexmul_ps(x, y);
-    _mm_store_ps((float*) c, z); // Store the results back into the C container
-
-    a += 2;
-    b += 2;
-    c += 2;
-  }
-
-  if((num_points % 2) != 0){
-    *c = (*a) * (*b);
-  }
+    unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 2;
+
+    __m128 x, y, z;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
+    const lv_32fc_t* b = bVector;
+
+    for (; number < halfPoints; number++) {
+        x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+        y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+        z = _mm_complexmul_ps(x, y);
+        _mm_store_ps((float*)c, z); // Store the results back into the C container
+
+        a += 2;
+        b += 2;
+        c += 2;
+    }
+
+    if ((num_points % 2) != 0) {
+        *c = (*a) * (*b);
+    }
  }
  #endif /* LV_HAVE_SSE */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                                     const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector,
+                                                        const lv_32fc_t* aVector,
+                                                        const lv_32fc_t* bVector,
+                                                        unsigned int num_points)
  {
-  lv_32fc_t* cPtr = cVector;
-  const lv_32fc_t* aPtr = aVector;
-  const lv_32fc_t* bPtr=  bVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *cPtr++ = (*aPtr++) * (*bPtr++);
-  }
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    const lv_32fc_t* bPtr = bVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -358,113 +380,118 @@ volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVecto
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void
-volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                                const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector,
+                                                   const lv_32fc_t* aVector,
+                                                   const lv_32fc_t* bVector,
+                                                   unsigned int num_points)
  {
-  lv_32fc_t *a_ptr = (lv_32fc_t*) aVector;
-  lv_32fc_t *b_ptr = (lv_32fc_t*) bVector;
-  unsigned int quarter_points = num_points / 4;
-  float32x4x2_t a_val, b_val, c_val;
-  float32x4x2_t tmp_real, tmp_imag;
-  unsigned int number = 0;
-
-  for(number = 0; number < quarter_points; ++number) {
-    a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
-    b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-    __VOLK_PREFETCH(a_ptr+4);
-    __VOLK_PREFETCH(b_ptr+4);
-
-    // multiply the real*real and imag*imag to get real result
-    // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
-    tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
-    // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
-    tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
-
-    // Multiply cross terms to get the imaginary result
-    // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
-    tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
-    // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
-    tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
-
-    // store the results
-    c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
-    c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
-    vst2q_f32((float*)cVector, c_val);
-
-    a_ptr += 4;
-    b_ptr += 4;
-    cVector += 4;
-  }
-
-  for(number = quarter_points*4; number < num_points; number++){
-    *cVector++ = (*a_ptr++) * (*b_ptr++);
-  }
+    lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
+    lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
+    unsigned int quarter_points = num_points / 4;
+    float32x4x2_t a_val, b_val, c_val;
+    float32x4x2_t tmp_real, tmp_imag;
+    unsigned int number = 0;
+
+    for (number = 0; number < quarter_points; ++number) {
+        a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+        b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+        __VOLK_PREFETCH(a_ptr + 4);
+        __VOLK_PREFETCH(b_ptr + 4);
+
+        // multiply the real*real and imag*imag to get real result
+        // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
+        tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+        // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
+        tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
+
+        // Multiply cross terms to get the imaginary result
+        // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
+        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
+        // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
+        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+
+        // store the results
+        c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
+        c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
+        vst2q_f32((float*)cVector, c_val);
+
+        a_ptr += 4;
+        b_ptr += 4;
+        cVector += 4;
+    }
+
+    for (number = quarter_points * 4; number < num_points; number++) {
+        *cVector++ = (*a_ptr++) * (*b_ptr++);
+    }
  }
  #endif /* LV_HAVE_NEON */
  
  
  #ifdef LV_HAVE_NEON
  
-static inline void
-volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                                         const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector,
+                                                            const lv_32fc_t* aVector,
+                                                            const lv_32fc_t* bVector,
+                                                            unsigned int num_points)
  {
-  lv_32fc_t *a_ptr = (lv_32fc_t*) aVector;
-  lv_32fc_t *b_ptr = (lv_32fc_t*) bVector;
-  unsigned int quarter_points = num_points / 4;
-  float32x4x2_t a_val, b_val;
-  float32x4x2_t tmp_imag;
-  unsigned int number = 0;
-
-  for(number = 0; number < quarter_points; ++number) {
-    a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
-    b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-    __VOLK_PREFETCH(a_ptr+4);
-    __VOLK_PREFETCH(b_ptr+4);
-
-    // do the first multiply
-    tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
-    tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
-
-    // use multiply accumulate/subtract to get result
-    tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
-    tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);
-
-    // store
-    vst2q_f32((float*)cVector, tmp_imag);
-    // increment pointers
-    a_ptr += 4;
-    b_ptr += 4;
-    cVector += 4;
-  }
-
-  for(number = quarter_points*4; number < num_points; number++){
-    *cVector++ = (*a_ptr++) * (*b_ptr++);
-  }
+    lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
+    lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
+    unsigned int quarter_points = num_points / 4;
+    float32x4x2_t a_val, b_val;
+    float32x4x2_t tmp_imag;
+    unsigned int number = 0;
+
+    for (number = 0; number < quarter_points; ++number) {
+        a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+        b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+        __VOLK_PREFETCH(a_ptr + 4);
+        __VOLK_PREFETCH(b_ptr + 4);
+
+        // do the first multiply
+        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+
+        // use multiply accumulate/subtract to get result
+        tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
+        tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);
+
+        // store
+        vst2q_f32((float*)cVector, tmp_imag);
+        // increment pointers
+        a_ptr += 4;
+        b_ptr += 4;
+        cVector += 4;
+    }
+
+    for (number = quarter_points * 4; number < num_points; number++) {
+        *cVector++ = (*a_ptr++) * (*b_ptr++);
+    }
  }
  #endif /* LV_HAVE_NEON */
  
  
  #ifdef LV_HAVE_NEONV7
  
-extern void
-volk_32fc_x2_multiply_32fc_a_neonasm(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                                   const lv_32fc_t* bVector, unsigned int num_points);
+extern void volk_32fc_x2_multiply_32fc_a_neonasm(lv_32fc_t* cVector,
+                                                 const lv_32fc_t* aVector,
+                                                 const lv_32fc_t* bVector,
+                                                 unsigned int num_points);
  #endif /* LV_HAVE_NEONV7 */
  
  
  #ifdef LV_HAVE_ORC
  
-extern void
-volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                                      const lv_32fc_t* bVector, unsigned int num_points);
+extern void volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector,
+                                                  const lv_32fc_t* aVector,
+                                                  const lv_32fc_t* bVector,
+                                                  unsigned int num_points);
  
-static inline void
-volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                                 const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector,
+                                                    const lv_32fc_t* aVector,
+                                                    const lv_32fc_t* bVector,
+                                                    unsigned int num_points)
  {
-  volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
+    volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
  }
  
  #endif /* LV_HAVE_ORC */
diff --git a/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h b/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h

index 1b1a8b3193aa0558ece14b05a0e3d2bc9a6e59ac..4f834c24574d602f87d7804c95fbd8e48c32dacc 100644 (file)
--- a/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h
+++ b/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h
@@ -30,8 +30,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32fc_x2_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points);
- * \endcode
+ * void volk_32fc_x2_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ * const lv_32fc_t* bVector, unsigned int num_points); \endcode
   *
   * \b Inputs
   * \li aVector: The first input vector of complex floats.
@@ -71,43 +71,46 @@
  #ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
  #define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
  
+#include <float.h>
  #include <inttypes.h>
  #include <stdio.h>
  #include <volk/volk_complex.h>
-#include <float.h>
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  #include <volk/volk_avx_intrinsics.h>
  
-static inline void
-volk_32fc_x2_multiply_conjugate_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                                           const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_multiply_conjugate_32fc_u_avx(lv_32fc_t* cVector,
+                                                              const lv_32fc_t* aVector,
+                                                              const lv_32fc_t* bVector,
+                                                              unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  __m256 x, y, z;
-  lv_32fc_t* c = cVector;
-  const lv_32fc_t* a = aVector;
-  const lv_32fc_t* b = bVector;
-
-  for(; number < quarterPoints; number++){
-    x = _mm256_loadu_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
-    y = _mm256_loadu_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
-    z = _mm256_complexconjugatemul_ps(x, y);
-    _mm256_storeu_ps((float*) c, z); // Store the results back into the C container
-
-    a += 4;
-    b += 4;
-    c += 4;
-  }
-
-  number = quarterPoints * 4;
-
-  for(; number < num_points; number++){
-    *c++ = (*a++) * lv_conj(*b++);
-  }
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    __m256 x, y, z;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
+    const lv_32fc_t* b = bVector;
+
+    for (; number < quarterPoints; number++) {
+        x = _mm256_loadu_ps(
+            (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+        y = _mm256_loadu_ps(
+            (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+        z = _mm256_complexconjugatemul_ps(x, y);
+        _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
+
+        a += 4;
+        b += 4;
+        c += 4;
+    }
+
+    number = quarterPoints * 4;
+
+    for (; number < num_points; number++) {
+        *c++ = (*a++) * lv_conj(*b++);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
@@ -116,96 +119,98 @@ volk_32fc_x2_multiply_conjugate_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t*
  #include <pmmintrin.h>
  #include <volk/volk_sse3_intrinsics.h>
  
-static inline void
-volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                                            const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector,
+                                                               const lv_32fc_t* aVector,
+                                                               const lv_32fc_t* bVector,
+                                                               unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int halfPoints = num_points / 2;
-
-  __m128 x, y, z;
-  lv_32fc_t* c = cVector;
-  const lv_32fc_t* a = aVector;
-  const lv_32fc_t* b = bVector;
-
-  for(; number < halfPoints; number++){
-    x = _mm_loadu_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi
-    y = _mm_loadu_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di
-    z = _mm_complexconjugatemul_ps(x, y);
-    _mm_storeu_ps((float*) c, z); // Store the results back into the C container
-
-    a += 2;
-    b += 2;
-    c += 2;
-  }
-
-  if((num_points % 2) != 0){
-    *c = (*a) * lv_conj(*b);
-  }
+    unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 2;
+
+    __m128 x, y, z;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
+    const lv_32fc_t* b = bVector;
+
+    for (; number < halfPoints; number++) {
+        x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+        y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+        z = _mm_complexconjugatemul_ps(x, y);
+        _mm_storeu_ps((float*)c, z); // Store the results back into the C container
+
+        a += 2;
+        b += 2;
+        c += 2;
+    }
+
+    if ((num_points % 2) != 0) {
+        *c = (*a) * lv_conj(*b);
+    }
  }
  #endif /* LV_HAVE_SSE */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32fc_x2_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                                             const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_multiply_conjugate_32fc_generic(lv_32fc_t* cVector,
+                                                                const lv_32fc_t* aVector,
+                                                                const lv_32fc_t* bVector,
+                                                                unsigned int num_points)
  {
-  lv_32fc_t* cPtr = cVector;
-  const lv_32fc_t* aPtr = aVector;
-  const lv_32fc_t* bPtr=  bVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
-  }
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    const lv_32fc_t* bPtr = bVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  
-
  #endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H */
  #ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
  #define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
  
+#include <float.h>
  #include <inttypes.h>
  #include <stdio.h>
  #include <volk/volk_complex.h>
-#include <float.h>
  
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  #include <volk/volk_avx_intrinsics.h>
  
-static inline void
-volk_32fc_x2_multiply_conjugate_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                                           const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_multiply_conjugate_32fc_a_avx(lv_32fc_t* cVector,
+                                                              const lv_32fc_t* aVector,
+                                                              const lv_32fc_t* bVector,
+                                                              unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  __m256 x, y, z;
-  lv_32fc_t* c = cVector;
-  const lv_32fc_t* a = aVector;
-  const lv_32fc_t* b = bVector;
-
-  for(; number < quarterPoints; number++){
-    x = _mm256_load_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
-    y = _mm256_load_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
-    z = _mm256_complexconjugatemul_ps(x, y);
-    _mm256_store_ps((float*) c, z); // Store the results back into the C container
-
-    a += 4;
-    b += 4;
-    c += 4;
-  }
-
-  number = quarterPoints * 4;
-
-  for(; number < num_points; number++){
-    *c++ = (*a++) * lv_conj(*b++);
-  }
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    __m256 x, y, z;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
+    const lv_32fc_t* b = bVector;
+
+    for (; number < quarterPoints; number++) {
+        x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+        y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+        z = _mm256_complexconjugatemul_ps(x, y);
+        _mm256_store_ps((float*)c, z); // Store the results back into the C container
+
+        a += 4;
+        b += 4;
+        c += 4;
+    }
+
+    number = quarterPoints * 4;
+
+    for (; number < num_points; number++) {
+        *c++ = (*a++) * lv_conj(*b++);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
@@ -214,32 +219,33 @@ volk_32fc_x2_multiply_conjugate_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t*
  #include <pmmintrin.h>
  #include <volk/volk_sse3_intrinsics.h>
  
-static inline void
-volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                                            const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector,
+                                                               const lv_32fc_t* aVector,
+                                                               const lv_32fc_t* bVector,
+                                                               unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int halfPoints = num_points / 2;
-
-  __m128 x, y, z;
-  lv_32fc_t* c = cVector;
-  const lv_32fc_t* a = aVector;
-  const lv_32fc_t* b = bVector;
-
-  for(; number < halfPoints; number++){
-    x = _mm_load_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi
-    y = _mm_load_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di
-    z = _mm_complexconjugatemul_ps(x, y);
-    _mm_store_ps((float*) c, z); // Store the results back into the C container
-
-    a += 2;
-    b += 2;
-    c += 2;
-  }
-
-  if((num_points % 2) != 0){
-    *c = (*a) * lv_conj(*b);
-  }
+    unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 2;
+
+    __m128 x, y, z;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
+    const lv_32fc_t* b = bVector;
+
+    for (; number < halfPoints; number++) {
+        x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+        y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+        z = _mm_complexconjugatemul_ps(x, y);
+        _mm_store_ps((float*)c, z); // Store the results back into the C container
+
+        a += 2;
+        b += 2;
+        c += 2;
+    }
+
+    if ((num_points % 2) != 0) {
+        *c = (*a) * lv_conj(*b);
+    }
  }
  #endif /* LV_HAVE_SSE */
  
@@ -247,49 +253,50 @@ volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t*
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void
-volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                                          const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector,
+                                                             const lv_32fc_t* aVector,
+                                                             const lv_32fc_t* bVector,
+                                                             unsigned int num_points)
  {
-  lv_32fc_t *a_ptr = (lv_32fc_t*) aVector;
-  lv_32fc_t *b_ptr = (lv_32fc_t*) bVector;
-  unsigned int quarter_points = num_points / 4;
-  float32x4x2_t a_val, b_val, c_val;
-  float32x4x2_t tmp_real, tmp_imag;
-  unsigned int number = 0;
-
-  for(number = 0; number < quarter_points; ++number) {
-    a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
-    b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-    b_val.val[1] = vnegq_f32(b_val.val[1]);
-    __VOLK_PREFETCH(a_ptr+4);
-    __VOLK_PREFETCH(b_ptr+4);
-
-    // multiply the real*real and imag*imag to get real result
-    // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
-    tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
-    // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
-    tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
-
-    // Multiply cross terms to get the imaginary result
+    lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
+    lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
+    unsigned int quarter_points = num_points / 4;
+    float32x4x2_t a_val, b_val, c_val;
+    float32x4x2_t tmp_real, tmp_imag;
+    unsigned int number = 0;
+
+    for (number = 0; number < quarter_points; ++number) {
+        a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+        b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+        b_val.val[1] = vnegq_f32(b_val.val[1]);
+        __VOLK_PREFETCH(a_ptr + 4);
+        __VOLK_PREFETCH(b_ptr + 4);
+
+        // multiply the real*real and imag*imag to get real result
+        // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
+        tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+        // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
+        tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
+
+        // Multiply cross terms to get the imaginary result
          // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
-    tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
-    // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
-    tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
-
-    // store the results
-    c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
-    c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
-    vst2q_f32((float*)cVector, c_val);
-
-    a_ptr += 4;
-    b_ptr += 4;
-    cVector += 4;
+        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
+        // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
+        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+
+        // store the results
+        c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
+        c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
+        vst2q_f32((float*)cVector, c_val);
+
+        a_ptr += 4;
+        b_ptr += 4;
+        cVector += 4;
      }
  
-  for(number = quarter_points*4; number < num_points; number++){
-    *cVector++ = (*a_ptr++) * conj(*b_ptr++);
-  }
+    for (number = quarter_points * 4; number < num_points; number++) {
+        *cVector++ = (*a_ptr++) * conj(*b_ptr++);
+    }
  }
  #endif /* LV_HAVE_NEON */
  
@@ -297,17 +304,19 @@ volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* a
  #ifdef LV_HAVE_GENERIC
  
  static inline void
-volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-                                               const lv_32fc_t* bVector, unsigned int num_points)
+volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector,
+                                               const lv_32fc_t* aVector,
+                                               const lv_32fc_t* bVector,
+                                               unsigned int num_points)
  {
-  lv_32fc_t* cPtr = cVector;
-  const lv_32fc_t* aPtr = aVector;
-  const lv_32fc_t* bPtr=  bVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
-  }
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    const lv_32fc_t* bPtr = bVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
diff --git a/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h b/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h

index 1c65f23e2e7ed8adc178d1091b60f2daab1b1093..1d10561f5a998c486f21c716432ed292eeeb17c2 100644 (file)
--- a/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
+++ b/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
@@ -30,8 +30,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32fc_x2_s32f_square_dist_scalar_mult_32f(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points)
- * \endcode
+ * void volk_32fc_x2_s32f_square_dist_scalar_mult_32f(float* target, lv_32fc_t* src0,
+ * lv_32fc_t* points, float scalar, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li src0: The complex input. Only the first point is used.
@@ -79,103 +79,107 @@
  #ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
  #define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
  
-#include<volk/volk_complex.h>
+#include <volk/volk_complex.h>
  
  
-static inline void
-calculate_scaled_distances(float* target, const lv_32fc_t symbol, const lv_32fc_t* points,
-                           const float scalar, const unsigned int num_points)
+static inline void calculate_scaled_distances(float* target,
+                                              const lv_32fc_t symbol,
+                                              const lv_32fc_t* points,
+                                              const float scalar,
+                                              const unsigned int num_points)
  {
-  lv_32fc_t diff;
-  for(unsigned int i = 0; i < num_points; ++i) {
-    /*
-     * Calculate: |y - x|^2 * SNR_lin
-     * Compare C++: *target++ = scalar * std::norm(symbol - *constellation++);
-     */
-    diff = symbol - *points++;
-    *target++ = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
-  }
+    lv_32fc_t diff;
+    for (unsigned int i = 0; i < num_points; ++i) {
+        /*
+         * Calculate: |y - x|^2 * SNR_lin
+         * Compare C++: *target++ = scalar * std::norm(symbol - *constellation++);
+         */
+        diff = symbol - *points++;
+        *target++ =
+            scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
+    }
  }
  
  
  #ifdef LV_HAVE_AVX2
-#include<immintrin.h>
-#include<volk/volk_avx2_intrinsics.h>
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
  
  static inline void
-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target, lv_32fc_t* src0, 
-                                                     lv_32fc_t* points, float scalar, 
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target,
+                                                     lv_32fc_t* src0,
+                                                     lv_32fc_t* points,
+                                                     float scalar,
                                                       unsigned int num_points)
  {
-  const unsigned int num_bytes = num_points*8;
-  __m128 xmm9, xmm10;
-  __m256 xmm4, xmm6;
-  __m256 xmm_points0, xmm_points1, xmm_result;
+    const unsigned int num_bytes = num_points * 8;
+    __m128 xmm9, xmm10;
+    __m256 xmm4, xmm6;
+    __m256 xmm_points0, xmm_points1, xmm_result;
  
-  const unsigned int bound = num_bytes >> 6;
-  
-  // load complex value into all parts of the register.
-  const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
-  const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);
-  
-  // Load scalar into all 8 parts of the register
-  const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
-  const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);
+    const unsigned int bound = num_bytes >> 6;
  
-  // Set permutation constant
-  const __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
-  
-  for(unsigned int i = 0; i < bound; ++i) {
-    xmm_points0 = _mm256_load_ps((float*)points);
-    xmm_points1 = _mm256_load_ps((float*)(points + 4));
-    points += 8;
-    __VOLK_PREFETCH(points);
+    // load complex value into all parts of the register.
+    const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
+    const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);
  
-    xmm_result = _mm256_scaled_norm_dist_ps_avx2(xmm_symbol, xmm_symbol,
-                                                 xmm_points0, xmm_points1, 
-                                                 xmm_scalar);
-    
-    _mm256_store_ps(target, xmm_result);
-    target += 8;
-  }
+    // Load scalar into all 8 parts of the register
+    const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
+    const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);
  
-  if (num_bytes >> 5 & 1) {
-    xmm_points0 = _mm256_load_ps((float*)points);
+    // Set permutation constant
+    const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
  
-    xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
+    for (unsigned int i = 0; i < bound; ++i) {
+        xmm_points0 = _mm256_load_ps((float*)points);
+        xmm_points1 = _mm256_load_ps((float*)(points + 4));
+        points += 8;
+        __VOLK_PREFETCH(points);
  
-    points += 4;
+        xmm_result = _mm256_scaled_norm_dist_ps_avx2(
+            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
  
-    xmm6 = _mm256_mul_ps(xmm4, xmm4);
+        _mm256_store_ps(target, xmm_result);
+        target += 8;
+    }
  
-    xmm4 = _mm256_hadd_ps(xmm6, xmm6);
-    xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
+    if (num_bytes >> 5 & 1) {
+        xmm_points0 = _mm256_load_ps((float*)points);
  
-    xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);
+        xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
  
-    xmm9 = _mm256_extractf128_ps(xmm_result, 1);
-    _mm_store_ps(target,xmm9);
-    target += 4;
-  }
+        points += 4;
  
-  if (num_bytes >> 4 & 1) {
-    xmm9 = _mm_load_ps((float*)points);
+        xmm6 = _mm256_mul_ps(xmm4, xmm4);
  
-    xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);
+        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
+        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
  
-    points += 2;
+        xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);
  
-    xmm9 = _mm_mul_ps(xmm10, xmm10);
+        xmm9 = _mm256_extractf128_ps(xmm_result, 1);
+        _mm_store_ps(target, xmm9);
+        target += 4;
+    }
  
-    xmm10 = _mm_hadd_ps(xmm9, xmm9);
+    if (num_bytes >> 4 & 1) {
+        xmm9 = _mm_load_ps((float*)points);
  
-    xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);
+        xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);
  
-    _mm_storeh_pi((__m64*)target, xmm10);
-    target += 2;
-  }
+        points += 2;
  
-  calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
+        xmm9 = _mm_mul_ps(xmm10, xmm10);
+
+        xmm10 = _mm_hadd_ps(xmm9, xmm9);
+
+        xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);
+
+        _mm_storeh_pi((__m64*)target, xmm10);
+        target += 2;
+    }
+
+    calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
  }
  
  #endif /*LV_HAVE_AVX2*/
@@ -186,131 +190,139 @@ volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target, lv_32fc_t* s
  #include <volk/volk_avx_intrinsics.h>
  
  static inline void
-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(float *target, lv_32fc_t *src0, 
-                                                    lv_32fc_t *points, float scalar, 
-                                                    unsigned int num_points) {
-  const int eightsPoints = num_points / 8;
-  const int remainder = num_points - 8 * eightsPoints;
-  
-  __m256 xmm_points0, xmm_points1, xmm_result;
-
-  // load complex value into all parts of the register.
-  const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
-  
-  // Load scalar into all 8 parts of the register
-  const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
-  
-  for(int i = 0; i < eightsPoints; ++i){
-    xmm_points0 = _mm256_load_ps((float*)points);
-    xmm_points1 = _mm256_load_ps((float*)(points + 4));
-    points += 8;
-    
-    xmm_result = _mm256_scaled_norm_dist_ps(xmm_symbol, xmm_symbol, xmm_points0, 
-                                            xmm_points1, xmm_scalar);
-    
-    _mm256_store_ps(target, xmm_result);
-    target += 8;
-  }
-  
-  const lv_32fc_t symbol = *src0;
-  calculate_scaled_distances(target, symbol, points, scalar, remainder);
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(float* target,
+                                                    lv_32fc_t* src0,
+                                                    lv_32fc_t* points,
+                                                    float scalar,
+                                                    unsigned int num_points)
+{
+    const int eightsPoints = num_points / 8;
+    const int remainder = num_points - 8 * eightsPoints;
+
+    __m256 xmm_points0, xmm_points1, xmm_result;
+
+    // load complex value into all parts of the register.
+    const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
+
+    // Load scalar into all 8 parts of the register
+    const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
+
+    for (int i = 0; i < eightsPoints; ++i) {
+        xmm_points0 = _mm256_load_ps((float*)points);
+        xmm_points1 = _mm256_load_ps((float*)(points + 4));
+        points += 8;
+
+        xmm_result = _mm256_scaled_norm_dist_ps(
+            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
+
+        _mm256_store_ps(target, xmm_result);
+        target += 8;
+    }
+
+    const lv_32fc_t symbol = *src0;
+    calculate_scaled_distances(target, symbol, points, scalar, remainder);
  }
  
  #endif /* LV_HAVE_AVX */
  
  
  #ifdef LV_HAVE_SSE3
-#include<pmmintrin.h>
-#include<volk/volk_sse3_intrinsics.h>
+#include <pmmintrin.h>
+#include <volk/volk_sse3_intrinsics.h>
  
  static inline void
-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, lv_32fc_t* src0, 
-                                                     lv_32fc_t* points, float scalar, 
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target,
+                                                     lv_32fc_t* src0,
+                                                     lv_32fc_t* points,
+                                                     float scalar,
                                                       unsigned int num_points)
  {
-  __m128 xmm_points0, xmm_points1, xmm_result;
-
-  /*
-   * First do 4 values in every loop iteration.
-   * There may be up to 3 values left.
-   * leftovers0 indicates if at least 2 more are available for SSE execution.
-   * leftovers1 indicates if there is a single element left.
-   */
-  const int quarterPoints = num_points / 4;
-  const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
-  const int leftovers1 = num_points % 2;
-
-  // load complex value into both parts of the register.
-  const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
-  
-  // Load scalar into all 4 parts of the register
-  const __m128 xmm_scalar = _mm_load1_ps(&scalar);
-
-  for(int i = 0; i < quarterPoints; ++i) {
-    xmm_points0 = _mm_load_ps((float*)points);
-    xmm_points1 = _mm_load_ps((float*)(points + 2));
-    points += 4;
-    __VOLK_PREFETCH(points);
-    // calculate distances
-    xmm_result = _mm_scaled_norm_dist_ps_sse3(xmm_symbol, xmm_symbol, xmm_points0, 
-                                              xmm_points1, xmm_scalar);
-
-    _mm_store_ps(target, xmm_result);
-    target += 4;
-  }
-
-  for(int i = 0; i < leftovers0; ++i) {
-    xmm_points0 = _mm_load_ps((float*)points);
-    points += 2;
-    
-    xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
-    xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
-    xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
-    xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);
-
-    _mm_storeh_pi((__m64*)target, xmm_result);
-    target += 2;
-  }
-
-  calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
+    __m128 xmm_points0, xmm_points1, xmm_result;
+
+    /*
+     * First do 4 values in every loop iteration.
+     * There may be up to 3 values left.
+     * leftovers0 indicates if at least 2 more are available for SSE execution.
+     * leftovers1 indicates if there is a single element left.
+     */
+    const int quarterPoints = num_points / 4;
+    const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
+    const int leftovers1 = num_points % 2;
+
+    // load complex value into both parts of the register.
+    const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
+
+    // Load scalar into all 4 parts of the register
+    const __m128 xmm_scalar = _mm_load1_ps(&scalar);
+
+    for (int i = 0; i < quarterPoints; ++i) {
+        xmm_points0 = _mm_load_ps((float*)points);
+        xmm_points1 = _mm_load_ps((float*)(points + 2));
+        points += 4;
+        __VOLK_PREFETCH(points);
+        // calculate distances
+        xmm_result = _mm_scaled_norm_dist_ps_sse3(
+            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
+
+        _mm_store_ps(target, xmm_result);
+        target += 4;
+    }
+
+    for (int i = 0; i < leftovers0; ++i) {
+        xmm_points0 = _mm_load_ps((float*)points);
+        points += 2;
+
+        xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
+        xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
+        xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
+        xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);
+
+        _mm_storeh_pi((__m64*)target, xmm_result);
+        target += 2;
+    }
+
+    calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
  }
  
  #endif /*LV_HAVE_SSE3*/
  
  #ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
  #include <volk/volk_sse_intrinsics.h>
+#include <xmmintrin.h>
  static inline void
-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse(float* target, lv_32fc_t* src0,
-                                                    lv_32fc_t* points, float scalar,
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse(float* target,
+                                                    lv_32fc_t* src0,
+                                                    lv_32fc_t* points,
+                                                    float scalar,
                                                      unsigned int num_points)
  {
-  const __m128 xmm_scalar = _mm_set1_ps(scalar);
-  const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
-
-  for (unsigned i = 0; i < num_points / 4; ++i) {
-    __m128 xmm_points0 = _mm_load_ps((float *) points);
-    __m128 xmm_points1 = _mm_load_ps((float *) (points + 2));
-    points += 4;
-    __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(xmm_symbol, xmm_symbol,
-                                                    xmm_points0, xmm_points1,
-                                                    xmm_scalar);
-    _mm_store_ps((float *) target, xmm_result);
-    target += 4;
-  }
-
-  calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
+    const __m128 xmm_scalar = _mm_set1_ps(scalar);
+    const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
+
+    for (unsigned i = 0; i < num_points / 4; ++i) {
+        __m128 xmm_points0 = _mm_load_ps((float*)points);
+        __m128 xmm_points1 = _mm_load_ps((float*)(points + 2));
+        points += 4;
+        __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(
+            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
+        _mm_store_ps((float*)target, xmm_result);
+        target += 4;
+    }
+
+    calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
  }
  #endif // LV_HAVE_SSE
  
  #ifdef LV_HAVE_GENERIC
  static inline void
-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target, lv_32fc_t* src0, 
-                                                      lv_32fc_t* points, float scalar, 
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target,
+                                                      lv_32fc_t* src0,
+                                                      lv_32fc_t* points,
+                                                      float scalar,
                                                        unsigned int num_points)
  {
-  const lv_32fc_t symbol = *src0;
-  calculate_scaled_distances(target, symbol, points, scalar, num_points);
+    const lv_32fc_t symbol = *src0;
+    calculate_scaled_distances(target, symbol, points, scalar, num_points);
  }
  
  #endif /*LV_HAVE_GENERIC*/
@@ -321,87 +333,88 @@ volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target, lv_32fc_t*
  #ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
  #define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
  
-#include<volk/volk_complex.h>
+#include <volk/volk_complex.h>
  
  
  #ifdef LV_HAVE_AVX2
-#include<immintrin.h>
+#include <immintrin.h>
  #include <volk/volk_avx2_intrinsics.h>
  
  static inline void
-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target, lv_32fc_t* src0, 
-                                                     lv_32fc_t* points, float scalar, 
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target,
+                                                     lv_32fc_t* src0,
+                                                     lv_32fc_t* points,
+                                                     float scalar,
                                                       unsigned int num_points)
  {
-  const unsigned int num_bytes = num_points*8;
-  __m128 xmm9, xmm10;
-  __m256 xmm4, xmm6;
-  __m256 xmm_points0, xmm_points1, xmm_result;
+    const unsigned int num_bytes = num_points * 8;
+    __m128 xmm9, xmm10;
+    __m256 xmm4, xmm6;
+    __m256 xmm_points0, xmm_points1, xmm_result;
+
+    const unsigned int bound = num_bytes >> 6;
+
+    // load complex value into all parts of the register.
+    const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
+    const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);
+
+    // Load scalar into all 8 parts of the register
+    const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
+    const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);
  
-  const unsigned int bound = num_bytes >> 6;
-  
-  // load complex value into all parts of the register.
-  const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
-  const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);
-  
-  // Load scalar into all 8 parts of the register
-  const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
-  const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);
+    // Set permutation constant
+    const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
  
-  // Set permutation constant
-  const __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
-  
-  for(unsigned int i = 0; i < bound; ++i) {
-    xmm_points0 = _mm256_loadu_ps((float*)points);
-    xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
-    points += 8;
-    __VOLK_PREFETCH(points);
+    for (unsigned int i = 0; i < bound; ++i) {
+        xmm_points0 = _mm256_loadu_ps((float*)points);
+        xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
+        points += 8;
+        __VOLK_PREFETCH(points);
  
-    xmm_result = _mm256_scaled_norm_dist_ps_avx2(xmm_symbol, xmm_symbol,
-                                                 xmm_points0, xmm_points1, 
-                                                 xmm_scalar);
-    
-    _mm256_storeu_ps(target, xmm_result);
-    target += 8;
-  }
+        xmm_result = _mm256_scaled_norm_dist_ps_avx2(
+            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
  
-  if (num_bytes >> 5 & 1) {
-    xmm_points0 = _mm256_loadu_ps((float*)points);
+        _mm256_storeu_ps(target, xmm_result);
+        target += 8;
+    }
  
-    xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
+    if (num_bytes >> 5 & 1) {
+        xmm_points0 = _mm256_loadu_ps((float*)points);
  
-    points += 4;
+        xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
  
-    xmm6 = _mm256_mul_ps(xmm4, xmm4);
+        points += 4;
  
-    xmm4 = _mm256_hadd_ps(xmm6, xmm6);
-    xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
+        xmm6 = _mm256_mul_ps(xmm4, xmm4);
  
-    xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);
+        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
+        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
  
-    xmm9 = _mm256_extractf128_ps(xmm_result, 1);
-    _mm_storeu_ps(target,xmm9);
-    target += 4;
-  }
+        xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);
  
-  if (num_bytes >> 4 & 1) {
-    xmm9 = _mm_loadu_ps((float*)points);
+        xmm9 = _mm256_extractf128_ps(xmm_result, 1);
+        _mm_storeu_ps(target, xmm9);
+        target += 4;
+    }
  
-    xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);
+    if (num_bytes >> 4 & 1) {
+        xmm9 = _mm_loadu_ps((float*)points);
  
-    points += 2;
+        xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);
  
-    xmm9 = _mm_mul_ps(xmm10, xmm10);
+        points += 2;
  
-    xmm10 = _mm_hadd_ps(xmm9, xmm9);
+        xmm9 = _mm_mul_ps(xmm10, xmm10);
  
-    xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);
+        xmm10 = _mm_hadd_ps(xmm9, xmm9);
  
-    _mm_storeh_pi((__m64*)target, xmm10);
-    target += 2;
-  }
+        xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);
  
-  calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
+        _mm_storeh_pi((__m64*)target, xmm10);
+        target += 2;
+    }
+
+    calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
  }
  
  #endif /*LV_HAVE_AVX2*/
@@ -412,120 +425,126 @@ volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target, lv_32fc_t* s
  #include <volk/volk_avx_intrinsics.h>
  
  static inline void
-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx(float *target, lv_32fc_t *src0, 
-                                                    lv_32fc_t *points, float scalar, 
-                                                    unsigned int num_points) {
-  const int eightsPoints = num_points / 8;
-  const int remainder = num_points - 8 * eightsPoints;
-  
-  __m256 xmm_points0, xmm_points1, xmm_result;
-
-  // load complex value into all parts of the register.
-  const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
-  
-  // Load scalar into all 8 parts of the register
-  const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
-  
-  for(int i = 0; i < eightsPoints; ++i){
-    xmm_points0 = _mm256_loadu_ps((float*)points);
-    xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
-    points += 8;
-    
-    xmm_result = _mm256_scaled_norm_dist_ps(xmm_symbol, xmm_symbol, xmm_points0, 
-                                            xmm_points1, xmm_scalar);
-    
-    _mm256_storeu_ps(target, xmm_result);
-    target += 8;
-  }
-  
-  const lv_32fc_t symbol = *src0;
-  calculate_scaled_distances(target, symbol, points, scalar, remainder);
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx(float* target,
+                                                    lv_32fc_t* src0,
+                                                    lv_32fc_t* points,
+                                                    float scalar,
+                                                    unsigned int num_points)
+{
+    const int eightsPoints = num_points / 8;
+    const int remainder = num_points - 8 * eightsPoints;
+
+    __m256 xmm_points0, xmm_points1, xmm_result;
+
+    // load complex value into all parts of the register.
+    const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
+
+    // Load scalar into all 8 parts of the register
+    const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
+
+    for (int i = 0; i < eightsPoints; ++i) {
+        xmm_points0 = _mm256_loadu_ps((float*)points);
+        xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
+        points += 8;
+
+        xmm_result = _mm256_scaled_norm_dist_ps(
+            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
+
+        _mm256_storeu_ps(target, xmm_result);
+        target += 8;
+    }
+
+    const lv_32fc_t symbol = *src0;
+    calculate_scaled_distances(target, symbol, points, scalar, remainder);
  }
  
  #endif /* LV_HAVE_AVX */
  
  
  #ifdef LV_HAVE_SSE3
-#include<pmmintrin.h>
-#include<volk/volk_sse3_intrinsics.h>
+#include <pmmintrin.h>
+#include <volk/volk_sse3_intrinsics.h>
  
  static inline void
-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse3(float* target, lv_32fc_t* src0, 
-                                                     lv_32fc_t* points, float scalar, 
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse3(float* target,
+                                                     lv_32fc_t* src0,
+                                                     lv_32fc_t* points,
+                                                     float scalar,
                                                       unsigned int num_points)
  {
-  __m128 xmm_points0, xmm_points1, xmm_result;
-
-  /*
-   * First do 4 values in every loop iteration.
-   * There may be up to 3 values left.
-   * leftovers0 indicates if at least 2 more are available for SSE execution.
-   * leftovers1 indicates if there is a single element left.
-   */
-  const int quarterPoints = num_points / 4;
-  const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
-  const int leftovers1 = num_points % 2;
-
-  // load complex value into both parts of the register.
-  const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
-  
-  // Load scalar into all 4 parts of the register
-  const __m128 xmm_scalar = _mm_load1_ps(&scalar);
-  
-  for(int i = 0; i < quarterPoints; ++i) {
-    xmm_points0 = _mm_loadu_ps((float*)points);
-    xmm_points1 = _mm_loadu_ps((float*)(points + 2));
-    points += 4;
-    __VOLK_PREFETCH(points);
-    // calculate distances
-    xmm_result = _mm_scaled_norm_dist_ps_sse3(xmm_symbol, xmm_symbol, xmm_points0, 
-                                              xmm_points1, xmm_scalar);
-    
-    _mm_storeu_ps(target, xmm_result);
-    target += 4;
-  }
-
-  for(int i = 0; i < leftovers0; ++i) {
-    xmm_points0 = _mm_loadu_ps((float*)points);
-    points += 2;
-    
-    xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
-    xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
-    xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
-    xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);
-
-    _mm_storeh_pi((__m64*)target, xmm_result);
-    target += 2;
-  }
-
-  calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
+    __m128 xmm_points0, xmm_points1, xmm_result;
+
+    /*
+     * First do 4 values in every loop iteration.
+     * There may be up to 3 values left.
+     * leftovers0 indicates if at least 2 more are available for SSE execution.
+     * leftovers1 indicates if there is a single element left.
+     */
+    const int quarterPoints = num_points / 4;
+    const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
+    const int leftovers1 = num_points % 2;
+
+    // load complex value into both parts of the register.
+    const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
+
+    // Load scalar into all 4 parts of the register
+    const __m128 xmm_scalar = _mm_load1_ps(&scalar);
+
+    for (int i = 0; i < quarterPoints; ++i) {
+        xmm_points0 = _mm_loadu_ps((float*)points);
+        xmm_points1 = _mm_loadu_ps((float*)(points + 2));
+        points += 4;
+        __VOLK_PREFETCH(points);
+        // calculate distances
+        xmm_result = _mm_scaled_norm_dist_ps_sse3(
+            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
+
+        _mm_storeu_ps(target, xmm_result);
+        target += 4;
+    }
+
+    for (int i = 0; i < leftovers0; ++i) {
+        xmm_points0 = _mm_loadu_ps((float*)points);
+        points += 2;
+
+        xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
+        xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
+        xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
+        xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);
+
+        _mm_storeh_pi((__m64*)target, xmm_result);
+        target += 2;
+    }
+
+    calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
  }
  
  #endif /*LV_HAVE_SSE3*/
  
  #ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
  #include <volk/volk_sse_intrinsics.h>
+#include <xmmintrin.h>
  static inline void
-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(float* target, lv_32fc_t* src0,
-                                                    lv_32fc_t* points, float scalar,
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(float* target,
+                                                    lv_32fc_t* src0,
+                                                    lv_32fc_t* points,
+                                                    float scalar,
                                                      unsigned int num_points)
  {
-  const __m128 xmm_scalar = _mm_set1_ps(scalar);
-  const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
-
-  for (unsigned i = 0; i < num_points / 4; ++i) {
-    __m128 xmm_points0 = _mm_loadu_ps((float *) points);
-    __m128 xmm_points1 = _mm_loadu_ps((float *) (points + 2));
-    points += 4;
-    __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(xmm_symbol, xmm_symbol,
-                                                    xmm_points0, xmm_points1,
-                                                    xmm_scalar);
-    _mm_storeu_ps((float *) target, xmm_result);
-    target += 4;
-  }
-
-  calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
+    const __m128 xmm_scalar = _mm_set1_ps(scalar);
+    const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
+
+    for (unsigned i = 0; i < num_points / 4; ++i) {
+        __m128 xmm_points0 = _mm_loadu_ps((float*)points);
+        __m128 xmm_points1 = _mm_loadu_ps((float*)(points + 2));
+        points += 4;
+        __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(
+            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
+        _mm_storeu_ps((float*)target, xmm_result);
+        target += 4;
+    }
+
+    calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
  }
  #endif // LV_HAVE_SSE
  
diff --git a/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h b/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h

index 6c7f4d387eaee3da4f9fde33eeda6450e0c122e0..1fb9b6889b92aaa6488fab946065089b13fb9525 100644 (file)
--- a/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h
+++ b/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h
@@ -32,14 +32,16 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points);
- * \endcode
+ * void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc(lv_32fc_t* cVector, const
+ * lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int
+ * num_points); \endcode
   *
   * \b Inputs
   * \li aVector: The input vector to be added.
   * \li bVector: The input vector to be conjugate and multiplied.
   * \li scalar: The complex scalar to multiply against conjugated bVector.
- * \li num_points: The number of complex values in aVector and bVector to be conjugate, multiplied and stored into cVector.
+ * \li num_points: The number of complex values in aVector and bVector to be conjugate,
+ * multiplied and stored into cVector.
   *
   * \b Outputs
   * \li cVector: The vector where the results will be stored.
@@ -84,15 +86,21 @@
  #ifndef INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H
  #define INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H
  
+#include <float.h>
  #include <inttypes.h>
  #include <stdio.h>
  #include <volk/volk_complex.h>
-#include <float.h>
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points){
+static inline void
+volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32fc_t* cVector,
+                                                       const lv_32fc_t* aVector,
+                                                       const lv_32fc_t* bVector,
+                                                       const lv_32fc_t scalar,
+                                                       unsigned int num_points)
+{
      const lv_32fc_t* aPtr = aVector;
      const lv_32fc_t* bPtr = bVector;
      lv_32fc_t* cPtr = cVector;
@@ -123,14 +131,20 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32f
  #include <immintrin.h>
  #include <volk/volk_avx_intrinsics.h>
  
-static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points) {
+static inline void
+volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t* cVector,
+                                                     const lv_32fc_t* aVector,
+                                                     const lv_32fc_t* bVector,
+                                                     const lv_32fc_t scalar,
+                                                     unsigned int num_points)
+{
      unsigned int number = 0;
      unsigned int i = 0;
      const unsigned int quarterPoints = num_points / 4;
      unsigned int isodd = num_points & 3;
  
      __m256 x, y, s, z;
-    lv_32fc_t v_scalar[4] = {scalar, scalar, scalar, scalar};
+    lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar };
  
      const lv_32fc_t* a = aVector;
      const lv_32fc_t* b = bVector;
@@ -139,19 +153,19 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_
      // Set up constant scalar vector
      s = _mm256_loadu_ps((float*)v_scalar);
  
-    for(;number < quarterPoints; number++) {
+    for (; number < quarterPoints; number++) {
          x = _mm256_loadu_ps((float*)b);
          y = _mm256_loadu_ps((float*)a);
          z = _mm256_complexconjugatemul_ps(s, x);
          z = _mm256_add_ps(y, z);
-        _mm256_storeu_ps((float*)c,z);
+        _mm256_storeu_ps((float*)c, z);
  
          a += 4;
          b += 4;
          c += 4;
      }
  
-    for(i = num_points-isodd; i < num_points; i++) {
+    for (i = num_points - isodd; i < num_points; i++) {
          *c++ = (*a++) + lv_conj(*b++) * scalar;
      }
  }
@@ -162,12 +176,18 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_
  #include <pmmintrin.h>
  #include <volk/volk_sse3_intrinsics.h>
  
-static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points) {
+static inline void
+volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t* cVector,
+                                                      const lv_32fc_t* aVector,
+                                                      const lv_32fc_t* bVector,
+                                                      const lv_32fc_t scalar,
+                                                      unsigned int num_points)
+{
      unsigned int number = 0;
      const unsigned int halfPoints = num_points / 2;
  
      __m128 x, y, s, z;
-    lv_32fc_t v_scalar[2] = {scalar, scalar};
+    lv_32fc_t v_scalar[2] = { scalar, scalar };
  
      const lv_32fc_t* a = aVector;
      const lv_32fc_t* b = bVector;
@@ -176,19 +196,19 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc
      // Set up constant scalar vector
      s = _mm_loadu_ps((float*)v_scalar);
  
-    for(;number < halfPoints; number++){
+    for (; number < halfPoints; number++) {
          x = _mm_loadu_ps((float*)b);
          y = _mm_loadu_ps((float*)a);
          z = _mm_complexconjugatemul_ps(s, x);
          z = _mm_add_ps(y, z);
-        _mm_storeu_ps((float*)c,z);
+        _mm_storeu_ps((float*)c, z);
  
          a += 2;
          b += 2;
          c += 2;
      }
  
-    if((num_points % 2) != 0) {
+    if ((num_points % 2) != 0) {
          *c = *a + lv_conj(*b) * scalar;
      }
  }
@@ -199,14 +219,20 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc
  #include <immintrin.h>
  #include <volk/volk_avx_intrinsics.h>
  
-static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points) {
+static inline void
+volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t* cVector,
+                                                     const lv_32fc_t* aVector,
+                                                     const lv_32fc_t* bVector,
+                                                     const lv_32fc_t scalar,
+                                                     unsigned int num_points)
+{
      unsigned int number = 0;
      unsigned int i = 0;
      const unsigned int quarterPoints = num_points / 4;
      unsigned int isodd = num_points & 3;
  
      __m256 x, y, s, z;
-    lv_32fc_t v_scalar[4] = {scalar, scalar, scalar, scalar};
+    lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar };
  
      const lv_32fc_t* a = aVector;
      const lv_32fc_t* b = bVector;
@@ -215,19 +241,19 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_
      // Set up constant scalar vector
      s = _mm256_load_ps((float*)v_scalar);
  
-    for(;number < quarterPoints; number++) {
+    for (; number < quarterPoints; number++) {
          x = _mm256_load_ps((float*)b);
          y = _mm256_load_ps((float*)a);
          z = _mm256_complexconjugatemul_ps(s, x);
          z = _mm256_add_ps(y, z);
-        _mm256_store_ps((float*)c,z);
+        _mm256_store_ps((float*)c, z);
  
          a += 4;
          b += 4;
          c += 4;
      }
  
-    for(i = num_points-isodd; i < num_points; i++) {
+    for (i = num_points - isodd; i < num_points; i++) {
          *c++ = (*a++) + lv_conj(*b++) * scalar;
      }
  }
@@ -238,12 +264,18 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_
  #include <pmmintrin.h>
  #include <volk/volk_sse3_intrinsics.h>
  
-static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points) {
+static inline void
+volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t* cVector,
+                                                      const lv_32fc_t* aVector,
+                                                      const lv_32fc_t* bVector,
+                                                      const lv_32fc_t scalar,
+                                                      unsigned int num_points)
+{
      unsigned int number = 0;
      const unsigned int halfPoints = num_points / 2;
  
      __m128 x, y, s, z;
-    lv_32fc_t v_scalar[2] = {scalar, scalar};
+    lv_32fc_t v_scalar[2] = { scalar, scalar };
  
      const lv_32fc_t* a = aVector;
      const lv_32fc_t* b = bVector;
@@ -252,19 +284,19 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc
      // Set up constant scalar vector
      s = _mm_load_ps((float*)v_scalar);
  
-    for(;number < halfPoints; number++){
+    for (; number < halfPoints; number++) {
          x = _mm_load_ps((float*)b);
          y = _mm_load_ps((float*)a);
          z = _mm_complexconjugatemul_ps(s, x);
          z = _mm_add_ps(y, z);
-        _mm_store_ps((float*)c,z);
+        _mm_store_ps((float*)c, z);
  
          a += 2;
          b += 2;
          c += 2;
      }
  
-    if((num_points % 2) != 0) {
+    if ((num_points % 2) != 0) {
          *c = *a + lv_conj(*b) * scalar;
      }
  }
@@ -272,9 +304,15 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc
  
  
  #ifdef LV_HAVE_NEON
-#include  <arm_neon.h>
-
-static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points){
+#include <arm_neon.h>
+
+static inline void
+volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t* cVector,
+                                                    const lv_32fc_t* aVector,
+                                                    const lv_32fc_t* bVector,
+                                                    const lv_32fc_t scalar,
+                                                    unsigned int num_points)
+{
      const lv_32fc_t* bPtr = bVector;
      const lv_32fc_t* aPtr = aVector;
      lv_32fc_t* cPtr = cVector;
@@ -287,7 +325,7 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t
      scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar);
      scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1);
  
-    for(number = 0; number < quarter_points; ++number) {
+    for (number = 0; number < quarter_points; ++number) {
          a_val = vld2q_f32((float*)aPtr);
          b_val = vld2q_f32((float*)bPtr);
          b_val.val[1] = vnegq_f32(b_val.val[1]);
@@ -310,7 +348,7 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t
          cPtr += 4;
      }
  
-    for(number = quarter_points*4; number < num_points; number++){
+    for (number = quarter_points * 4; number < num_points; number++) {
          *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
      }
  }
diff --git a/kernels/volk/volk_32fc_x2_square_dist_32f.h b/kernels/volk/volk_32fc_x2_square_dist_32f.h

index d6c6dffc8f342235bf5e197e42e42810655273bd..75f40720ec3568af13cabecb665cc1cd573e3772 100644 (file)
--- a/kernels/volk/volk_32fc_x2_square_dist_32f.h
+++ b/kernels/volk/volk_32fc_x2_square_dist_32f.h
@@ -30,8 +30,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32fc_x2_square_dist_32f(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) {
- * \endcode
+ * void volk_32fc_x2_square_dist_32f(float* target, lv_32fc_t* src0, lv_32fc_t* points,
+ * unsigned int num_points) { \endcode
   *
   * \b Inputs
   * \li src0: The complex input. Only the first point is used.
@@ -78,183 +78,185 @@
  #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H
  #define INCLUDED_volk_32fc_x2_square_dist_32f_a_H
  
-#include<inttypes.h>
-#include<stdio.h>
-#include<volk/volk_complex.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
  
  #ifdef LV_HAVE_AVX2
-#include<immintrin.h>
+#include <immintrin.h>
  
-static inline void
-volk_32fc_x2_square_dist_32f_a_avx2(float* target, lv_32fc_t* src0, lv_32fc_t* points,
-                                    unsigned int num_points)
+static inline void volk_32fc_x2_square_dist_32f_a_avx2(float* target,
+                                                       lv_32fc_t* src0,
+                                                       lv_32fc_t* points,
+                                                       unsigned int num_points)
  {
-  const unsigned int num_bytes = num_points*8;
-  __m128 xmm0, xmm9, xmm10;
-  __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-
-  lv_32fc_t diff;
-  float sq_dist;
-  int bound = num_bytes >> 6;
-  int leftovers0 = (num_bytes >> 5) & 1;
-  int leftovers1 = (num_bytes >> 4) & 1;
-  int leftovers2 = (num_bytes >> 3) & 1;
-  int i = 0;
-
-  __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
-  xmm1 = _mm256_setzero_ps();
-  xmm2 = _mm256_load_ps((float*)&points[0]);
-  xmm0 = _mm_load_ps((float*)src0);
-  xmm0 = _mm_permute_ps(xmm0, 0b01000100);
-  xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
-  xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
-  xmm3 = _mm256_load_ps((float*)&points[4]);
-
-  for(; i < bound; ++i) {
-    xmm4 = _mm256_sub_ps(xmm1, xmm2);
-    xmm5 = _mm256_sub_ps(xmm1, xmm3);
-    points += 8;
-    xmm6 = _mm256_mul_ps(xmm4, xmm4);
-    xmm7 = _mm256_mul_ps(xmm5, xmm5);
-
+    const unsigned int num_bytes = num_points * 8;
+    __m128 xmm0, xmm9, xmm10;
+    __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+
+    lv_32fc_t diff;
+    float sq_dist;
+    int bound = num_bytes >> 6;
+    int leftovers0 = (num_bytes >> 5) & 1;
+    int leftovers1 = (num_bytes >> 4) & 1;
+    int leftovers2 = (num_bytes >> 3) & 1;
+    int i = 0;
+
+    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+    xmm1 = _mm256_setzero_ps();
      xmm2 = _mm256_load_ps((float*)&points[0]);
+    xmm0 = _mm_load_ps((float*)src0);
+    xmm0 = _mm_permute_ps(xmm0, 0b01000100);
+    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
+    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
+    xmm3 = _mm256_load_ps((float*)&points[4]);
  
-    xmm4 = _mm256_hadd_ps(xmm6, xmm7);
-    xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
+    for (; i < bound; ++i) {
+        xmm4 = _mm256_sub_ps(xmm1, xmm2);
+        xmm5 = _mm256_sub_ps(xmm1, xmm3);
+        points += 8;
+        xmm6 = _mm256_mul_ps(xmm4, xmm4);
+        xmm7 = _mm256_mul_ps(xmm5, xmm5);
  
-    xmm3 = _mm256_load_ps((float*)&points[4]);
+        xmm2 = _mm256_load_ps((float*)&points[0]);
  
-    _mm256_store_ps(target, xmm4);
+        xmm4 = _mm256_hadd_ps(xmm6, xmm7);
+        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
  
-    target += 8;
-  }
+        xmm3 = _mm256_load_ps((float*)&points[4]);
  
-  for(i = 0; i < leftovers0; ++i) {
+        _mm256_store_ps(target, xmm4);
  
-    xmm2 = _mm256_load_ps((float*)&points[0]);
+        target += 8;
+    }
  
-    xmm4 = _mm256_sub_ps(xmm1, xmm2);
+    for (i = 0; i < leftovers0; ++i) {
  
-    points += 4;
+        xmm2 = _mm256_load_ps((float*)&points[0]);
  
-    xmm6 = _mm256_mul_ps(xmm4, xmm4);
+        xmm4 = _mm256_sub_ps(xmm1, xmm2);
  
-    xmm4 = _mm256_hadd_ps(xmm6, xmm6);
-    xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
+        points += 4;
  
-    xmm9 = _mm256_extractf128_ps(xmm4, 1);
-    _mm_store_ps(target,xmm9);
+        xmm6 = _mm256_mul_ps(xmm4, xmm4);
  
-    target += 4;
-  }
+        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
+        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
+
+        xmm9 = _mm256_extractf128_ps(xmm4, 1);
+        _mm_store_ps(target, xmm9);
  
-  for(i = 0; i < leftovers1; ++i) {
-    xmm9 = _mm_load_ps((float*)&points[0]);
+        target += 4;
+    }
  
-    xmm10 = _mm_sub_ps(xmm0, xmm9);
+    for (i = 0; i < leftovers1; ++i) {
+        xmm9 = _mm_load_ps((float*)&points[0]);
  
-    points += 2;
+        xmm10 = _mm_sub_ps(xmm0, xmm9);
  
-    xmm9 = _mm_mul_ps(xmm10, xmm10);
+        points += 2;
  
-    xmm10 = _mm_hadd_ps(xmm9, xmm9);
+        xmm9 = _mm_mul_ps(xmm10, xmm10);
  
-    _mm_storeh_pi((__m64*)target, xmm10);
+        xmm10 = _mm_hadd_ps(xmm9, xmm9);
  
-    target += 2;
-  }
+        _mm_storeh_pi((__m64*)target, xmm10);
  
-  for(i = 0; i < leftovers2; ++i) {
+        target += 2;
+    }
  
-    diff = src0[0] - points[0];
+    for (i = 0; i < leftovers2; ++i) {
  
-    sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+        diff = src0[0] - points[0];
  
-    target[0] = sq_dist;
-  }
+        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+
+        target[0] = sq_dist;
+    }
  }
  
  #endif /*LV_HAVE_AVX2*/
  
  #ifdef LV_HAVE_SSE3
-#include<xmmintrin.h>
-#include<pmmintrin.h>
+#include <pmmintrin.h>
+#include <xmmintrin.h>
  
-static inline void
-volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points,
-                                    unsigned int num_points)
+static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target,
+                                                       lv_32fc_t* src0,
+                                                       lv_32fc_t* points,
+                                                       unsigned int num_points)
  {
-  const unsigned int num_bytes = num_points*8;
+    const unsigned int num_bytes = num_points * 8;
  
-  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+    __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
  
-  lv_32fc_t diff;
-  float sq_dist;
-  int bound = num_bytes >> 5;
-  int i = 0;
+    lv_32fc_t diff;
+    float sq_dist;
+    int bound = num_bytes >> 5;
+    int i = 0;
  
-  xmm1 = _mm_setzero_ps();
-  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
-  xmm2 = _mm_load_ps((float*)&points[0]);
-  xmm1 = _mm_movelh_ps(xmm1, xmm1);
-  xmm3 = _mm_load_ps((float*)&points[2]);
+    xmm1 = _mm_setzero_ps();
+    xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
+    xmm2 = _mm_load_ps((float*)&points[0]);
+    xmm1 = _mm_movelh_ps(xmm1, xmm1);
+    xmm3 = _mm_load_ps((float*)&points[2]);
+
+    for (; i < bound - 1; ++i) {
+        xmm4 = _mm_sub_ps(xmm1, xmm2);
+        xmm5 = _mm_sub_ps(xmm1, xmm3);
+        points += 4;
+        xmm6 = _mm_mul_ps(xmm4, xmm4);
+        xmm7 = _mm_mul_ps(xmm5, xmm5);
+
+        xmm2 = _mm_load_ps((float*)&points[0]);
+
+        xmm4 = _mm_hadd_ps(xmm6, xmm7);
+
+        xmm3 = _mm_load_ps((float*)&points[2]);
+
+        _mm_store_ps(target, xmm4);
+
+        target += 4;
+    }
  
-  for(; i < bound - 1; ++i) {
      xmm4 = _mm_sub_ps(xmm1, xmm2);
      xmm5 = _mm_sub_ps(xmm1, xmm3);
+
      points += 4;
      xmm6 = _mm_mul_ps(xmm4, xmm4);
      xmm7 = _mm_mul_ps(xmm5, xmm5);
  
-    xmm2 = _mm_load_ps((float*)&points[0]);
-
      xmm4 = _mm_hadd_ps(xmm6, xmm7);
  
-    xmm3 = _mm_load_ps((float*)&points[2]);
-
      _mm_store_ps(target, xmm4);
  
      target += 4;
-  }
-
-  xmm4 = _mm_sub_ps(xmm1, xmm2);
-  xmm5 = _mm_sub_ps(xmm1, xmm3);
-
-  points += 4;
-  xmm6 = _mm_mul_ps(xmm4, xmm4);
-  xmm7 = _mm_mul_ps(xmm5, xmm5);
  
-  xmm4 = _mm_hadd_ps(xmm6, xmm7);
+    if (num_bytes >> 4 & 1) {
  
-  _mm_store_ps(target, xmm4);
+        xmm2 = _mm_load_ps((float*)&points[0]);
  
-  target += 4;
+        xmm4 = _mm_sub_ps(xmm1, xmm2);
  
-  if (num_bytes >> 4 & 1) {
+        points += 2;
  
-    xmm2 = _mm_load_ps((float*)&points[0]);
-
-    xmm4 = _mm_sub_ps(xmm1, xmm2);
+        xmm6 = _mm_mul_ps(xmm4, xmm4);
  
-    points += 2;
-
-    xmm6 = _mm_mul_ps(xmm4, xmm4);
+        xmm4 = _mm_hadd_ps(xmm6, xmm6);
  
-    xmm4 = _mm_hadd_ps(xmm6, xmm6);
+        _mm_storeh_pi((__m64*)target, xmm4);
  
-    _mm_storeh_pi((__m64*)target, xmm4);
+        target += 2;
+    }
  
-    target += 2;
-  }
+    if (num_bytes >> 3 & 1) {
  
-  if (num_bytes >> 3 & 1) {
+        diff = src0[0] - points[0];
  
-    diff = src0[0] - points[0];
+        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
  
-    sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
-
-    target[0] = sq_dist;
-  }
+        target[0] = sq_dist;
+    }
  }
  
  #endif /*LV_HAVE_SSE3*/
@@ -262,55 +264,58 @@ volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* p
  
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
-static inline void
-volk_32fc_x2_square_dist_32f_neon(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points)
+static inline void volk_32fc_x2_square_dist_32f_neon(float* target,
+                                                     lv_32fc_t* src0,
+                                                     lv_32fc_t* points,
+                                                     unsigned int num_points)
  {
-  const unsigned int quarter_points = num_points / 4;
-  unsigned int number;
-
-  float32x4x2_t a_vec, b_vec;
-  float32x4x2_t diff_vec;
-  float32x4_t tmp, tmp1, dist_sq;
-  a_vec.val[0] = vdupq_n_f32( lv_creal(src0[0]) );
-  a_vec.val[1] = vdupq_n_f32( lv_cimag(src0[0]) );
-  for(number=0; number < quarter_points; ++number) {
-    b_vec = vld2q_f32((float*)points);
-    diff_vec.val[0] = vsubq_f32(a_vec.val[0], b_vec.val[0]);
-    diff_vec.val[1] = vsubq_f32(a_vec.val[1], b_vec.val[1]);
-    tmp = vmulq_f32(diff_vec.val[0], diff_vec.val[0]);
-    tmp1 = vmulq_f32(diff_vec.val[1], diff_vec.val[1]);
-
-    dist_sq = vaddq_f32(tmp, tmp1);
-    vst1q_f32(target, dist_sq);
-    points += 4;
-    target += 4;
-  }
-  for(number=quarter_points*4; number < num_points; ++number) {
-    lv_32fc_t diff = src0[0] - *points++;
-    *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
-  }
+    const unsigned int quarter_points = num_points / 4;
+    unsigned int number;
+
+    float32x4x2_t a_vec, b_vec;
+    float32x4x2_t diff_vec;
+    float32x4_t tmp, tmp1, dist_sq;
+    a_vec.val[0] = vdupq_n_f32(lv_creal(src0[0]));
+    a_vec.val[1] = vdupq_n_f32(lv_cimag(src0[0]));
+    for (number = 0; number < quarter_points; ++number) {
+        b_vec = vld2q_f32((float*)points);
+        diff_vec.val[0] = vsubq_f32(a_vec.val[0], b_vec.val[0]);
+        diff_vec.val[1] = vsubq_f32(a_vec.val[1], b_vec.val[1]);
+        tmp = vmulq_f32(diff_vec.val[0], diff_vec.val[0]);
+        tmp1 = vmulq_f32(diff_vec.val[1], diff_vec.val[1]);
+
+        dist_sq = vaddq_f32(tmp, tmp1);
+        vst1q_f32(target, dist_sq);
+        points += 4;
+        target += 4;
+    }
+    for (number = quarter_points * 4; number < num_points; ++number) {
+        lv_32fc_t diff = src0[0] - *points++;
+        *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+    }
  }
  #endif /* LV_HAVE_NEON */
  
  
  #ifdef LV_HAVE_GENERIC
-static inline void
-volk_32fc_x2_square_dist_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points,
-                                     unsigned int num_points)
+static inline void volk_32fc_x2_square_dist_32f_generic(float* target,
+                                                        lv_32fc_t* src0,
+                                                        lv_32fc_t* points,
+                                                        unsigned int num_points)
  {
-  const unsigned int num_bytes = num_points*8;
+    const unsigned int num_bytes = num_points * 8;
  
-  lv_32fc_t diff;
-  float sq_dist;
-  unsigned int i = 0;
+    lv_32fc_t diff;
+    float sq_dist;
+    unsigned int i = 0;
  
-  for(; i < num_bytes >> 3; ++i) {
-    diff = src0[0] - points[i];
+    for (; i<num_bytes>> 3; ++i) {
+        diff = src0[0] - points[i];
  
-    sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
  
-    target[i] = sq_dist;
-  }
+        target[i] = sq_dist;
+    }
  }
  
  #endif /*LV_HAVE_GENERIC*/
@@ -321,80 +326,85 @@ volk_32fc_x2_square_dist_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t*
  #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_u_H
  #define INCLUDED_volk_32fc_x2_square_dist_32f_u_H
  
-#include<inttypes.h>
-#include<stdio.h>
-#include<volk/volk_complex.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
  
  #ifdef LV_HAVE_AVX2
-#include<immintrin.h>
+#include <immintrin.h>
  
-static inline void
-volk_32fc_x2_square_dist_32f_u_avx2(float* target, lv_32fc_t* src0, lv_32fc_t* points,
-                                    unsigned int num_points)
+static inline void volk_32fc_x2_square_dist_32f_u_avx2(float* target,
+                                                       lv_32fc_t* src0,
+                                                       lv_32fc_t* points,
+                                                       unsigned int num_points)
  {
-  const unsigned int num_bytes = num_points*8;
-  __m128 xmm0, xmm9;
-  __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-
-  lv_32fc_t diff;
-  float sq_dist;
-  int bound = num_bytes >> 6;
-  int leftovers1 = (num_bytes >> 3) & 0b11;
-  int i = 0;
-
-  __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
-  xmm1 = _mm256_setzero_ps();
-  xmm0 = _mm_loadu_ps((float*)src0);
-  xmm0 = _mm_permute_ps(xmm0, 0b01000100);
-  xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
-  xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
-
-  for(; i < bound; ++i) {
+    const unsigned int num_bytes = num_points * 8;
+    __m128 xmm0, xmm9;
+    __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+
+    lv_32fc_t diff;
+    float sq_dist;
+    int bound = num_bytes >> 6;
+    int leftovers1 = (num_bytes >> 3) & 0b11;
+    int i = 0;
+
+    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+    xmm1 = _mm256_setzero_ps();
      xmm2 = _mm256_loadu_ps((float*)&points[0]);
+    xmm0 = _mm_loadu_ps((float*)src0);
+    xmm0 = _mm_permute_ps(xmm0, 0b01000100);
+    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
+    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
      xmm3 = _mm256_loadu_ps((float*)&points[4]);
-    xmm4 = _mm256_sub_ps(xmm1, xmm2);
-    xmm5 = _mm256_sub_ps(xmm1, xmm3);
-    points += 8;
-    xmm6 = _mm256_mul_ps(xmm4, xmm4);
-    xmm7 = _mm256_mul_ps(xmm5, xmm5);
  
-    xmm4 = _mm256_hadd_ps(xmm6, xmm7);
-    xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
+    for (; i < bound; ++i) {
+        xmm4 = _mm256_sub_ps(xmm1, xmm2);
+        xmm5 = _mm256_sub_ps(xmm1, xmm3);
+        points += 8;
+        xmm6 = _mm256_mul_ps(xmm4, xmm4);
+        xmm7 = _mm256_mul_ps(xmm5, xmm5);
  
-    _mm256_storeu_ps(target, xmm4);
+        xmm2 = _mm256_loadu_ps((float*)&points[0]);
  
-    target += 8;
-  }
+        xmm4 = _mm256_hadd_ps(xmm6, xmm7);
+        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
  
-  if (num_bytes >> 5 & 1) {
+        xmm3 = _mm256_loadu_ps((float*)&points[4]);
  
-    xmm2 = _mm256_loadu_ps((float*)&points[0]);
+        _mm256_storeu_ps(target, xmm4);
  
-    xmm4 = _mm256_sub_ps(xmm1, xmm2);
+        target += 8;
+    }
  
-    points += 4;
+    if (num_bytes >> 5 & 1) {
  
-    xmm6 = _mm256_mul_ps(xmm4, xmm4);
+        xmm2 = _mm256_loadu_ps((float*)&points[0]);
  
-    xmm4 = _mm256_hadd_ps(xmm6, xmm6);
-    xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
+        xmm4 = _mm256_sub_ps(xmm1, xmm2);
  
-    xmm9 = _mm256_extractf128_ps(xmm4, 1);
-    _mm_storeu_ps(target,xmm9);
+        points += 4;
  
-    target += 4;
-  }
+        xmm6 = _mm256_mul_ps(xmm4, xmm4);
+
+        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
+        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
+
+        xmm9 = _mm256_extractf128_ps(xmm4, 1);
+        _mm_storeu_ps(target, xmm9);
+
+        target += 4;
+    }
  
-  for(i = 0; i < leftovers1; ++i) {
+    for (i = 0; i < leftovers1; ++i) {
  
-    diff = src0[0] - points[0];
-    points += 1;
+        diff = src0[0] - points[0];
+        points += 1;
  
-    sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
  
-    target[0] = sq_dist;
-    target += 1;
-  }
+        target[0] = sq_dist;
+        target += 1;
+    }
  }
  
  #endif /*LV_HAVE_AVX2*/
diff --git a/kernels/volk/volk_32i_s32f_convert_32f.h b/kernels/volk/volk_32i_s32f_convert_32f.h

index 87d94f9bdb47a84865b2b0b759ac7e5e9c7f1623..6b67cdbde2060bf13c5684ce052025420a821926 100644 (file)
--- a/kernels/volk/volk_32i_s32f_convert_32f.h
+++ b/kernels/volk/volk_32i_s32f_convert_32f.h
@@ -30,8 +30,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32i_s32f_convert_32f(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points)
- * \endcode
+ * void volk_32i_s32f_convert_32f(float* outputVector, const int32_t* inputVector, const
+ * float scalar, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li inputVector: The vector of 32-bit integers.
@@ -70,37 +70,38 @@
  #ifdef LV_HAVE_AVX512F
  #include <immintrin.h>
  
-static inline void
-volk_32i_s32f_convert_32f_u_avx512f(float* outputVector, const int32_t* inputVector,
-                                 const float scalar, unsigned int num_points)
+static inline void volk_32i_s32f_convert_32f_u_avx512f(float* outputVector,
+                                                       const int32_t* inputVector,
+                                                       const float scalar,
+                                                       unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int onesixteenthPoints = num_points / 16;
+    unsigned int number = 0;
+    const unsigned int onesixteenthPoints = num_points / 16;
  
-  float* outputVectorPtr = outputVector;
-  const float iScalar = 1.0 / scalar;
-  __m512 invScalar = _mm512_set1_ps(iScalar);
-  int32_t* inputPtr = (int32_t*)inputVector;
-  __m512i inputVal;
-  __m512 ret;
+    float* outputVectorPtr = outputVector;
+    const float iScalar = 1.0 / scalar;
+    __m512 invScalar = _mm512_set1_ps(iScalar);
+    int32_t* inputPtr = (int32_t*)inputVector;
+    __m512i inputVal;
+    __m512 ret;
  
-  for(;number < onesixteenthPoints; number++){
-    // Load the values
-    inputVal = _mm512_loadu_si512((__m512i*)inputPtr);
+    for (; number < onesixteenthPoints; number++) {
+        // Load the values
+        inputVal = _mm512_loadu_si512((__m512i*)inputPtr);
  
-    ret = _mm512_cvtepi32_ps(inputVal);
-    ret = _mm512_mul_ps(ret, invScalar);
+        ret = _mm512_cvtepi32_ps(inputVal);
+        ret = _mm512_mul_ps(ret, invScalar);
  
-    _mm512_storeu_ps(outputVectorPtr, ret);
+        _mm512_storeu_ps(outputVectorPtr, ret);
  
-    outputVectorPtr += 16;
-    inputPtr += 16;
-  }
+        outputVectorPtr += 16;
+        inputPtr += 16;
+    }
  
-  number = onesixteenthPoints * 16;
-  for(; number < num_points; number++){
-    outputVector[number] =((float)(inputVector[number])) * iScalar;
-  }
+    number = onesixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        outputVector[number] = ((float)(inputVector[number])) * iScalar;
+    }
  }
  #endif /* LV_HAVE_AVX512F */
  
@@ -108,37 +109,38 @@ volk_32i_s32f_convert_32f_u_avx512f(float* outputVector, const int32_t* inputVec
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_32i_s32f_convert_32f_u_avx2(float* outputVector, const int32_t* inputVector,
-                                 const float scalar, unsigned int num_points)
+static inline void volk_32i_s32f_convert_32f_u_avx2(float* outputVector,
+                                                    const int32_t* inputVector,
+                                                    const float scalar,
+                                                    unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int oneEightPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int oneEightPoints = num_points / 8;
  
-  float* outputVectorPtr = outputVector;
-  const float iScalar = 1.0 / scalar;
-  __m256 invScalar = _mm256_set1_ps(iScalar);
-  int32_t* inputPtr = (int32_t*)inputVector;
-  __m256i inputVal;
-  __m256 ret;
+    float* outputVectorPtr = outputVector;
+    const float iScalar = 1.0 / scalar;
+    __m256 invScalar = _mm256_set1_ps(iScalar);
+    int32_t* inputPtr = (int32_t*)inputVector;
+    __m256i inputVal;
+    __m256 ret;
  
-  for(;number < oneEightPoints; number++){
-    // Load the 4 values
-    inputVal = _mm256_loadu_si256((__m256i*)inputPtr);
+    for (; number < oneEightPoints; number++) {
+        // Load the 4 values
+        inputVal = _mm256_loadu_si256((__m256i*)inputPtr);
  
-    ret = _mm256_cvtepi32_ps(inputVal);
-    ret = _mm256_mul_ps(ret, invScalar);
+        ret = _mm256_cvtepi32_ps(inputVal);
+        ret = _mm256_mul_ps(ret, invScalar);
  
-    _mm256_storeu_ps(outputVectorPtr, ret);
+        _mm256_storeu_ps(outputVectorPtr, ret);
  
-    outputVectorPtr += 8;
-    inputPtr += 8;
-  }
+        outputVectorPtr += 8;
+        inputPtr += 8;
+    }
  
-  number = oneEightPoints * 8;
-  for(; number < num_points; number++){
-    outputVector[number] =((float)(inputVector[number])) * iScalar;
-  }
+    number = oneEightPoints * 8;
+    for (; number < num_points; number++) {
+        outputVector[number] = ((float)(inputVector[number])) * iScalar;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
@@ -146,62 +148,63 @@ volk_32i_s32f_convert_32f_u_avx2(float* outputVector, const int32_t* inputVector
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void
-volk_32i_s32f_convert_32f_u_sse2(float* outputVector, const int32_t* inputVector,
-                                 const float scalar, unsigned int num_points)
+static inline void volk_32i_s32f_convert_32f_u_sse2(float* outputVector,
+                                                    const int32_t* inputVector,
+                                                    const float scalar,
+                                                    unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  float* outputVectorPtr = outputVector;
-  const float iScalar = 1.0 / scalar;
-  __m128 invScalar = _mm_set_ps1(iScalar);
-  int32_t* inputPtr = (int32_t*)inputVector;
-  __m128i inputVal;
-  __m128 ret;
+    float* outputVectorPtr = outputVector;
+    const float iScalar = 1.0 / scalar;
+    __m128 invScalar = _mm_set_ps1(iScalar);
+    int32_t* inputPtr = (int32_t*)inputVector;
+    __m128i inputVal;
+    __m128 ret;
  
-  for(;number < quarterPoints; number++){
-    // Load the 4 values
-    inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+    for (; number < quarterPoints; number++) {
+        // Load the 4 values
+        inputVal = _mm_loadu_si128((__m128i*)inputPtr);
  
-    ret = _mm_cvtepi32_ps(inputVal);
-    ret = _mm_mul_ps(ret, invScalar);
+        ret = _mm_cvtepi32_ps(inputVal);
+        ret = _mm_mul_ps(ret, invScalar);
  
-    _mm_storeu_ps(outputVectorPtr, ret);
+        _mm_storeu_ps(outputVectorPtr, ret);
  
-    outputVectorPtr += 4;
-    inputPtr += 4;
-  }
+        outputVectorPtr += 4;
+        inputPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    outputVector[number] =((float)(inputVector[number])) * iScalar;
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        outputVector[number] = ((float)(inputVector[number])) * iScalar;
+    }
  }
  #endif /* LV_HAVE_SSE2 */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32i_s32f_convert_32f_generic(float* outputVector, const int32_t* inputVector,
-                                  const float scalar, unsigned int num_points)
+static inline void volk_32i_s32f_convert_32f_generic(float* outputVector,
+                                                     const int32_t* inputVector,
+                                                     const float scalar,
+                                                     unsigned int num_points)
  {
-  float* outputVectorPtr = outputVector;
-  const int32_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-  const float iScalar = 1.0 / scalar;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
-  }
+    float* outputVectorPtr = outputVector;
+    const int32_t* inputVectorPtr = inputVector;
+    unsigned int number = 0;
+    const float iScalar = 1.0 / scalar;
+
+    for (number = 0; number < num_points; number++) {
+        *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  #endif /* INCLUDED_volk_32i_s32f_convert_32f_u_H */
  
  
-
  #ifndef INCLUDED_volk_32i_s32f_convert_32f_a_H
  #define INCLUDED_volk_32i_s32f_convert_32f_a_H
  
@@ -211,74 +214,76 @@ volk_32i_s32f_convert_32f_generic(float* outputVector, const int32_t* inputVecto
  #ifdef LV_HAVE_AVX512F
  #include <immintrin.h>
  
-static inline void
-volk_32i_s32f_convert_32f_a_avx512f(float* outputVector, const int32_t* inputVector,
-                                 const float scalar, unsigned int num_points)
+static inline void volk_32i_s32f_convert_32f_a_avx512f(float* outputVector,
+                                                       const int32_t* inputVector,
+                                                       const float scalar,
+                                                       unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int onesixteenthPoints = num_points / 16;
+    unsigned int number = 0;
+    const unsigned int onesixteenthPoints = num_points / 16;
  
-  float* outputVectorPtr = outputVector;
-  const float iScalar = 1.0 / scalar;
-  __m512 invScalar = _mm512_set1_ps(iScalar);
-  int32_t* inputPtr = (int32_t*)inputVector;
-  __m512i inputVal;
-  __m512 ret;
+    float* outputVectorPtr = outputVector;
+    const float iScalar = 1.0 / scalar;
+    __m512 invScalar = _mm512_set1_ps(iScalar);
+    int32_t* inputPtr = (int32_t*)inputVector;
+    __m512i inputVal;
+    __m512 ret;
  
-  for(;number < onesixteenthPoints; number++){
-    // Load the values
-    inputVal = _mm512_load_si512((__m512i*)inputPtr);
+    for (; number < onesixteenthPoints; number++) {
+        // Load the values
+        inputVal = _mm512_load_si512((__m512i*)inputPtr);
  
-    ret = _mm512_cvtepi32_ps(inputVal);
-    ret = _mm512_mul_ps(ret, invScalar);
+        ret = _mm512_cvtepi32_ps(inputVal);
+        ret = _mm512_mul_ps(ret, invScalar);
  
-    _mm512_store_ps(outputVectorPtr, ret);
+        _mm512_store_ps(outputVectorPtr, ret);
  
-    outputVectorPtr += 16;
-    inputPtr += 16;
-  }
+        outputVectorPtr += 16;
+        inputPtr += 16;
+    }
  
-  number = onesixteenthPoints * 16;
-  for(; number < num_points; number++){
-    outputVector[number] =((float)(inputVector[number])) * iScalar;
-  }
+    number = onesixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        outputVector[number] = ((float)(inputVector[number])) * iScalar;
+    }
  }
  #endif /* LV_HAVE_AVX512F */
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_32i_s32f_convert_32f_a_avx2(float* outputVector, const int32_t* inputVector,
-                                 const float scalar, unsigned int num_points)
+static inline void volk_32i_s32f_convert_32f_a_avx2(float* outputVector,
+                                                    const int32_t* inputVector,
+                                                    const float scalar,
+                                                    unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int oneEightPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int oneEightPoints = num_points / 8;
  
-  float* outputVectorPtr = outputVector;
-  const float iScalar = 1.0 / scalar;
-  __m256 invScalar = _mm256_set1_ps(iScalar);
-  int32_t* inputPtr = (int32_t*)inputVector;
-  __m256i inputVal;
-  __m256 ret;
+    float* outputVectorPtr = outputVector;
+    const float iScalar = 1.0 / scalar;
+    __m256 invScalar = _mm256_set1_ps(iScalar);
+    int32_t* inputPtr = (int32_t*)inputVector;
+    __m256i inputVal;
+    __m256 ret;
  
-  for(;number < oneEightPoints; number++){
-    // Load the 4 values
-    inputVal = _mm256_load_si256((__m256i*)inputPtr);
+    for (; number < oneEightPoints; number++) {
+        // Load the 4 values
+        inputVal = _mm256_load_si256((__m256i*)inputPtr);
  
-    ret = _mm256_cvtepi32_ps(inputVal);
-    ret = _mm256_mul_ps(ret, invScalar);
+        ret = _mm256_cvtepi32_ps(inputVal);
+        ret = _mm256_mul_ps(ret, invScalar);
  
-    _mm256_store_ps(outputVectorPtr, ret);
+        _mm256_store_ps(outputVectorPtr, ret);
  
-    outputVectorPtr += 8;
-    inputPtr += 8;
-  }
+        outputVectorPtr += 8;
+        inputPtr += 8;
+    }
  
-  number = oneEightPoints * 8;
-  for(; number < num_points; number++){
-    outputVector[number] =((float)(inputVector[number])) * iScalar;
-  }
+    number = oneEightPoints * 8;
+    for (; number < num_points; number++) {
+        outputVector[number] = ((float)(inputVector[number])) * iScalar;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
@@ -286,59 +291,59 @@ volk_32i_s32f_convert_32f_a_avx2(float* outputVector, const int32_t* inputVector
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void
-volk_32i_s32f_convert_32f_a_sse2(float* outputVector, const int32_t* inputVector,
-                                 const float scalar, unsigned int num_points)
+static inline void volk_32i_s32f_convert_32f_a_sse2(float* outputVector,
+                                                    const int32_t* inputVector,
+                                                    const float scalar,
+                                                    unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  float* outputVectorPtr = outputVector;
-  const float iScalar = 1.0 / scalar;
-  __m128 invScalar = _mm_set_ps1(iScalar);
-  int32_t* inputPtr = (int32_t*)inputVector;
-  __m128i inputVal;
-  __m128 ret;
+    float* outputVectorPtr = outputVector;
+    const float iScalar = 1.0 / scalar;
+    __m128 invScalar = _mm_set_ps1(iScalar);
+    int32_t* inputPtr = (int32_t*)inputVector;
+    __m128i inputVal;
+    __m128 ret;
  
-  for(;number < quarterPoints; number++){
-    // Load the 4 values
-    inputVal = _mm_load_si128((__m128i*)inputPtr);
+    for (; number < quarterPoints; number++) {
+        // Load the 4 values
+        inputVal = _mm_load_si128((__m128i*)inputPtr);
  
-    ret = _mm_cvtepi32_ps(inputVal);
-    ret = _mm_mul_ps(ret, invScalar);
+        ret = _mm_cvtepi32_ps(inputVal);
+        ret = _mm_mul_ps(ret, invScalar);
  
-    _mm_store_ps(outputVectorPtr, ret);
+        _mm_store_ps(outputVectorPtr, ret);
  
-    outputVectorPtr += 4;
-    inputPtr += 4;
-  }
+        outputVectorPtr += 4;
+        inputPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    outputVector[number] =((float)(inputVector[number])) * iScalar;
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        outputVector[number] = ((float)(inputVector[number])) * iScalar;
+    }
  }
  #endif /* LV_HAVE_SSE2 */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32i_s32f_convert_32f_a_generic(float* outputVector, const int32_t* inputVector,
-                                    const float scalar, unsigned int num_points)
+static inline void volk_32i_s32f_convert_32f_a_generic(float* outputVector,
+                                                       const int32_t* inputVector,
+                                                       const float scalar,
+                                                       unsigned int num_points)
  {
-  float* outputVectorPtr = outputVector;
-  const int32_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-  const float iScalar = 1.0 / scalar;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
-  }
+    float* outputVectorPtr = outputVector;
+    const int32_t* inputVectorPtr = inputVector;
+    unsigned int number = 0;
+    const float iScalar = 1.0 / scalar;
+
+    for (number = 0; number < num_points; number++) {
+        *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  
-
-
  #endif /* INCLUDED_volk_32i_s32f_convert_32f_a_H */
diff --git a/kernels/volk/volk_32i_x2_and_32i.h b/kernels/volk/volk_32i_x2_and_32i.h

index 76f01757a818653e079644197d0a35c37b0b6710..755cfdc514b4b8ba7815637e50ca86a80a6be479 100644 (file)
--- a/kernels/volk/volk_32i_x2_and_32i.h
+++ b/kernels/volk/volk_32i_x2_and_32i.h
@@ -29,8 +29,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32i_x2_and_32i(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points)
- * \endcode
+ * void volk_32i_x2_and_32i(int32_t* cVector, const int32_t* aVector, const int32_t*
+ * bVector, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li aVector: Input vector of samples.
@@ -87,72 +87,75 @@
  #ifdef LV_HAVE_AVX512F
  #include <immintrin.h>
  
-static inline void
-volk_32i_x2_and_32i_a_avx512f(int32_t* cVector, const int32_t* aVector,
-                          const int32_t* bVector, unsigned int num_points)
+static inline void volk_32i_x2_and_32i_a_avx512f(int32_t* cVector,
+                                                 const int32_t* aVector,
+                                                 const int32_t* bVector,
+                                                 unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
  
-  int32_t* cPtr = (int32_t*)cVector;
-  const int32_t* aPtr = (int32_t*)aVector;
-  const int32_t* bPtr = (int32_t*)bVector;
+    int32_t* cPtr = (int32_t*)cVector;
+    const int32_t* aPtr = (int32_t*)aVector;
+    const int32_t* bPtr = (int32_t*)bVector;
  
-  __m512i aVal, bVal, cVal;
-  for(;number < sixteenthPoints; number++){
+    __m512i aVal, bVal, cVal;
+    for (; number < sixteenthPoints; number++) {
  
-    aVal = _mm512_load_si512(aPtr);
-    bVal = _mm512_load_si512(bPtr);
+        aVal = _mm512_load_si512(aPtr);
+        bVal = _mm512_load_si512(bPtr);
  
-    cVal = _mm512_and_si512(aVal, bVal);
+        cVal = _mm512_and_si512(aVal, bVal);
  
-    _mm512_store_si512(cPtr,cVal); // Store the results back into the C container
+        _mm512_store_si512(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 16;
-    bPtr += 16;
-    cPtr += 16;
-  }
+        aPtr += 16;
+        bPtr += 16;
+        cPtr += 16;
+    }
  
-  number = sixteenthPoints * 16;
-  for(;number < num_points; number++){
-    cVector[number] = aVector[number] & bVector[number];
-  }
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        cVector[number] = aVector[number] & bVector[number];
+    }
  }
  #endif /* LV_HAVE_AVX512F */
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_32i_x2_and_32i_a_avx2(int32_t* cVector, const int32_t* aVector,
-                          const int32_t* bVector, unsigned int num_points)
+static inline void volk_32i_x2_and_32i_a_avx2(int32_t* cVector,
+                                              const int32_t* aVector,
+                                              const int32_t* bVector,
+                                              unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int oneEightPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int oneEightPoints = num_points / 8;
  
-  int32_t* cPtr = cVector;
-  const int32_t* aPtr = aVector;
-  const int32_t* bPtr = bVector;
+    int32_t* cPtr = cVector;
+    const int32_t* aPtr = aVector;
+    const int32_t* bPtr = bVector;
  
-  __m256i aVal, bVal, cVal;
-  for(;number < oneEightPoints; number++){
+    __m256i aVal, bVal, cVal;
+    for (; number < oneEightPoints; number++) {
  
-    aVal = _mm256_load_si256((__m256i*)aPtr);
-    bVal = _mm256_load_si256((__m256i*)bPtr);
+        aVal = _mm256_load_si256((__m256i*)aPtr);
+        bVal = _mm256_load_si256((__m256i*)bPtr);
  
-    cVal = _mm256_and_si256(aVal, bVal);
+        cVal = _mm256_and_si256(aVal, bVal);
  
-    _mm256_store_si256((__m256i*)cPtr,cVal); // Store the results back into the C container
+        _mm256_store_si256((__m256i*)cPtr,
+                           cVal); // Store the results back into the C container
  
-    aPtr += 8;
-    bPtr += 8;
-    cPtr += 8;
-  }
+        aPtr += 8;
+        bPtr += 8;
+        cPtr += 8;
+    }
  
-  number = oneEightPoints * 8;
-  for(;number < num_points; number++){
-    cVector[number] = aVector[number] & bVector[number];
-  }
+    number = oneEightPoints * 8;
+    for (; number < num_points; number++) {
+        cVector[number] = aVector[number] & bVector[number];
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
@@ -160,36 +163,37 @@ volk_32i_x2_and_32i_a_avx2(int32_t* cVector, const int32_t* aVector,
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_32i_x2_and_32i_a_sse(int32_t* cVector, const int32_t* aVector,
-                          const int32_t* bVector, unsigned int num_points)
+static inline void volk_32i_x2_and_32i_a_sse(int32_t* cVector,
+                                             const int32_t* aVector,
+                                             const int32_t* bVector,
+                                             unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  float* cPtr = (float*)cVector;
-  const float* aPtr = (float*)aVector;
-  const float* bPtr = (float*)bVector;
+    float* cPtr = (float*)cVector;
+    const float* aPtr = (float*)aVector;
+    const float* bPtr = (float*)bVector;
  
-  __m128 aVal, bVal, cVal;
-  for(;number < quarterPoints; number++){
+    __m128 aVal, bVal, cVal;
+    for (; number < quarterPoints; number++) {
  
-    aVal = _mm_load_ps(aPtr);
-    bVal = _mm_load_ps(bPtr);
+        aVal = _mm_load_ps(aPtr);
+        bVal = _mm_load_ps(bPtr);
  
-    cVal = _mm_and_ps(aVal, bVal);
+        cVal = _mm_and_ps(aVal, bVal);
  
-    _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+        _mm_store_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 4;
-    bPtr += 4;
-    cPtr += 4;
-  }
+        aPtr += 4;
+        bPtr += 4;
+        cPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    cVector[number] = aVector[number] & bVector[number];
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        cVector[number] = aVector[number] & bVector[number];
+    }
  }
  #endif /* LV_HAVE_SSE */
  
@@ -197,62 +201,67 @@ volk_32i_x2_and_32i_a_sse(int32_t* cVector, const int32_t* aVector,
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void
-volk_32i_x2_and_32i_neon(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points)
+static inline void volk_32i_x2_and_32i_neon(int32_t* cVector,
+                                            const int32_t* aVector,
+                                            const int32_t* bVector,
+                                            unsigned int num_points)
  {
-  int32_t* cPtr = cVector;
-  const int32_t* aPtr = aVector;
-  const int32_t* bPtr=  bVector;
-  unsigned int number = 0;
-  unsigned int quarter_points = num_points / 4;
-
-  int32x4_t a_val, b_val, c_val;
-
-  for(number = 0; number < quarter_points; number++){
-    a_val = vld1q_s32(aPtr);
-    b_val = vld1q_s32(bPtr);
-    c_val = vandq_s32(a_val, b_val);
-    vst1q_s32(cPtr, c_val);
-    aPtr += 4;
-    bPtr += 4;
-    cPtr += 4;
-  }
-
-  for(number = quarter_points * 4; number < num_points; number++){
-    *cPtr++ = (*aPtr++) & (*bPtr++);
-  }
+    int32_t* cPtr = cVector;
+    const int32_t* aPtr = aVector;
+    const int32_t* bPtr = bVector;
+    unsigned int number = 0;
+    unsigned int quarter_points = num_points / 4;
+
+    int32x4_t a_val, b_val, c_val;
+
+    for (number = 0; number < quarter_points; number++) {
+        a_val = vld1q_s32(aPtr);
+        b_val = vld1q_s32(bPtr);
+        c_val = vandq_s32(a_val, b_val);
+        vst1q_s32(cPtr, c_val);
+        aPtr += 4;
+        bPtr += 4;
+        cPtr += 4;
+    }
+
+    for (number = quarter_points * 4; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) & (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_NEON */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32i_x2_and_32i_generic(int32_t* cVector, const int32_t* aVector,
-                            const int32_t* bVector, unsigned int num_points)
+static inline void volk_32i_x2_and_32i_generic(int32_t* cVector,
+                                               const int32_t* aVector,
+                                               const int32_t* bVector,
+                                               unsigned int num_points)
  {
-  int32_t* cPtr = cVector;
-  const int32_t* aPtr = aVector;
-  const int32_t* bPtr=  bVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *cPtr++ = (*aPtr++) & (*bPtr++);
-  }
+    int32_t* cPtr = cVector;
+    const int32_t* aPtr = aVector;
+    const int32_t* bPtr = bVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) & (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  
  #ifdef LV_HAVE_ORC
-extern void
-volk_32i_x2_and_32i_a_orc_impl(int32_t* cVector, const int32_t* aVector,
-                               const int32_t* bVector, unsigned int num_points);
-
-static inline void
-volk_32i_x2_and_32i_u_orc(int32_t* cVector, const int32_t* aVector,
-                          const int32_t* bVector, unsigned int num_points)
+extern void volk_32i_x2_and_32i_a_orc_impl(int32_t* cVector,
+                                           const int32_t* aVector,
+                                           const int32_t* bVector,
+                                           unsigned int num_points);
+
+static inline void volk_32i_x2_and_32i_u_orc(int32_t* cVector,
+                                             const int32_t* aVector,
+                                             const int32_t* bVector,
+                                             unsigned int num_points)
  {
-  volk_32i_x2_and_32i_a_orc_impl(cVector, aVector, bVector, num_points);
+    volk_32i_x2_and_32i_a_orc_impl(cVector, aVector, bVector, num_points);
  }
  #endif /* LV_HAVE_ORC */
  
@@ -269,72 +278,75 @@ volk_32i_x2_and_32i_u_orc(int32_t* cVector, const int32_t* aVector,
  #ifdef LV_HAVE_AVX512F
  #include <immintrin.h>
  
-static inline void
-volk_32i_x2_and_32i_u_avx512f(int32_t* cVector, const int32_t* aVector,
-                          const int32_t* bVector, unsigned int num_points)
+static inline void volk_32i_x2_and_32i_u_avx512f(int32_t* cVector,
+                                                 const int32_t* aVector,
+                                                 const int32_t* bVector,
+                                                 unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
  
-  int32_t* cPtr = (int32_t*)cVector;
-  const int32_t* aPtr = (int32_t*)aVector;
-  const int32_t* bPtr = (int32_t*)bVector;
+    int32_t* cPtr = (int32_t*)cVector;
+    const int32_t* aPtr = (int32_t*)aVector;
+    const int32_t* bPtr = (int32_t*)bVector;
  
-  __m512i aVal, bVal, cVal;
-  for(;number < sixteenthPoints; number++){
+    __m512i aVal, bVal, cVal;
+    for (; number < sixteenthPoints; number++) {
  
-    aVal = _mm512_loadu_si512(aPtr);
-    bVal = _mm512_loadu_si512(bPtr);
+        aVal = _mm512_loadu_si512(aPtr);
+        bVal = _mm512_loadu_si512(bPtr);
  
-    cVal = _mm512_and_si512(aVal, bVal);
+        cVal = _mm512_and_si512(aVal, bVal);
  
-    _mm512_storeu_si512(cPtr,cVal); // Store the results back into the C container
+        _mm512_storeu_si512(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 16;
-    bPtr += 16;
-    cPtr += 16;
-  }
+        aPtr += 16;
+        bPtr += 16;
+        cPtr += 16;
+    }
  
-  number = sixteenthPoints * 16;
-  for(;number < num_points; number++){
-    cVector[number] = aVector[number] & bVector[number];
-  }
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        cVector[number] = aVector[number] & bVector[number];
+    }
  }
  #endif /* LV_HAVE_AVX512F */
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_32i_x2_and_32i_u_avx2(int32_t* cVector, const int32_t* aVector,
-                          const int32_t* bVector, unsigned int num_points)
+static inline void volk_32i_x2_and_32i_u_avx2(int32_t* cVector,
+                                              const int32_t* aVector,
+                                              const int32_t* bVector,
+                                              unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int oneEightPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int oneEightPoints = num_points / 8;
  
-  int32_t* cPtr = cVector;
-  const int32_t* aPtr = aVector;
-  const int32_t* bPtr = bVector;
+    int32_t* cPtr = cVector;
+    const int32_t* aPtr = aVector;
+    const int32_t* bPtr = bVector;
  
-  __m256i aVal, bVal, cVal;
-  for(;number < oneEightPoints; number++){
+    __m256i aVal, bVal, cVal;
+    for (; number < oneEightPoints; number++) {
  
-    aVal = _mm256_loadu_si256((__m256i*)aPtr);
-    bVal = _mm256_loadu_si256((__m256i*)bPtr);
+        aVal = _mm256_loadu_si256((__m256i*)aPtr);
+        bVal = _mm256_loadu_si256((__m256i*)bPtr);
  
-    cVal = _mm256_and_si256(aVal, bVal);
+        cVal = _mm256_and_si256(aVal, bVal);
  
-    _mm256_storeu_si256((__m256i*)cPtr,cVal); // Store the results back into the C container
+        _mm256_storeu_si256((__m256i*)cPtr,
+                            cVal); // Store the results back into the C container
  
-    aPtr += 8;
-    bPtr += 8;
-    cPtr += 8;
-  }
+        aPtr += 8;
+        bPtr += 8;
+        cPtr += 8;
+    }
  
-  number = oneEightPoints * 8;
-  for(;number < num_points; number++){
-    cVector[number] = aVector[number] & bVector[number];
-  }
+    number = oneEightPoints * 8;
+    for (; number < num_points; number++) {
+        cVector[number] = aVector[number] & bVector[number];
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
diff --git a/kernels/volk/volk_32i_x2_or_32i.h b/kernels/volk/volk_32i_x2_or_32i.h

index be4c086814115c936640c6f538d3964023b4b0f1..b03db89e0ce3e595cb4532b13dadc7305d1bf49d 100644 (file)
--- a/kernels/volk/volk_32i_x2_or_32i.h
+++ b/kernels/volk/volk_32i_x2_or_32i.h
@@ -29,8 +29,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_32i_x2_or_32i(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points)
- * \endcode
+ * void volk_32i_x2_or_32i(int32_t* cVector, const int32_t* aVector, const int32_t*
+ * bVector, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li aVector: Input vector of samples.
@@ -87,72 +87,75 @@
  #ifdef LV_HAVE_AVX512F
  #include <immintrin.h>
  
-static inline void
-volk_32i_x2_or_32i_a_avx512f(int32_t* cVector, const int32_t* aVector,
-                          const int32_t* bVector, unsigned int num_points)
+static inline void volk_32i_x2_or_32i_a_avx512f(int32_t* cVector,
+                                                const int32_t* aVector,
+                                                const int32_t* bVector,
+                                                unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
  
-  int32_t* cPtr = (int32_t*)cVector;
-  const int32_t* aPtr = (int32_t*)aVector;
-  const int32_t* bPtr = (int32_t*)bVector;
+    int32_t* cPtr = (int32_t*)cVector;
+    const int32_t* aPtr = (int32_t*)aVector;
+    const int32_t* bPtr = (int32_t*)bVector;
  
-  __m512i aVal, bVal, cVal;
-  for(;number < sixteenthPoints; number++){
+    __m512i aVal, bVal, cVal;
+    for (; number < sixteenthPoints; number++) {
  
-    aVal = _mm512_load_si512(aPtr);
-    bVal = _mm512_load_si512(bPtr);
+        aVal = _mm512_load_si512(aPtr);
+        bVal = _mm512_load_si512(bPtr);
  
-    cVal = _mm512_or_si512(aVal, bVal);
+        cVal = _mm512_or_si512(aVal, bVal);
  
-    _mm512_store_si512(cPtr,cVal); // Store the results back into the C container
+        _mm512_store_si512(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 16;
-    bPtr += 16;
-    cPtr += 16;
-  }
+        aPtr += 16;
+        bPtr += 16;
+        cPtr += 16;
+    }
  
-  number = sixteenthPoints * 16;
-  for(;number < num_points; number++){
-    cVector[number] = aVector[number] | bVector[number];
-  }
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        cVector[number] = aVector[number] | bVector[number];
+    }
  }
  #endif /* LV_HAVE_AVX512F */
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_32i_x2_or_32i_a_avx2(int32_t* cVector, const int32_t* aVector,
-                          const int32_t* bVector, unsigned int num_points)
+static inline void volk_32i_x2_or_32i_a_avx2(int32_t* cVector,
+                                             const int32_t* aVector,
+                                             const int32_t* bVector,
+                                             unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int oneEightPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int oneEightPoints = num_points / 8;
  
-  int32_t* cPtr = cVector;
-  const int32_t* aPtr = aVector;
-  const int32_t* bPtr = bVector;
+    int32_t* cPtr = cVector;
+    const int32_t* aPtr = aVector;
+    const int32_t* bPtr = bVector;
  
-  __m256i aVal, bVal, cVal;
-  for(;number < oneEightPoints; number++){
+    __m256i aVal, bVal, cVal;
+    for (; number < oneEightPoints; number++) {
  
-    aVal = _mm256_load_si256((__m256i*)aPtr);
-    bVal = _mm256_load_si256((__m256i*)bPtr);
+        aVal = _mm256_load_si256((__m256i*)aPtr);
+        bVal = _mm256_load_si256((__m256i*)bPtr);
  
-    cVal = _mm256_or_si256(aVal, bVal);
+        cVal = _mm256_or_si256(aVal, bVal);
  
-    _mm256_store_si256((__m256i*)cPtr,cVal); // Store the results back into the C container
+        _mm256_store_si256((__m256i*)cPtr,
+                           cVal); // Store the results back into the C container
  
-    aPtr += 8;
-    bPtr += 8;
-    cPtr += 8;
-  }
+        aPtr += 8;
+        bPtr += 8;
+        cPtr += 8;
+    }
  
-  number = oneEightPoints * 8;
-  for(;number < num_points; number++){
-    cVector[number] = aVector[number] | bVector[number];
-  }
+    number = oneEightPoints * 8;
+    for (; number < num_points; number++) {
+        cVector[number] = aVector[number] | bVector[number];
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
@@ -160,35 +163,36 @@ volk_32i_x2_or_32i_a_avx2(int32_t* cVector, const int32_t* aVector,
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_32i_x2_or_32i_a_sse(int32_t* cVector, const int32_t* aVector,
-                         const int32_t* bVector, unsigned int num_points)
+static inline void volk_32i_x2_or_32i_a_sse(int32_t* cVector,
+                                            const int32_t* aVector,
+                                            const int32_t* bVector,
+                                            unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  float* cPtr = (float*)cVector;
-  const float* aPtr = (float*)aVector;
-  const float* bPtr = (float*)bVector;
+    float* cPtr = (float*)cVector;
+    const float* aPtr = (float*)aVector;
+    const float* bPtr = (float*)bVector;
  
-  __m128 aVal, bVal, cVal;
-  for(;number < quarterPoints; number++){
-    aVal = _mm_load_ps(aPtr);
-    bVal = _mm_load_ps(bPtr);
+    __m128 aVal, bVal, cVal;
+    for (; number < quarterPoints; number++) {
+        aVal = _mm_load_ps(aPtr);
+        bVal = _mm_load_ps(bPtr);
  
-    cVal = _mm_or_ps(aVal, bVal);
+        cVal = _mm_or_ps(aVal, bVal);
  
-    _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+        _mm_store_ps(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 4;
-    bPtr += 4;
-    cPtr += 4;
-  }
+        aPtr += 4;
+        bPtr += 4;
+        cPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    cVector[number] = aVector[number] | bVector[number];
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        cVector[number] = aVector[number] | bVector[number];
+    }
  }
  #endif /* LV_HAVE_SSE */
  
@@ -196,63 +200,67 @@ volk_32i_x2_or_32i_a_sse(int32_t* cVector, const int32_t* aVector,
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void
-volk_32i_x2_or_32i_neon(int32_t* cVector, const int32_t* aVector,
-                        const int32_t* bVector, unsigned int num_points)
+static inline void volk_32i_x2_or_32i_neon(int32_t* cVector,
+                                           const int32_t* aVector,
+                                           const int32_t* bVector,
+                                           unsigned int num_points)
  {
-  int32_t* cPtr = cVector;
-  const int32_t* aPtr = aVector;
-  const int32_t* bPtr=  bVector;
-  unsigned int number = 0;
-  unsigned int quarter_points = num_points / 4;
-
-  int32x4_t a_val, b_val, c_val;
-
-  for(number = 0; number < quarter_points; number++){
-    a_val = vld1q_s32(aPtr);
-    b_val = vld1q_s32(bPtr);
-    c_val = vorrq_s32(a_val, b_val);
-    vst1q_s32(cPtr, c_val);
-    aPtr += 4;
-    bPtr += 4;
-    cPtr += 4;
-  }
-
-  for(number = quarter_points * 4; number < num_points; number++){
-    *cPtr++ = (*aPtr++) | (*bPtr++);
-  }
+    int32_t* cPtr = cVector;
+    const int32_t* aPtr = aVector;
+    const int32_t* bPtr = bVector;
+    unsigned int number = 0;
+    unsigned int quarter_points = num_points / 4;
+
+    int32x4_t a_val, b_val, c_val;
+
+    for (number = 0; number < quarter_points; number++) {
+        a_val = vld1q_s32(aPtr);
+        b_val = vld1q_s32(bPtr);
+        c_val = vorrq_s32(a_val, b_val);
+        vst1q_s32(cPtr, c_val);
+        aPtr += 4;
+        bPtr += 4;
+        cPtr += 4;
+    }
+
+    for (number = quarter_points * 4; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) | (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_NEON */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32i_x2_or_32i_generic(int32_t* cVector, const int32_t* aVector,
-                           const int32_t* bVector, unsigned int num_points)
+static inline void volk_32i_x2_or_32i_generic(int32_t* cVector,
+                                              const int32_t* aVector,
+                                              const int32_t* bVector,
+                                              unsigned int num_points)
  {
-  int32_t* cPtr = cVector;
-  const int32_t* aPtr = aVector;
-  const int32_t* bPtr=  bVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *cPtr++ = (*aPtr++) | (*bPtr++);
-  }
+    int32_t* cPtr = cVector;
+    const int32_t* aPtr = aVector;
+    const int32_t* bPtr = bVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) | (*bPtr++);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  
  #ifdef LV_HAVE_ORC
-extern void
-volk_32i_x2_or_32i_a_orc_impl(int32_t* cVector, const int32_t* aVector,
-                              const int32_t* bVector, unsigned int num_points);
-
-static inline void
-volk_32i_x2_or_32i_u_orc(int32_t* cVector, const int32_t* aVector,
-                         const int32_t* bVector, unsigned int num_points)
+extern void volk_32i_x2_or_32i_a_orc_impl(int32_t* cVector,
+                                          const int32_t* aVector,
+                                          const int32_t* bVector,
+                                          unsigned int num_points);
+
+static inline void volk_32i_x2_or_32i_u_orc(int32_t* cVector,
+                                            const int32_t* aVector,
+                                            const int32_t* bVector,
+                                            unsigned int num_points)
  {
-  volk_32i_x2_or_32i_a_orc_impl(cVector, aVector, bVector, num_points);
+    volk_32i_x2_or_32i_a_orc_impl(cVector, aVector, bVector, num_points);
  }
  #endif /* LV_HAVE_ORC */
  
@@ -269,72 +277,75 @@ volk_32i_x2_or_32i_u_orc(int32_t* cVector, const int32_t* aVector,
  #ifdef LV_HAVE_AVX512F
  #include <immintrin.h>
  
-static inline void
-volk_32i_x2_or_32i_u_avx512f(int32_t* cVector, const int32_t* aVector,
-                          const int32_t* bVector, unsigned int num_points)
+static inline void volk_32i_x2_or_32i_u_avx512f(int32_t* cVector,
+                                                const int32_t* aVector,
+                                                const int32_t* bVector,
+                                                unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
  
-  int32_t* cPtr = (int32_t*)cVector;
-  const int32_t* aPtr = (int32_t*)aVector;
-  const int32_t* bPtr = (int32_t*)bVector;
+    int32_t* cPtr = (int32_t*)cVector;
+    const int32_t* aPtr = (int32_t*)aVector;
+    const int32_t* bPtr = (int32_t*)bVector;
  
-  __m512i aVal, bVal, cVal;
-  for(;number < sixteenthPoints; number++){
+    __m512i aVal, bVal, cVal;
+    for (; number < sixteenthPoints; number++) {
  
-    aVal = _mm512_loadu_si512(aPtr);
-    bVal = _mm512_loadu_si512(bPtr);
+        aVal = _mm512_loadu_si512(aPtr);
+        bVal = _mm512_loadu_si512(bPtr);
  
-    cVal = _mm512_or_si512(aVal, bVal);
+        cVal = _mm512_or_si512(aVal, bVal);
  
-    _mm512_storeu_si512(cPtr,cVal); // Store the results back into the C container
+        _mm512_storeu_si512(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 16;
-    bPtr += 16;
-    cPtr += 16;
-  }
+        aPtr += 16;
+        bPtr += 16;
+        cPtr += 16;
+    }
  
-  number = sixteenthPoints * 16;
-  for(;number < num_points; number++){
-    cVector[number] = aVector[number] | bVector[number];
-  }
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        cVector[number] = aVector[number] | bVector[number];
+    }
  }
  #endif /* LV_HAVE_AVX512F */
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_32i_x2_or_32i_u_avx2(int32_t* cVector, const int32_t* aVector,
-                          const int32_t* bVector, unsigned int num_points)
+static inline void volk_32i_x2_or_32i_u_avx2(int32_t* cVector,
+                                             const int32_t* aVector,
+                                             const int32_t* bVector,
+                                             unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int oneEightPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int oneEightPoints = num_points / 8;
  
-  int32_t* cPtr = cVector;
-  const int32_t* aPtr = aVector;
-  const int32_t* bPtr = bVector;
+    int32_t* cPtr = cVector;
+    const int32_t* aPtr = aVector;
+    const int32_t* bPtr = bVector;
  
-  __m256i aVal, bVal, cVal;
-  for(;number < oneEightPoints; number++){
+    __m256i aVal, bVal, cVal;
+    for (; number < oneEightPoints; number++) {
  
-    aVal = _mm256_loadu_si256((__m256i*)aPtr);
-    bVal = _mm256_loadu_si256((__m256i*)bPtr);
+        aVal = _mm256_loadu_si256((__m256i*)aPtr);
+        bVal = _mm256_loadu_si256((__m256i*)bPtr);
  
-    cVal = _mm256_or_si256(aVal, bVal);
+        cVal = _mm256_or_si256(aVal, bVal);
  
-    _mm256_storeu_si256((__m256i*)cPtr,cVal); // Store the results back into the C container
+        _mm256_storeu_si256((__m256i*)cPtr,
+                            cVal); // Store the results back into the C container
  
-    aPtr += 8;
-    bPtr += 8;
-    cPtr += 8;
-  }
+        aPtr += 8;
+        bPtr += 8;
+        cPtr += 8;
+    }
  
-  number = oneEightPoints * 8;
-  for(;number < num_points; number++){
-    cVector[number] = aVector[number] | bVector[number];
-  }
+    number = oneEightPoints * 8;
+    for (; number < num_points; number++) {
+        cVector[number] = aVector[number] | bVector[number];
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
diff --git a/kernels/volk/volk_32u_byteswap.h b/kernels/volk/volk_32u_byteswap.h

index f5e6f11c0dada753831927fb88b0c8254fd149ee..185047cb4d4f6235f018dbbb6c9d244cd816c36b 100644 (file)
--- a/kernels/volk/volk_32u_byteswap.h
+++ b/kernels/volk/volk_32u_byteswap.h
@@ -71,38 +71,42 @@
  
  #if LV_HAVE_AVX2
  #include <immintrin.h>
-static inline void volk_32u_byteswap_u_avx2(uint32_t* intsToSwap, unsigned int num_points){
+static inline void volk_32u_byteswap_u_avx2(uint32_t* intsToSwap, unsigned int num_points)
+{
  
-  unsigned int number;
+    unsigned int number;
  
-  const unsigned int nPerSet = 8;
-  const uint64_t     nSets   = num_points / nPerSet;
+    const unsigned int nPerSet = 8;
+    const uint64_t nSets = num_points / nPerSet;
  
-  uint32_t* inputPtr = intsToSwap;
+    uint32_t* inputPtr = intsToSwap;
  
-  const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 };
+    const uint8_t shuffleVector[32] = { 3,  2,  1,  0,  7,  6,  5,  4,  11, 10, 9,
+                                        8,  15, 14, 13, 12, 19, 18, 17, 16, 23, 22,
+                                        21, 20, 27, 26, 25, 24, 31, 30, 29, 28 };
  
-  const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector);
+    const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector);
  
-  for (number = 0 ;number < nSets; number++) {
+    for (number = 0; number < nSets; number++) {
  
-    // Load the 32t values, increment inputPtr later since we're doing it in-place.
-    const __m256i input  = _mm256_loadu_si256((__m256i*)inputPtr);
-    const __m256i output = _mm256_shuffle_epi8(input,myShuffle);
+        // Load the 32t values, increment inputPtr later since we're doing it in-place.
+        const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
+        const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
  
-    // Store the results
-    _mm256_storeu_si256((__m256i*)inputPtr, output);
-    inputPtr += nPerSet;
-  }
-  _mm256_zeroupper();
-
-  // Byteswap any remaining points:
-  for(number = nSets * nPerSet; number < num_points; number++){
-    uint32_t outputVal = *inputPtr;
-    outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
-    *inputPtr = outputVal;
-    inputPtr++;
-  }
+        // Store the results
+        _mm256_storeu_si256((__m256i*)inputPtr, output);
+        inputPtr += nPerSet;
+    }
+    _mm256_zeroupper();
+
+    // Byteswap any remaining points:
+    for (number = nSets * nPerSet; number < num_points; number++) {
+        uint32_t outputVal = *inputPtr;
+        outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
+                     ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
+        *inputPtr = outputVal;
+        inputPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
@@ -110,42 +114,44 @@ static inline void volk_32u_byteswap_u_avx2(uint32_t* intsToSwap, unsigned int n
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points){
-  unsigned int number = 0;
-
-  uint32_t* inputPtr = intsToSwap;
-  __m128i input, byte1, byte2, byte3, byte4, output;
-  __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
-  __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
-
-  const uint64_t quarterPoints = num_points / 4;
-  for(;number < quarterPoints; number++){
-    // Load the 32t values, increment inputPtr later since we're doing it in-place.
-    input = _mm_loadu_si128((__m128i*)inputPtr);
-    // Do the four shifts
-    byte1 = _mm_slli_epi32(input, 24);
-    byte2 = _mm_slli_epi32(input, 8);
-    byte3 = _mm_srli_epi32(input, 8);
-    byte4 = _mm_srli_epi32(input, 24);
-    // Or bytes together
-    output = _mm_or_si128(byte1, byte4);
-    byte2 = _mm_and_si128(byte2, byte2mask);
-    output = _mm_or_si128(output, byte2);
-    byte3 = _mm_and_si128(byte3, byte3mask);
-    output = _mm_or_si128(output, byte3);
-    // Store the results
-    _mm_storeu_si128((__m128i*)inputPtr, output);
-    inputPtr += 4;
-  }
-
-  // Byteswap any remaining points:
-  number = quarterPoints*4;
-  for(; number < num_points; number++){
-    uint32_t outputVal = *inputPtr;
-    outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
-    *inputPtr = outputVal;
-    inputPtr++;
-  }
+static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points)
+{
+    unsigned int number = 0;
+
+    uint32_t* inputPtr = intsToSwap;
+    __m128i input, byte1, byte2, byte3, byte4, output;
+    __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
+    __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
+
+    const uint64_t quarterPoints = num_points / 4;
+    for (; number < quarterPoints; number++) {
+        // Load the 32t values, increment inputPtr later since we're doing it in-place.
+        input = _mm_loadu_si128((__m128i*)inputPtr);
+        // Do the four shifts
+        byte1 = _mm_slli_epi32(input, 24);
+        byte2 = _mm_slli_epi32(input, 8);
+        byte3 = _mm_srli_epi32(input, 8);
+        byte4 = _mm_srli_epi32(input, 24);
+        // Or bytes together
+        output = _mm_or_si128(byte1, byte4);
+        byte2 = _mm_and_si128(byte2, byte2mask);
+        output = _mm_or_si128(output, byte2);
+        byte3 = _mm_and_si128(byte3, byte3mask);
+        output = _mm_or_si128(output, byte3);
+        // Store the results
+        _mm_storeu_si128((__m128i*)inputPtr, output);
+        inputPtr += 4;
+    }
+
+    // Byteswap any remaining points:
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        uint32_t outputVal = *inputPtr;
+        outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
+                     ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
+        *inputPtr = outputVal;
+        inputPtr++;
+    }
  }
  #endif /* LV_HAVE_SSE2 */
  
@@ -153,100 +159,106 @@ static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int n
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void volk_32u_byteswap_neon(uint32_t* intsToSwap, unsigned int num_points){
-  uint32_t* inputPtr = intsToSwap;
-  unsigned int number = 0;
-  unsigned int n8points = num_points / 8;
-
-  uint8x8x4_t input_table;
-  uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
-  uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
-
-  /* these magic numbers are used as byte-indices in the LUT.
-     they are pre-computed to save time. A simple C program
-     can calculate them; for example for lookup01:
-    uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1};
-    for(ii=0; ii < 8; ++ii) {
-        index += ((uint64_t)(*(chars+ii))) << (ii*8);
+static inline void volk_32u_byteswap_neon(uint32_t* intsToSwap, unsigned int num_points)
+{
+    uint32_t* inputPtr = intsToSwap;
+    unsigned int number = 0;
+    unsigned int n8points = num_points / 8;
+
+    uint8x8x4_t input_table;
+    uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
+    uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
+
+    /* these magic numbers are used as byte-indices in the LUT.
+       they are pre-computed to save time. A simple C program
+       can calculate them; for example for lookup01:
+      uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1};
+      for(ii=0; ii < 8; ++ii) {
+          index += ((uint64_t)(*(chars+ii))) << (ii*8);
+      }
+    */
+    int_lookup01 = vcreate_u8(74609667900706840);
+    int_lookup23 = vcreate_u8(219290013576860186);
+    int_lookup45 = vcreate_u8(363970359253013532);
+    int_lookup67 = vcreate_u8(508650704929166878);
+
+    for (number = 0; number < n8points; ++number) {
+        input_table = vld4_u8((uint8_t*)inputPtr);
+        swapped_int01 = vtbl4_u8(input_table, int_lookup01);
+        swapped_int23 = vtbl4_u8(input_table, int_lookup23);
+        swapped_int45 = vtbl4_u8(input_table, int_lookup45);
+        swapped_int67 = vtbl4_u8(input_table, int_lookup67);
+        vst1_u8((uint8_t*)inputPtr, swapped_int01);
+        vst1_u8((uint8_t*)(inputPtr + 2), swapped_int23);
+        vst1_u8((uint8_t*)(inputPtr + 4), swapped_int45);
+        vst1_u8((uint8_t*)(inputPtr + 6), swapped_int67);
+
+        inputPtr += 8;
+    }
+
+    for (number = n8points * 8; number < num_points; ++number) {
+        uint32_t output = *inputPtr;
+        output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
+                  ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
+
+        *inputPtr = output;
+        inputPtr++;
      }
-  */
-  int_lookup01 = vcreate_u8(74609667900706840);
-  int_lookup23 = vcreate_u8(219290013576860186);
-  int_lookup45 = vcreate_u8(363970359253013532);
-  int_lookup67 = vcreate_u8(508650704929166878);
-
-  for(number = 0; number < n8points; ++number){
-    input_table = vld4_u8((uint8_t*) inputPtr);
-    swapped_int01 = vtbl4_u8(input_table, int_lookup01);
-    swapped_int23 = vtbl4_u8(input_table, int_lookup23);
-    swapped_int45 = vtbl4_u8(input_table, int_lookup45);
-    swapped_int67 = vtbl4_u8(input_table, int_lookup67);
-    vst1_u8((uint8_t*) inputPtr, swapped_int01);
-    vst1_u8((uint8_t*) (inputPtr+2), swapped_int23);
-    vst1_u8((uint8_t*) (inputPtr+4), swapped_int45);
-    vst1_u8((uint8_t*) (inputPtr+6), swapped_int67);
-
-    inputPtr += 8;
-  }
-
-  for(number = n8points * 8; number < num_points; ++number){
-    uint32_t output = *inputPtr;
-    output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
-
-    *inputPtr = output;
-    inputPtr++;
-  }
  }
  #endif /* LV_HAVE_NEON */
  
  #ifdef LV_HAVE_NEONV8
  #include <arm_neon.h>
  
-static inline void volk_32u_byteswap_neonv8(uint32_t* intsToSwap, unsigned int num_points){
-  uint32_t* inputPtr = (uint32_t*)intsToSwap;
-  const unsigned int n8points = num_points / 8;
-  uint8x16_t input;
-  uint8x16_t idx = { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 };
-
-  unsigned int number = 0;
-  for(number = 0; number < n8points; ++number){
-    __VOLK_PREFETCH(inputPtr+8);
-    input = vld1q_u8((uint8_t*) inputPtr);
-    input = vqtbl1q_u8(input, idx);
-    vst1q_u8((uint8_t*) inputPtr, input);
-    inputPtr += 4;
-
-    input = vld1q_u8((uint8_t*) inputPtr);
-    input = vqtbl1q_u8(input, idx);
-    vst1q_u8((uint8_t*) inputPtr, input);
-    inputPtr += 4;
-  }
-
-  for(number = n8points * 8; number < num_points; ++number){
-    uint32_t output = *inputPtr;
+static inline void volk_32u_byteswap_neonv8(uint32_t* intsToSwap, unsigned int num_points)
+{
+    uint32_t* inputPtr = (uint32_t*)intsToSwap;
+    const unsigned int n8points = num_points / 8;
+    uint8x16_t input;
+    uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+
+    unsigned int number = 0;
+    for (number = 0; number < n8points; ++number) {
+        __VOLK_PREFETCH(inputPtr + 8);
+        input = vld1q_u8((uint8_t*)inputPtr);
+        input = vqtbl1q_u8(input, idx);
+        vst1q_u8((uint8_t*)inputPtr, input);
+        inputPtr += 4;
+
+        input = vld1q_u8((uint8_t*)inputPtr);
+        input = vqtbl1q_u8(input, idx);
+        vst1q_u8((uint8_t*)inputPtr, input);
+        inputPtr += 4;
+    }
  
-    output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
+    for (number = n8points * 8; number < num_points; ++number) {
+        uint32_t output = *inputPtr;
  
-    *inputPtr++ = output;
-  }
+        output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
+                  ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
  
+        *inputPtr++ = output;
+    }
  }
  #endif /* LV_HAVE_NEONV8 */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap, unsigned int num_points){
-  uint32_t* inputPtr = intsToSwap;
+static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap,
+                                             unsigned int num_points)
+{
+    uint32_t* inputPtr = intsToSwap;
  
-  unsigned int point;
-  for(point = 0; point < num_points; point++){
-    uint32_t output = *inputPtr;
-    output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
+    unsigned int point;
+    for (point = 0; point < num_points; point++) {
+        uint32_t output = *inputPtr;
+        output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
+                  ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
  
-    *inputPtr = output;
-    inputPtr++;
-  }
+        *inputPtr = output;
+        inputPtr++;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -261,38 +273,42 @@ static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap, unsigned int
  
  #if LV_HAVE_AVX2
  #include <immintrin.h>
-static inline void volk_32u_byteswap_a_avx2(uint32_t* intsToSwap, unsigned int num_points){
+static inline void volk_32u_byteswap_a_avx2(uint32_t* intsToSwap, unsigned int num_points)
+{
  
-  unsigned int number;
+    unsigned int number;
  
-  const unsigned int nPerSet = 8;
-  const uint64_t     nSets   = num_points / nPerSet;
+    const unsigned int nPerSet = 8;
+    const uint64_t nSets = num_points / nPerSet;
  
-  uint32_t* inputPtr = intsToSwap;
+    uint32_t* inputPtr = intsToSwap;
  
-  const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 };
+    const uint8_t shuffleVector[32] = { 3,  2,  1,  0,  7,  6,  5,  4,  11, 10, 9,
+                                        8,  15, 14, 13, 12, 19, 18, 17, 16, 23, 22,
+                                        21, 20, 27, 26, 25, 24, 31, 30, 29, 28 };
  
-  const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector);
+    const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector);
  
-  for (number = 0 ;number < nSets; number++) {
+    for (number = 0; number < nSets; number++) {
  
-    // Load the 32t values, increment inputPtr later since we're doing it in-place.
-    const __m256i input  = _mm256_load_si256((__m256i*)inputPtr);
-    const __m256i output = _mm256_shuffle_epi8(input,myShuffle);
+        // Load the 32t values, increment inputPtr later since we're doing it in-place.
+        const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
+        const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
  
-    // Store the results
-    _mm256_store_si256((__m256i*)inputPtr, output);
-    inputPtr += nPerSet;
-  }
-  _mm256_zeroupper();
-
-  // Byteswap any remaining points:
-  for(number = nSets * nPerSet; number < num_points; number++){
-    uint32_t outputVal = *inputPtr;
-    outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
-    *inputPtr = outputVal;
-    inputPtr++;
-  }
+        // Store the results
+        _mm256_store_si256((__m256i*)inputPtr, output);
+        inputPtr += nPerSet;
+    }
+    _mm256_zeroupper();
+
+    // Byteswap any remaining points:
+    for (number = nSets * nPerSet; number < num_points; number++) {
+        uint32_t outputVal = *inputPtr;
+        outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
+                     ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
+        *inputPtr = outputVal;
+        inputPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
@@ -301,63 +317,66 @@ static inline void volk_32u_byteswap_a_avx2(uint32_t* intsToSwap, unsigned int n
  #include <emmintrin.h>
  
  
-static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int num_points){
-  unsigned int number = 0;
-
-  uint32_t* inputPtr = intsToSwap;
-  __m128i input, byte1, byte2, byte3, byte4, output;
-  __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
-  __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
-
-  const uint64_t quarterPoints = num_points / 4;
-  for(;number < quarterPoints; number++){
-    // Load the 32t values, increment inputPtr later since we're doing it in-place.
-    input = _mm_load_si128((__m128i*)inputPtr);
-    // Do the four shifts
-    byte1 = _mm_slli_epi32(input, 24);
-    byte2 = _mm_slli_epi32(input, 8);
-    byte3 = _mm_srli_epi32(input, 8);
-    byte4 = _mm_srli_epi32(input, 24);
-    // Or bytes together
-    output = _mm_or_si128(byte1, byte4);
-    byte2 = _mm_and_si128(byte2, byte2mask);
-    output = _mm_or_si128(output, byte2);
-    byte3 = _mm_and_si128(byte3, byte3mask);
-    output = _mm_or_si128(output, byte3);
-    // Store the results
-    _mm_store_si128((__m128i*)inputPtr, output);
-    inputPtr += 4;
-  }
-
-  // Byteswap any remaining points:
-  number = quarterPoints*4;
-  for(; number < num_points; number++){
-    uint32_t outputVal = *inputPtr;
-    outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
-    *inputPtr = outputVal;
-    inputPtr++;
-  }
+static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int num_points)
+{
+    unsigned int number = 0;
+
+    uint32_t* inputPtr = intsToSwap;
+    __m128i input, byte1, byte2, byte3, byte4, output;
+    __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
+    __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
+
+    const uint64_t quarterPoints = num_points / 4;
+    for (; number < quarterPoints; number++) {
+        // Load the 32t values, increment inputPtr later since we're doing it in-place.
+        input = _mm_load_si128((__m128i*)inputPtr);
+        // Do the four shifts
+        byte1 = _mm_slli_epi32(input, 24);
+        byte2 = _mm_slli_epi32(input, 8);
+        byte3 = _mm_srli_epi32(input, 8);
+        byte4 = _mm_srli_epi32(input, 24);
+        // Or bytes together
+        output = _mm_or_si128(byte1, byte4);
+        byte2 = _mm_and_si128(byte2, byte2mask);
+        output = _mm_or_si128(output, byte2);
+        byte3 = _mm_and_si128(byte3, byte3mask);
+        output = _mm_or_si128(output, byte3);
+        // Store the results
+        _mm_store_si128((__m128i*)inputPtr, output);
+        inputPtr += 4;
+    }
+
+    // Byteswap any remaining points:
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        uint32_t outputVal = *inputPtr;
+        outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
+                     ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
+        *inputPtr = outputVal;
+        inputPtr++;
+    }
  }
  #endif /* LV_HAVE_SSE2 */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void volk_32u_byteswap_a_generic(uint32_t* intsToSwap, unsigned int num_points){
-  uint32_t* inputPtr = intsToSwap;
+static inline void volk_32u_byteswap_a_generic(uint32_t* intsToSwap,
+                                               unsigned int num_points)
+{
+    uint32_t* inputPtr = intsToSwap;
  
-  unsigned int point;
-  for(point = 0; point < num_points; point++){
-    uint32_t output = *inputPtr;
-    output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
+    unsigned int point;
+    for (point = 0; point < num_points; point++) {
+        uint32_t output = *inputPtr;
+        output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
+                  ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
  
-    *inputPtr = output;
-    inputPtr++;
-  }
+        *inputPtr = output;
+        inputPtr++;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  
-
-
  #endif /* INCLUDED_volk_32u_byteswap_a_H */
diff --git a/kernels/volk/volk_32u_byteswappuppet_32u.h b/kernels/volk/volk_32u_byteswappuppet_32u.h

index c33a5fce27ab78b4385895364a0d613b5f99b647..ca5ca17709252e618c4462dab8c0f10cb5ecfd3d 100644 (file)
--- a/kernels/volk/volk_32u_byteswappuppet_32u.h
+++ b/kernels/volk/volk_32u_byteswappuppet_32u.h
@@ -1,70 +1,84 @@
  #ifndef INCLUDED_volk_32u_byteswappuppet_32u_H
  #define INCLUDED_volk_32u_byteswappuppet_32u_H
  
-#include <volk/volk_32u_byteswap.h>
  #include <stdint.h>
  #include <string.h>
+#include <volk/volk_32u_byteswap.h>
  
  #ifdef LV_HAVE_GENERIC
-static inline void volk_32u_byteswappuppet_32u_generic(uint32_t*output, uint32_t* intsToSwap, unsigned int num_points){
+static inline void volk_32u_byteswappuppet_32u_generic(uint32_t* output,
+                                                       uint32_t* intsToSwap,
+                                                       unsigned int num_points)
+{
  
      volk_32u_byteswap_generic((uint32_t*)intsToSwap, num_points);
      memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
-
  }
  #endif
  
  #ifdef LV_HAVE_NEON
-static inline void volk_32u_byteswappuppet_32u_neon(uint32_t*output, uint32_t* intsToSwap, unsigned int num_points){
+static inline void volk_32u_byteswappuppet_32u_neon(uint32_t* output,
+                                                    uint32_t* intsToSwap,
+                                                    unsigned int num_points)
+{
  
      volk_32u_byteswap_neon((uint32_t*)intsToSwap, num_points);
      memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
-
  }
  #endif
  
  #ifdef LV_HAVE_NEONV8
-static inline void volk_32u_byteswappuppet_32u_neonv8(uint32_t*output, uint32_t* intsToSwap, unsigned int num_points){
+static inline void volk_32u_byteswappuppet_32u_neonv8(uint32_t* output,
+                                                      uint32_t* intsToSwap,
+                                                      unsigned int num_points)
+{
  
      volk_32u_byteswap_neonv8((uint32_t*)intsToSwap, num_points);
      memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
-
  }
  #endif
  
  #ifdef LV_HAVE_SSE2
-static inline void volk_32u_byteswappuppet_32u_u_sse2(uint32_t *output, uint32_t* intsToSwap, unsigned int num_points){
+static inline void volk_32u_byteswappuppet_32u_u_sse2(uint32_t* output,
+                                                      uint32_t* intsToSwap,
+                                                      unsigned int num_points)
+{
  
      volk_32u_byteswap_u_sse2((uint32_t*)intsToSwap, num_points);
      memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
-
  }
  #endif
  
  #ifdef LV_HAVE_SSE2
-static inline void volk_32u_byteswappuppet_32u_a_sse2(uint32_t* output, uint32_t* intsToSwap, unsigned int num_points){
+static inline void volk_32u_byteswappuppet_32u_a_sse2(uint32_t* output,
+                                                      uint32_t* intsToSwap,
+                                                      unsigned int num_points)
+{
  
      volk_32u_byteswap_a_sse2((uint32_t*)intsToSwap, num_points);
      memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
-
  }
  #endif
  
  #ifdef LV_HAVE_AVX2
-static inline void volk_32u_byteswappuppet_32u_u_avx2(uint32_t* output, uint32_t* intsToSwap, unsigned int num_points){
+static inline void volk_32u_byteswappuppet_32u_u_avx2(uint32_t* output,
+                                                      uint32_t* intsToSwap,
+                                                      unsigned int num_points)
+{
  
      volk_32u_byteswap_u_avx2((uint32_t*)intsToSwap, num_points);
      memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
-
  }
  #endif
  
  #ifdef LV_HAVE_AVX2
-static inline void volk_32u_byteswappuppet_32u_a_avx2(uint32_t* output, uint32_t* intsToSwap, unsigned int num_points){
+static inline void volk_32u_byteswappuppet_32u_a_avx2(uint32_t* output,
+                                                      uint32_t* intsToSwap,
+                                                      unsigned int num_points)
+{
  
      volk_32u_byteswap_a_avx2((uint32_t*)intsToSwap, num_points);
      memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
-
  }
  #endif
  
diff --git a/kernels/volk/volk_32u_popcnt.h b/kernels/volk/volk_32u_popcnt.h

index 7aa4d433949b754d7110338f8786716462802a35..f6f0c109a6b2a73e2b3268d8590edfb3278cac30 100644 (file)
--- a/kernels/volk/volk_32u_popcnt.h
+++ b/kernels/volk/volk_32u_popcnt.h
@@ -56,24 +56,23 @@
  #ifndef INCLUDED_VOLK_32u_POPCNT_A16_H
  #define INCLUDED_VOLK_32u_POPCNT_A16_H
  
-#include <stdio.h>
  #include <inttypes.h>
+#include <stdio.h>
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_32u_popcnt_generic(uint32_t* ret, const uint32_t value)
+static inline void volk_32u_popcnt_generic(uint32_t* ret, const uint32_t value)
  {
-  // This is faster than a lookup table
-  uint32_t retVal = value;
+    // This is faster than a lookup table
+    uint32_t retVal = value;
  
-  retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
-  retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
-  retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
-  retVal = (retVal + (retVal >> 8));
-  retVal = (retVal + (retVal >> 16)) & 0x0000003F;
+    retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
+    retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
+    retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
+    retVal = (retVal + (retVal >> 8));
+    retVal = (retVal + (retVal >> 16)) & 0x0000003F;
  
-  *ret = retVal;
+    *ret = retVal;
  }
  
  #endif /*LV_HAVE_GENERIC*/
@@ -83,10 +82,9 @@ volk_32u_popcnt_generic(uint32_t* ret, const uint32_t value)
  
  #include <nmmintrin.h>
  
-static inline void
-volk_32u_popcnt_a_sse4_2(uint32_t* ret, const uint32_t value)
+static inline void volk_32u_popcnt_a_sse4_2(uint32_t* ret, const uint32_t value)
  {
-  *ret = _mm_popcnt_u32(value);
+    *ret = _mm_popcnt_u32(value);
  }
  
  #endif /*LV_HAVE_SSE4_2*/
diff --git a/kernels/volk/volk_32u_popcntpuppet_32u.h b/kernels/volk/volk_32u_popcntpuppet_32u.h

index d5edd35b81c1151b994dcec1fd772bf40a5dbdf7..c0389cc5e9e0b60789fa3cab2b036e35412a8cce 100644 (file)
--- a/kernels/volk/volk_32u_popcntpuppet_32u.h
+++ b/kernels/volk/volk_32u_popcntpuppet_32u.h
@@ -27,19 +27,25 @@
  #include <volk/volk_32u_popcnt.h>
  
  #ifdef LV_HAVE_GENERIC
-static inline void volk_32u_popcntpuppet_32u_generic(uint32_t* outVector, const uint32_t* inVector, unsigned int num_points){
+static inline void volk_32u_popcntpuppet_32u_generic(uint32_t* outVector,
+                                                     const uint32_t* inVector,
+                                                     unsigned int num_points)
+{
      unsigned int ii;
-    for(ii=0; ii < num_points; ++ii) {
-        volk_32u_popcnt_generic(outVector+ii, *(inVector+ii) );
+    for (ii = 0; ii < num_points; ++ii) {
+        volk_32u_popcnt_generic(outVector + ii, *(inVector + ii));
      }
  }
  #endif /* LV_HAVE_GENERIC */
  
  #ifdef LV_HAVE_SSE4_2
-static inline void volk_32u_popcntpuppet_32u_a_sse4_2(uint32_t* outVector, const uint32_t* inVector, unsigned int num_points){
+static inline void volk_32u_popcntpuppet_32u_a_sse4_2(uint32_t* outVector,
+                                                      const uint32_t* inVector,
+                                                      unsigned int num_points)
+{
      unsigned int ii;
-    for(ii=0; ii < num_points; ++ii) {
-        volk_32u_popcnt_a_sse4_2(outVector+ii, *(inVector+ii) );
+    for (ii = 0; ii < num_points; ++ii) {
+        volk_32u_popcnt_a_sse4_2(outVector + ii, *(inVector + ii));
      }
  }
  #endif /* LV_HAVE_SSE4_2 */
diff --git a/kernels/volk/volk_32u_reverse_32u.h b/kernels/volk/volk_32u_reverse_32u.h

index b670b1333e30424f5554775e7f0a4306c666130e..aff0a9e0ead5157cd891aa84faf4352a2296f27d 100644 (file)
--- a/kernels/volk/volk_32u_reverse_32u.h
+++ b/kernels/volk/volk_32u_reverse_32u.h
@@ -24,7 +24,8 @@
   * \b bit reversal of the input 32 bit word
  
   * <b>Dispatcher Prototype</b>
- * \code volk_32u_reverse_32u(uint32_t *outputVector, uint32_t *inputVector; unsigned int num_points);
+ * \code volk_32u_reverse_32u(uint32_t *outputVector, uint32_t *inputVector; unsigned int
+ num_points);
   * \endcode
   *
   * \b Inputs
@@ -32,338 +33,344 @@
   * \li num_points The number of data points.
   *
   * \b Outputs
- * \li outputVector: The vector where the results will be stored, which is the bit-reversed input
+ * \li outputVector: The vector where the results will be stored, which is the
+ bit-reversed input
   *
   * \endcode
   */
  #ifndef INCLUDED_VOLK_32u_REVERSE_32u_U_H
  struct dword_split {
-  int b00: 1;
-  int b01: 1;
-  int b02: 1;
-  int b03: 1;
-  int b04: 1;
-  int b05: 1;
-  int b06: 1;
-  int b07: 1;
-  int b08: 1;
-  int b09: 1;
-  int b10: 1;
-  int b11: 1;
-  int b12: 1;
-  int b13: 1;
-  int b14: 1;
-  int b15: 1;
-  int b16: 1;
-  int b17: 1;
-  int b18: 1;
-  int b19: 1;
-  int b20: 1;
-  int b21: 1;
-  int b22: 1;
-  int b23: 1;
-  int b24: 1;
-  int b25: 1;
-  int b26: 1;
-  int b27: 1;
-  int b28: 1;
-  int b29: 1;
-  int b30: 1;
-  int b31: 1;
+    int b00 : 1;
+    int b01 : 1;
+    int b02 : 1;
+    int b03 : 1;
+    int b04 : 1;
+    int b05 : 1;
+    int b06 : 1;
+    int b07 : 1;
+    int b08 : 1;
+    int b09 : 1;
+    int b10 : 1;
+    int b11 : 1;
+    int b12 : 1;
+    int b13 : 1;
+    int b14 : 1;
+    int b15 : 1;
+    int b16 : 1;
+    int b17 : 1;
+    int b18 : 1;
+    int b19 : 1;
+    int b20 : 1;
+    int b21 : 1;
+    int b22 : 1;
+    int b23 : 1;
+    int b24 : 1;
+    int b25 : 1;
+    int b26 : 1;
+    int b27 : 1;
+    int b28 : 1;
+    int b29 : 1;
+    int b30 : 1;
+    int b31 : 1;
  };
  struct char_split {
-  uint8_t b00: 1;
-  uint8_t b01: 1;
-  uint8_t b02: 1;
-  uint8_t b03: 1;
-  uint8_t b04: 1;
-  uint8_t b05: 1;
-  uint8_t b06: 1;
-  uint8_t b07: 1;
+    uint8_t b00 : 1;
+    uint8_t b01 : 1;
+    uint8_t b02 : 1;
+    uint8_t b03 : 1;
+    uint8_t b04 : 1;
+    uint8_t b05 : 1;
+    uint8_t b06 : 1;
+    uint8_t b07 : 1;
  };
  
-//Idea from "Bit Twiddling Hacks", which dedicates this method to public domain
-//http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
+// Idea from "Bit Twiddling Hacks", which dedicates this method to public domain
+// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
  static const unsigned char BitReverseTable256[] = {
-  0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30,
-  0xB0, 0x70, 0xF0, 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98,
-  0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8, 0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64,
-  0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, 0x0C, 0x8C, 0x4C, 0xCC,
-  0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, 0x02,
-  0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2,
-  0x72, 0xF2, 0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A,
-  0xDA, 0x3A, 0xBA, 0x7A, 0xFA, 0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6,
-  0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6, 0x0E, 0x8E, 0x4E, 0xCE, 0x2E,
-  0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE, 0x01, 0x81,
-  0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71,
-  0xF1, 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9,
-  0x39, 0xB9, 0x79, 0xF9, 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15,
-  0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, 0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD,
-  0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, 0x03, 0x83, 0x43,
-  0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3,
-  0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B,
-  0xBB, 0x7B, 0xFB, 0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97,
-  0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7, 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F,
-  0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF
+    0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0,
+    0x70, 0xF0, 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8,
+    0x38, 0xB8, 0x78, 0xF8, 0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94,
+    0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, 0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC,
+    0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, 0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2,
+    0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2, 0x0A, 0x8A, 0x4A, 0xCA,
+    0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA, 0x06, 0x86,
+    0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6,
+    0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE,
+    0x7E, 0xFE, 0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1,
+    0x31, 0xB1, 0x71, 0xF1, 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99,
+    0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9, 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5,
+    0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, 0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD,
+    0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, 0x03, 0x83, 0x43, 0xC3,
+    0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, 0x0B, 0x8B,
+    0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
+    0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7,
+    0x77, 0xF7, 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF,
+    0x3F, 0xBF, 0x7F, 0xFF
  };
  #ifdef LV_HAVE_GENERIC
-static inline void volk_32u_reverse_32u_dword_shuffle(uint32_t* out, const uint32_t* in,
-                           unsigned int num_points)
+static inline void volk_32u_reverse_32u_dword_shuffle(uint32_t* out,
+                                                      const uint32_t* in,
+                                                      unsigned int num_points)
  {
-  const struct dword_split *in_ptr = (const struct dword_split*)in;
-  struct dword_split * out_ptr = (struct dword_split*)out;
-  unsigned int number = 0;
-  for(; number < num_points; ++number){
-    out_ptr->b00 = in_ptr->b31;
-    out_ptr->b01 = in_ptr->b30;
-    out_ptr->b02 = in_ptr->b29;
-    out_ptr->b03 = in_ptr->b28;
-    out_ptr->b04 = in_ptr->b27;
-    out_ptr->b05 = in_ptr->b26;
-    out_ptr->b06 = in_ptr->b25;
-    out_ptr->b07 = in_ptr->b24;
-    out_ptr->b08 = in_ptr->b23;
-    out_ptr->b09 = in_ptr->b22;
-    out_ptr->b10 = in_ptr->b21;
-    out_ptr->b11 = in_ptr->b20;
-    out_ptr->b12 = in_ptr->b19;
-    out_ptr->b13 = in_ptr->b18;
-    out_ptr->b14 = in_ptr->b17;
-    out_ptr->b15 = in_ptr->b16;
-    out_ptr->b16 = in_ptr->b15;
-    out_ptr->b17 = in_ptr->b14;
-    out_ptr->b18 = in_ptr->b13;
-    out_ptr->b19 = in_ptr->b12;
-    out_ptr->b20 = in_ptr->b11;
-    out_ptr->b21 = in_ptr->b10;
-    out_ptr->b22 = in_ptr->b09;
-    out_ptr->b23 = in_ptr->b08;
-    out_ptr->b24 = in_ptr->b07;
-    out_ptr->b25 = in_ptr->b06;
-    out_ptr->b26 = in_ptr->b05;
-    out_ptr->b27 = in_ptr->b04;
-    out_ptr->b28 = in_ptr->b03;
-    out_ptr->b29 = in_ptr->b02;
-    out_ptr->b30 = in_ptr->b01;
-    out_ptr->b31 = in_ptr->b00;
-    ++in_ptr;
-    ++out_ptr;
-  }
+    const struct dword_split* in_ptr = (const struct dword_split*)in;
+    struct dword_split* out_ptr = (struct dword_split*)out;
+    unsigned int number = 0;
+    for (; number < num_points; ++number) {
+        out_ptr->b00 = in_ptr->b31;
+        out_ptr->b01 = in_ptr->b30;
+        out_ptr->b02 = in_ptr->b29;
+        out_ptr->b03 = in_ptr->b28;
+        out_ptr->b04 = in_ptr->b27;
+        out_ptr->b05 = in_ptr->b26;
+        out_ptr->b06 = in_ptr->b25;
+        out_ptr->b07 = in_ptr->b24;
+        out_ptr->b08 = in_ptr->b23;
+        out_ptr->b09 = in_ptr->b22;
+        out_ptr->b10 = in_ptr->b21;
+        out_ptr->b11 = in_ptr->b20;
+        out_ptr->b12 = in_ptr->b19;
+        out_ptr->b13 = in_ptr->b18;
+        out_ptr->b14 = in_ptr->b17;
+        out_ptr->b15 = in_ptr->b16;
+        out_ptr->b16 = in_ptr->b15;
+        out_ptr->b17 = in_ptr->b14;
+        out_ptr->b18 = in_ptr->b13;
+        out_ptr->b19 = in_ptr->b12;
+        out_ptr->b20 = in_ptr->b11;
+        out_ptr->b21 = in_ptr->b10;
+        out_ptr->b22 = in_ptr->b09;
+        out_ptr->b23 = in_ptr->b08;
+        out_ptr->b24 = in_ptr->b07;
+        out_ptr->b25 = in_ptr->b06;
+        out_ptr->b26 = in_ptr->b05;
+        out_ptr->b27 = in_ptr->b04;
+        out_ptr->b28 = in_ptr->b03;
+        out_ptr->b29 = in_ptr->b02;
+        out_ptr->b30 = in_ptr->b01;
+        out_ptr->b31 = in_ptr->b00;
+        ++in_ptr;
+        ++out_ptr;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  #ifdef LV_HAVE_GENERIC
-static inline void volk_32u_reverse_32u_byte_shuffle(uint32_t* out, const uint32_t* in,
-                           unsigned int num_points)
+static inline void volk_32u_reverse_32u_byte_shuffle(uint32_t* out,
+                                                     const uint32_t* in,
+                                                     unsigned int num_points)
  {
-  const uint32_t *in_ptr = in;
-  uint32_t *out_ptr = out;
-  unsigned int number = 0;
-  for(; number < num_points; ++number){
-    const struct char_split *in8 = (const struct char_split*)in_ptr;
-    struct char_split *out8 = (struct char_split*)out_ptr;
+    const uint32_t* in_ptr = in;
+    uint32_t* out_ptr = out;
+    unsigned int number = 0;
+    for (; number < num_points; ++number) {
+        const struct char_split* in8 = (const struct char_split*)in_ptr;
+        struct char_split* out8 = (struct char_split*)out_ptr;
  
-    out8[3].b00 = in8[0].b07;
-    out8[3].b01 = in8[0].b06;
-    out8[3].b02 = in8[0].b05;
-    out8[3].b03 = in8[0].b04;
-    out8[3].b04 = in8[0].b03;
-    out8[3].b05 = in8[0].b02;
-    out8[3].b06 = in8[0].b01;
-    out8[3].b07 = in8[0].b00;
+        out8[3].b00 = in8[0].b07;
+        out8[3].b01 = in8[0].b06;
+        out8[3].b02 = in8[0].b05;
+        out8[3].b03 = in8[0].b04;
+        out8[3].b04 = in8[0].b03;
+        out8[3].b05 = in8[0].b02;
+        out8[3].b06 = in8[0].b01;
+        out8[3].b07 = in8[0].b00;
  
-    out8[2].b00 = in8[1].b07;
-    out8[2].b01 = in8[1].b06;
-    out8[2].b02 = in8[1].b05;
-    out8[2].b03 = in8[1].b04;
-    out8[2].b04 = in8[1].b03;
-    out8[2].b05 = in8[1].b02;
-    out8[2].b06 = in8[1].b01;
-    out8[2].b07 = in8[1].b00;
+        out8[2].b00 = in8[1].b07;
+        out8[2].b01 = in8[1].b06;
+        out8[2].b02 = in8[1].b05;
+        out8[2].b03 = in8[1].b04;
+        out8[2].b04 = in8[1].b03;
+        out8[2].b05 = in8[1].b02;
+        out8[2].b06 = in8[1].b01;
+        out8[2].b07 = in8[1].b00;
  
-    out8[1].b00 = in8[2].b07;
-    out8[1].b01 = in8[2].b06;
-    out8[1].b02 = in8[2].b05;
-    out8[1].b03 = in8[2].b04;
-    out8[1].b04 = in8[2].b03;
-    out8[1].b05 = in8[2].b02;
-    out8[1].b06 = in8[2].b01;
-    out8[1].b07 = in8[2].b00;
+        out8[1].b00 = in8[2].b07;
+        out8[1].b01 = in8[2].b06;
+        out8[1].b02 = in8[2].b05;
+        out8[1].b03 = in8[2].b04;
+        out8[1].b04 = in8[2].b03;
+        out8[1].b05 = in8[2].b02;
+        out8[1].b06 = in8[2].b01;
+        out8[1].b07 = in8[2].b00;
  
-    out8[0].b00 = in8[3].b07;
-    out8[0].b01 = in8[3].b06;
-    out8[0].b02 = in8[3].b05;
-    out8[0].b03 = in8[3].b04;
-    out8[0].b04 = in8[3].b03;
-    out8[0].b05 = in8[3].b02;
-    out8[0].b06 = in8[3].b01;
-    out8[0].b07 = in8[3].b00;
-    ++in_ptr;
-    ++out_ptr;
-  }
+        out8[0].b00 = in8[3].b07;
+        out8[0].b01 = in8[3].b06;
+        out8[0].b02 = in8[3].b05;
+        out8[0].b03 = in8[3].b04;
+        out8[0].b04 = in8[3].b03;
+        out8[0].b05 = in8[3].b02;
+        out8[0].b06 = in8[3].b01;
+        out8[0].b07 = in8[3].b00;
+        ++in_ptr;
+        ++out_ptr;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
-//Idea from "Bit Twiddling Hacks", which dedicates this method to public domain
-//http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
+// Idea from "Bit Twiddling Hacks", which dedicates this method to public domain
+// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
  #ifdef LV_HAVE_GENERIC
-static inline void volk_32u_reverse_32u_lut(uint32_t* out, const uint32_t* in,
-                           unsigned int num_points)
+static inline void
+volk_32u_reverse_32u_lut(uint32_t* out, const uint32_t* in, unsigned int num_points)
  {
-  const uint32_t *in_ptr = in;
-  uint32_t *out_ptr = out;
-  unsigned int number = 0;
-  for(; number < num_points; ++number){
-    *out_ptr =
-      (BitReverseTable256[*in_ptr & 0xff]         << 24) |
-      (BitReverseTable256[(*in_ptr >>  8) & 0xff] << 16) |
-      (BitReverseTable256[(*in_ptr >> 16) & 0xff] <<  8) |
-      (BitReverseTable256[(*in_ptr >> 24) & 0xff]);
-    ++in_ptr;
-    ++out_ptr;
-  }
+    const uint32_t* in_ptr = in;
+    uint32_t* out_ptr = out;
+    unsigned int number = 0;
+    for (; number < num_points; ++number) {
+        *out_ptr = (BitReverseTable256[*in_ptr & 0xff] << 24) |
+                   (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) |
+                   (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) |
+                   (BitReverseTable256[(*in_ptr >> 24) & 0xff]);
+        ++in_ptr;
+        ++out_ptr;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
-//Single-Byte code from "Bit Twiddling Hacks", which dedicates this method to public domain
-//http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits
+// Single-Byte code from "Bit Twiddling Hacks", which dedicates this method to public
+// domain http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits
  #ifdef LV_HAVE_GENERIC
-static inline void volk_32u_reverse_32u_2001magic(uint32_t* out, const uint32_t* in,
-                                           unsigned int num_points)
+static inline void
+volk_32u_reverse_32u_2001magic(uint32_t* out, const uint32_t* in, unsigned int num_points)
  {
-  const uint32_t *in_ptr = in;
-  uint32_t *out_ptr = out;
-  const uint8_t *in8;
-  uint8_t *out8;
-  unsigned int number = 0;
-  for(; number < num_points; ++number){
-    in8 = (const uint8_t*)in_ptr;
-    out8 = (uint8_t*)out_ptr;
-    out8[3] = ((in8[0] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
-    out8[2] = ((in8[1] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
-    out8[1] = ((in8[2] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
-    out8[0] = ((in8[3] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
-    ++in_ptr;
-    ++out_ptr;
-  }
+    const uint32_t* in_ptr = in;
+    uint32_t* out_ptr = out;
+    const uint8_t* in8;
+    uint8_t* out8;
+    unsigned int number = 0;
+    for (; number < num_points; ++number) {
+        in8 = (const uint8_t*)in_ptr;
+        out8 = (uint8_t*)out_ptr;
+        out8[3] = ((in8[0] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
+        out8[2] = ((in8[1] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
+        out8[1] = ((in8[2] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
+        out8[0] = ((in8[3] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
+        ++in_ptr;
+        ++out_ptr;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  #ifdef LV_HAVE_GENERIC
  // Current gr-pager implementation
-static inline void volk_32u_reverse_32u_1972magic(uint32_t* out, const uint32_t* in,
-                                                 unsigned int num_points)
+static inline void
+volk_32u_reverse_32u_1972magic(uint32_t* out, const uint32_t* in, unsigned int num_points)
  {
-  const uint32_t *in_ptr = in;
-  uint32_t *out_ptr = out;
-  const uint8_t *in8;
-  uint8_t *out8;
-  unsigned int number = 0;
-  for(; number < num_points; ++number){
-    in8 = (const uint8_t*)in_ptr;
-    out8 = (uint8_t*)out_ptr;
-    out8[3] =  (in8[0] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
-    out8[2] =  (in8[1] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
-    out8[1] =  (in8[2] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
-    out8[0] =  (in8[3] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
-    ++in_ptr;
-    ++out_ptr;
-  }
+    const uint32_t* in_ptr = in;
+    uint32_t* out_ptr = out;
+    const uint8_t* in8;
+    uint8_t* out8;
+    unsigned int number = 0;
+    for (; number < num_points; ++number) {
+        in8 = (const uint8_t*)in_ptr;
+        out8 = (uint8_t*)out_ptr;
+        out8[3] = (in8[0] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
+        out8[2] = (in8[1] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
+        out8[1] = (in8[2] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
+        out8[0] = (in8[3] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
+        ++in_ptr;
+        ++out_ptr;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
-//After lengthy thought and quite a bit of whiteboarding:
+// After lengthy thought and quite a bit of whiteboarding:
  #ifdef LV_HAVE_GENERIC
-static inline void volk_32u_reverse_32u_bintree_permute_top_down(uint32_t* out, const uint32_t* in,
-                                                 unsigned int num_points)
+static inline void volk_32u_reverse_32u_bintree_permute_top_down(uint32_t* out,
+                                                                 const uint32_t* in,
+                                                                 unsigned int num_points)
  {
-  const uint32_t *in_ptr = in;
-  uint32_t *out_ptr = out;
-  unsigned int number = 0;
-  for(; number < num_points; ++number){
-    uint32_t tmp = *in_ptr;
-    /* permute uint16:
-       The idea is to simply shift the lower 16 bit up, and the upper 16 bit down.
-     */
-    tmp = ( tmp << 16 ) | ( tmp >> 16 );
-    /* permute bytes:
-       shift up by 1 B first, then only consider even bytes, and OR with the unshifted even bytes
-     */
-    tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16));
-    /* permute 4bit tuples:
-       Same idea, but the "consideration" mask expression becomes unwieldy
-     */
-    tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) | ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24));
-    /* permute 2bit tuples:
-       Here, we collapsed the "consideration" mask to a simple hexmask: 0b0011 =
-       3; we need those every 4b, which coincides with a hex digit!
-    */
-    tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333));
-    /* permute odd/even:
-       0x01 = 0x1;  we need these every 2b, which works out: 0x01 | (0x01 << 2) = 0x05!
-     */
-    tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555));
+    const uint32_t* in_ptr = in;
+    uint32_t* out_ptr = out;
+    unsigned int number = 0;
+    for (; number < num_points; ++number) {
+        uint32_t tmp = *in_ptr;
+        /* permute uint16:
+           The idea is to simply shift the lower 16 bit up, and the upper 16 bit down.
+         */
+        tmp = (tmp << 16) | (tmp >> 16);
+        /* permute bytes:
+           shift up by 1 B first, then only consider even bytes, and OR with the unshifted
+           even bytes
+         */
+        tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16));
+        /* permute 4bit tuples:
+           Same idea, but the "consideration" mask expression becomes unwieldy
+         */
+        tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) |
+              ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24));
+        /* permute 2bit tuples:
+           Here, we collapsed the "consideration" mask to a simple hexmask: 0b0011 =
+           3; we need those every 4b, which coincides with a hex digit!
+        */
+        tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333));
+        /* permute odd/even:
+           0x01 = 0x1;  we need these every 2b, which works out: 0x01 | (0x01 << 2) =
+           0x05!
+         */
+        tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555));
  
-    *out_ptr = tmp;
-    ++in_ptr;
-    ++out_ptr;
-  }
+        *out_ptr = tmp;
+        ++in_ptr;
+        ++out_ptr;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  #ifdef LV_HAVE_GENERIC
-static inline void volk_32u_reverse_32u_bintree_permute_bottom_up(uint32_t* out, const uint32_t* in,
-                                                 unsigned int num_points)
+static inline void volk_32u_reverse_32u_bintree_permute_bottom_up(uint32_t* out,
+                                                                  const uint32_t* in,
+                                                                  unsigned int num_points)
  {
-  //same stuff as top_down, inverted order (permutation matrices don't care, you know!)
-  const uint32_t *in_ptr = in;
-  uint32_t *out_ptr = out;
-  unsigned int number = 0;
-  for(; number < num_points; ++number){
-    uint32_t tmp = *in_ptr;
-    tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555));
-    tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333));
-    tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) | ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24));
-    tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16));
-    tmp = ( tmp << 16 ) | ( tmp >> 16 );
+    // same stuff as top_down, inverted order (permutation matrices don't care, you know!)
+    const uint32_t* in_ptr = in;
+    uint32_t* out_ptr = out;
+    unsigned int number = 0;
+    for (; number < num_points; ++number) {
+        uint32_t tmp = *in_ptr;
+        tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555));
+        tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333));
+        tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) |
+              ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24));
+        tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16));
+        tmp = (tmp << 16) | (tmp >> 16);
  
-    *out_ptr = tmp;
-    ++in_ptr;
-    ++out_ptr;
-  }
+        *out_ptr = tmp;
+        ++in_ptr;
+        ++out_ptr;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  #ifdef LV_HAVE_NEONV8
  #include <arm_neon.h>
  
-static inline void volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in,
-                                              unsigned int num_points)
-{ 
-    const uint32_t *in_ptr = in;
-    uint32_t *out_ptr = out;
+static inline void
+volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in, unsigned int num_points)
+{
+    const uint32_t* in_ptr = in;
+    uint32_t* out_ptr = out;
  
-    const uint8x16_t idx = { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 };
+    const uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
  
-    const unsigned int quarterPoints = num_points/4;
+    const unsigned int quarterPoints = num_points / 4;
      unsigned int number = 0;
-    for(; number < quarterPoints; ++number){
-        __VOLK_PREFETCH(in_ptr+4);
-       uint32x4_t x = vld1q_u32(in_ptr);
-       uint32x4_t z = vreinterpretq_u32_u8(vqtbl1q_u8(vrbitq_u8(vreinterpretq_u8_u32 (x)),
-                                                      idx));
-       vst1q_u32 (out_ptr, z);
-       in_ptr  += 4;
-       out_ptr += 4;
+    for (; number < quarterPoints; ++number) {
+        __VOLK_PREFETCH(in_ptr + 4);
+        uint32x4_t x = vld1q_u32(in_ptr);
+        uint32x4_t z =
+            vreinterpretq_u32_u8(vqtbl1q_u8(vrbitq_u8(vreinterpretq_u8_u32(x)), idx));
+        vst1q_u32(out_ptr, z);
+        in_ptr += 4;
+        out_ptr += 4;
      }
-    number = quarterPoints*4;
-    for(; number < num_points; ++number){
-      *out_ptr =
-       (BitReverseTable256[*in_ptr & 0xff]         << 24) |
-       (BitReverseTable256[(*in_ptr >>  8) & 0xff] << 16) |
-       (BitReverseTable256[(*in_ptr >> 16) & 0xff] <<  8) |
-       (BitReverseTable256[(*in_ptr >> 24) & 0xff]);
-      ++in_ptr;
-      ++out_ptr;
+    number = quarterPoints * 4;
+    for (; number < num_points; ++number) {
+        *out_ptr = (BitReverseTable256[*in_ptr & 0xff] << 24) |
+                   (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) |
+                   (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) |
+                   (BitReverseTable256[(*in_ptr >> 24) & 0xff]);
+        ++in_ptr;
+        ++out_ptr;
      }
  }
  
@@ -371,29 +378,35 @@ static inline void volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-#define DO_RBIT                                        \
-  __VOLK_ASM("rbit %[result], %[value]"         \
-             : [result]"=r" (*out_ptr)          \
-             : [value] "r"  (*in_ptr)           \
-             : );                               \
-  in_ptr++;                                    \
-  out_ptr++;
+#define DO_RBIT                           \
+    __VOLK_ASM("rbit %[result], %[value]" \
+               : [result] "=r"(*out_ptr)  \
+               : [value] "r"(*in_ptr)     \
+               :);                        \
+    in_ptr++;                             \
+    out_ptr++;
  
-static inline void volk_32u_reverse_32u_arm(uint32_t* out, const uint32_t* in,
-                                            unsigned int num_points)
+static inline void
+volk_32u_reverse_32u_arm(uint32_t* out, const uint32_t* in, unsigned int num_points)
  {
  
-    const uint32_t *in_ptr = in;
-    uint32_t *out_ptr = out;
-    const unsigned int eighthPoints = num_points/8;
+    const uint32_t* in_ptr = in;
+    uint32_t* out_ptr = out;
+    const unsigned int eighthPoints = num_points / 8;
      unsigned int number = 0;
-    for(; number < eighthPoints; ++number){
-        __VOLK_PREFETCH(in_ptr+8);
-        DO_RBIT; DO_RBIT; DO_RBIT; DO_RBIT;
-        DO_RBIT; DO_RBIT; DO_RBIT; DO_RBIT;
+    for (; number < eighthPoints; ++number) {
+        __VOLK_PREFETCH(in_ptr + 8);
+        DO_RBIT;
+        DO_RBIT;
+        DO_RBIT;
+        DO_RBIT;
+        DO_RBIT;
+        DO_RBIT;
+        DO_RBIT;
+        DO_RBIT;
      }
-    number = eighthPoints*8;
-    for(; number < num_points; ++number){
+    number = eighthPoints * 8;
+    for (; number < num_points; ++number) {
          DO_RBIT;
      }
  }
@@ -403,4 +416,3 @@ static inline void volk_32u_reverse_32u_arm(uint32_t* out, const uint32_t* in,
  
  
  #endif /* INCLUDED_volk_32u_reverse_32u_u_H */
-
diff --git a/kernels/volk/volk_64f_convert_32f.h b/kernels/volk/volk_64f_convert_32f.h

index 20422cf45ce83d8733e049273efaf84053393250..4ebccc0dbebd8260bcea3e0fa42cbeeafbf319c0 100644 (file)
--- a/kernels/volk/volk_64f_convert_32f.h
+++ b/kernels/volk/volk_64f_convert_32f.h
@@ -29,8 +29,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_64f_convert_32f(float* outputVector, const double* inputVector, unsigned int num_points)
- * \endcode
+ * void volk_64f_convert_32f(float* outputVector, const double* inputVector, unsigned int
+ * num_points) \endcode
   *
   * \b Inputs
   * \li inputVector: The vector of doubles to convert to floats.
@@ -70,34 +70,39 @@
  #ifdef LV_HAVE_AVX512F
  #include <immintrin.h>
  
-static inline void volk_64f_convert_32f_u_avx512f(float* outputVector, const double* inputVector, unsigned int num_points){
-  unsigned int number = 0;
+static inline void volk_64f_convert_32f_u_avx512f(float* outputVector,
+                                                  const double* inputVector,
+                                                  unsigned int num_points)
+{
+    unsigned int number = 0;
  
-  const unsigned int oneSixteenthPoints = num_points / 16;
+    const unsigned int oneSixteenthPoints = num_points / 16;
  
-  const double* inputVectorPtr = (const double*)inputVector;
-  float* outputVectorPtr = outputVector;
-  __m256 ret1, ret2;
-  __m512d inputVal1, inputVal2;
+    const double* inputVectorPtr = (const double*)inputVector;
+    float* outputVectorPtr = outputVector;
+    __m256 ret1, ret2;
+    __m512d inputVal1, inputVal2;
  
-  for(;number < oneSixteenthPoints; number++){
-    inputVal1 = _mm512_loadu_pd(inputVectorPtr); inputVectorPtr += 8;
-    inputVal2 = _mm512_loadu_pd(inputVectorPtr); inputVectorPtr += 8;
+    for (; number < oneSixteenthPoints; number++) {
+        inputVal1 = _mm512_loadu_pd(inputVectorPtr);
+        inputVectorPtr += 8;
+        inputVal2 = _mm512_loadu_pd(inputVectorPtr);
+        inputVectorPtr += 8;
  
-    ret1 = _mm512_cvtpd_ps(inputVal1);
-    ret2 = _mm512_cvtpd_ps(inputVal2);
+        ret1 = _mm512_cvtpd_ps(inputVal1);
+        ret2 = _mm512_cvtpd_ps(inputVal2);
  
-    _mm256_storeu_ps(outputVectorPtr, ret1);
-    outputVectorPtr += 8;
+        _mm256_storeu_ps(outputVectorPtr, ret1);
+        outputVectorPtr += 8;
  
-    _mm256_storeu_ps(outputVectorPtr, ret2);
-    outputVectorPtr += 8;
-  }
+        _mm256_storeu_ps(outputVectorPtr, ret2);
+        outputVectorPtr += 8;
+    }
  
-  number = oneSixteenthPoints * 16;
-  for(; number < num_points; number++){
-    outputVector[number] = (float)(inputVector[number]);
-  }
+    number = oneSixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        outputVector[number] = (float)(inputVector[number]);
+    }
  }
  #endif /* LV_HAVE_AVX512F */
  
@@ -105,34 +110,39 @@ static inline void volk_64f_convert_32f_u_avx512f(float* outputVector, const dou
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void volk_64f_convert_32f_u_avx(float* outputVector, const double* inputVector, unsigned int num_points){
-  unsigned int number = 0;
+static inline void volk_64f_convert_32f_u_avx(float* outputVector,
+                                              const double* inputVector,
+                                              unsigned int num_points)
+{
+    unsigned int number = 0;
  
-  const unsigned int oneEightPoints = num_points / 8;
+    const unsigned int oneEightPoints = num_points / 8;
  
-  const double* inputVectorPtr = (const double*)inputVector;
-  float* outputVectorPtr = outputVector;
-  __m128 ret1, ret2;
-  __m256d inputVal1, inputVal2;
+    const double* inputVectorPtr = (const double*)inputVector;
+    float* outputVectorPtr = outputVector;
+    __m128 ret1, ret2;
+    __m256d inputVal1, inputVal2;
  
-  for(;number < oneEightPoints; number++){
-    inputVal1 = _mm256_loadu_pd(inputVectorPtr); inputVectorPtr += 4;
-    inputVal2 = _mm256_loadu_pd(inputVectorPtr); inputVectorPtr += 4;
+    for (; number < oneEightPoints; number++) {
+        inputVal1 = _mm256_loadu_pd(inputVectorPtr);
+        inputVectorPtr += 4;
+        inputVal2 = _mm256_loadu_pd(inputVectorPtr);
+        inputVectorPtr += 4;
  
-    ret1 = _mm256_cvtpd_ps(inputVal1);
-    ret2 = _mm256_cvtpd_ps(inputVal2);
+        ret1 = _mm256_cvtpd_ps(inputVal1);
+        ret2 = _mm256_cvtpd_ps(inputVal2);
  
-    _mm_storeu_ps(outputVectorPtr, ret1);
-    outputVectorPtr += 4;
+        _mm_storeu_ps(outputVectorPtr, ret1);
+        outputVectorPtr += 4;
  
-    _mm_storeu_ps(outputVectorPtr, ret2);
-    outputVectorPtr += 4;
-  }
+        _mm_storeu_ps(outputVectorPtr, ret2);
+        outputVectorPtr += 4;
+    }
  
-  number = oneEightPoints * 8;
-  for(; number < num_points; number++){
-    outputVector[number] = (float)(inputVector[number]);
-  }
+    number = oneEightPoints * 8;
+    for (; number < num_points; number++) {
+        outputVector[number] = (float)(inputVector[number]);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
@@ -140,53 +150,59 @@ static inline void volk_64f_convert_32f_u_avx(float* outputVector, const double*
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void volk_64f_convert_32f_u_sse2(float* outputVector, const double* inputVector, unsigned int num_points){
-  unsigned int number = 0;
+static inline void volk_64f_convert_32f_u_sse2(float* outputVector,
+                                               const double* inputVector,
+                                               unsigned int num_points)
+{
+    unsigned int number = 0;
  
-  const unsigned int quarterPoints = num_points / 4;
+    const unsigned int quarterPoints = num_points / 4;
  
-  const double* inputVectorPtr = (const double*)inputVector;
-  float* outputVectorPtr = outputVector;
-  __m128 ret, ret2;
-  __m128d inputVal1, inputVal2;
+    const double* inputVectorPtr = (const double*)inputVector;
+    float* outputVectorPtr = outputVector;
+    __m128 ret, ret2;
+    __m128d inputVal1, inputVal2;
  
-  for(;number < quarterPoints; number++){
-    inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2;
-    inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2;
+    for (; number < quarterPoints; number++) {
+        inputVal1 = _mm_loadu_pd(inputVectorPtr);
+        inputVectorPtr += 2;
+        inputVal2 = _mm_loadu_pd(inputVectorPtr);
+        inputVectorPtr += 2;
  
-    ret = _mm_cvtpd_ps(inputVal1);
-    ret2 = _mm_cvtpd_ps(inputVal2);
+        ret = _mm_cvtpd_ps(inputVal1);
+        ret2 = _mm_cvtpd_ps(inputVal2);
  
-    ret = _mm_movelh_ps(ret, ret2);
+        ret = _mm_movelh_ps(ret, ret2);
  
-    _mm_storeu_ps(outputVectorPtr, ret);
-    outputVectorPtr += 4;
-  }
+        _mm_storeu_ps(outputVectorPtr, ret);
+        outputVectorPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    outputVector[number] = (float)(inputVector[number]);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        outputVector[number] = (float)(inputVector[number]);
+    }
  }
  #endif /* LV_HAVE_SSE2 */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void volk_64f_convert_32f_generic(float* outputVector, const double* inputVector, unsigned int num_points){
-  float* outputVectorPtr = outputVector;
-  const double* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++));
-  }
+static inline void volk_64f_convert_32f_generic(float* outputVector,
+                                                const double* inputVector,
+                                                unsigned int num_points)
+{
+    float* outputVectorPtr = outputVector;
+    const double* inputVectorPtr = inputVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        *outputVectorPtr++ = ((float)(*inputVectorPtr++));
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  
-
-
  #endif /* INCLUDED_volk_64f_convert_32f_u_H */
  #ifndef INCLUDED_volk_64f_convert_32f_a_H
  #define INCLUDED_volk_64f_convert_32f_a_H
@@ -197,34 +213,39 @@ static inline void volk_64f_convert_32f_generic(float* outputVector, const doubl
  #ifdef LV_HAVE_AVX512F
  #include <immintrin.h>
  
-static inline void volk_64f_convert_32f_a_avx512f(float* outputVector, const double* inputVector, unsigned int num_points){
-  unsigned int number = 0;
+static inline void volk_64f_convert_32f_a_avx512f(float* outputVector,
+                                                  const double* inputVector,
+                                                  unsigned int num_points)
+{
+    unsigned int number = 0;
  
-  const unsigned int oneSixteenthPoints = num_points / 16;
+    const unsigned int oneSixteenthPoints = num_points / 16;
  
-  const double* inputVectorPtr = (const double*)inputVector;
-  float* outputVectorPtr = outputVector;
-  __m256 ret1, ret2;
-  __m512d inputVal1, inputVal2;
+    const double* inputVectorPtr = (const double*)inputVector;
+    float* outputVectorPtr = outputVector;
+    __m256 ret1, ret2;
+    __m512d inputVal1, inputVal2;
  
-  for(;number < oneSixteenthPoints; number++){
-    inputVal1 = _mm512_load_pd(inputVectorPtr); inputVectorPtr += 8;
-    inputVal2 = _mm512_load_pd(inputVectorPtr); inputVectorPtr += 8;
+    for (; number < oneSixteenthPoints; number++) {
+        inputVal1 = _mm512_load_pd(inputVectorPtr);
+        inputVectorPtr += 8;
+        inputVal2 = _mm512_load_pd(inputVectorPtr);
+        inputVectorPtr += 8;
  
-    ret1 = _mm512_cvtpd_ps(inputVal1);
-    ret2 = _mm512_cvtpd_ps(inputVal2);
+        ret1 = _mm512_cvtpd_ps(inputVal1);
+        ret2 = _mm512_cvtpd_ps(inputVal2);
  
-    _mm256_store_ps(outputVectorPtr, ret1);
-    outputVectorPtr += 8;
+        _mm256_store_ps(outputVectorPtr, ret1);
+        outputVectorPtr += 8;
  
-    _mm256_store_ps(outputVectorPtr, ret2);
-    outputVectorPtr += 8;
-  }
+        _mm256_store_ps(outputVectorPtr, ret2);
+        outputVectorPtr += 8;
+    }
  
-  number = oneSixteenthPoints * 16;
-  for(; number < num_points; number++){
-    outputVector[number] = (float)(inputVector[number]);
-  }
+    number = oneSixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        outputVector[number] = (float)(inputVector[number]);
+    }
  }
  #endif /* LV_HAVE_AVX512F */
  
@@ -232,34 +253,39 @@ static inline void volk_64f_convert_32f_a_avx512f(float* outputVector, const dou
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void volk_64f_convert_32f_a_avx(float* outputVector, const double* inputVector, unsigned int num_points){
-  unsigned int number = 0;
+static inline void volk_64f_convert_32f_a_avx(float* outputVector,
+                                              const double* inputVector,
+                                              unsigned int num_points)
+{
+    unsigned int number = 0;
  
-  const unsigned int oneEightPoints = num_points / 8;
+    const unsigned int oneEightPoints = num_points / 8;
  
-  const double* inputVectorPtr = (const double*)inputVector;
-  float* outputVectorPtr = outputVector;
-  __m128 ret1, ret2;
-  __m256d inputVal1, inputVal2;
+    const double* inputVectorPtr = (const double*)inputVector;
+    float* outputVectorPtr = outputVector;
+    __m128 ret1, ret2;
+    __m256d inputVal1, inputVal2;
  
-  for(;number < oneEightPoints; number++){
-    inputVal1 = _mm256_load_pd(inputVectorPtr); inputVectorPtr += 4;
-    inputVal2 = _mm256_load_pd(inputVectorPtr); inputVectorPtr += 4;
+    for (; number < oneEightPoints; number++) {
+        inputVal1 = _mm256_load_pd(inputVectorPtr);
+        inputVectorPtr += 4;
+        inputVal2 = _mm256_load_pd(inputVectorPtr);
+        inputVectorPtr += 4;
  
-    ret1 = _mm256_cvtpd_ps(inputVal1);
-    ret2 = _mm256_cvtpd_ps(inputVal2);
+        ret1 = _mm256_cvtpd_ps(inputVal1);
+        ret2 = _mm256_cvtpd_ps(inputVal2);
  
-    _mm_store_ps(outputVectorPtr, ret1);
-    outputVectorPtr += 4;
+        _mm_store_ps(outputVectorPtr, ret1);
+        outputVectorPtr += 4;
  
-    _mm_store_ps(outputVectorPtr, ret2);
-    outputVectorPtr += 4;
-  }
+        _mm_store_ps(outputVectorPtr, ret2);
+        outputVectorPtr += 4;
+    }
  
-  number = oneEightPoints * 8;
-  for(; number < num_points; number++){
-    outputVector[number] = (float)(inputVector[number]);
-  }
+    number = oneEightPoints * 8;
+    for (; number < num_points; number++) {
+        outputVector[number] = (float)(inputVector[number]);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
@@ -267,51 +293,57 @@ static inline void volk_64f_convert_32f_a_avx(float* outputVector, const double*
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void volk_64f_convert_32f_a_sse2(float* outputVector, const double* inputVector, unsigned int num_points){
-  unsigned int number = 0;
+static inline void volk_64f_convert_32f_a_sse2(float* outputVector,
+                                               const double* inputVector,
+                                               unsigned int num_points)
+{
+    unsigned int number = 0;
  
-  const unsigned int quarterPoints = num_points / 4;
+    const unsigned int quarterPoints = num_points / 4;
  
-  const double* inputVectorPtr = (const double*)inputVector;
-  float* outputVectorPtr = outputVector;
-  __m128 ret, ret2;
-  __m128d inputVal1, inputVal2;
+    const double* inputVectorPtr = (const double*)inputVector;
+    float* outputVectorPtr = outputVector;
+    __m128 ret, ret2;
+    __m128d inputVal1, inputVal2;
  
-  for(;number < quarterPoints; number++){
-    inputVal1 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2;
-    inputVal2 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2;
+    for (; number < quarterPoints; number++) {
+        inputVal1 = _mm_load_pd(inputVectorPtr);
+        inputVectorPtr += 2;
+        inputVal2 = _mm_load_pd(inputVectorPtr);
+        inputVectorPtr += 2;
  
-    ret = _mm_cvtpd_ps(inputVal1);
-    ret2 = _mm_cvtpd_ps(inputVal2);
+        ret = _mm_cvtpd_ps(inputVal1);
+        ret2 = _mm_cvtpd_ps(inputVal2);
  
-    ret = _mm_movelh_ps(ret, ret2);
+        ret = _mm_movelh_ps(ret, ret2);
  
-    _mm_store_ps(outputVectorPtr, ret);
-    outputVectorPtr += 4;
-  }
+        _mm_store_ps(outputVectorPtr, ret);
+        outputVectorPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    outputVector[number] = (float)(inputVector[number]);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        outputVector[number] = (float)(inputVector[number]);
+    }
  }
  #endif /* LV_HAVE_SSE2 */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void volk_64f_convert_32f_a_generic(float* outputVector, const double* inputVector, unsigned int num_points){
-  float* outputVectorPtr = outputVector;
-  const double* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++));
-  }
+static inline void volk_64f_convert_32f_a_generic(float* outputVector,
+                                                  const double* inputVector,
+                                                  unsigned int num_points)
+{
+    float* outputVectorPtr = outputVector;
+    const double* inputVectorPtr = inputVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        *outputVectorPtr++ = ((float)(*inputVectorPtr++));
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  
-
-
  #endif /* INCLUDED_volk_64f_convert_32f_a_H */
diff --git a/kernels/volk/volk_64f_x2_add_64f.h b/kernels/volk/volk_64f_x2_add_64f.h

index 03b8e4c565a1ce6c2a6f72f2f931465cb3aff6a1..5c512cc9b0ee59c2919f233b337997145b8b2614 100644 (file)
--- a/kernels/volk/volk_64f_x2_add_64f.h
+++ b/kernels/volk/volk_64f_x2_add_64f.h
@@ -31,8 +31,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_64f_x2_add_64f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
- * \endcode
+ * void volk_64f_x2_add_64f(float* cVector, const float* aVector, const float* bVector,
+ * unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li aVector: First input vector.
@@ -76,18 +76,19 @@
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_64f_x2_add_64f_generic(double *cVector, const double *aVector,
-                                 const double *bVector, unsigned int num_points)
+static inline void volk_64f_x2_add_64f_generic(double* cVector,
+                                               const double* aVector,
+                                               const double* bVector,
+                                               unsigned int num_points)
  {
-  double *cPtr = cVector;
-  const double *aPtr = aVector;
-  const double *bPtr = bVector;
-  unsigned int number = 0;
-
-  for (number = 0; number < num_points; number++) {
-    *cPtr++ = (*aPtr++) + (*bPtr++);
-  }
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr = bVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
  }
  
  #endif /* LV_HAVE_GENERIC */
@@ -100,35 +101,36 @@ volk_64f_x2_add_64f_generic(double *cVector, const double *aVector,
  
  #include <emmintrin.h>
  
-static inline void
-volk_64f_x2_add_64f_u_sse2(double *cVector, const double *aVector,
-                                const double *bVector, unsigned int num_points)
+static inline void volk_64f_x2_add_64f_u_sse2(double* cVector,
+                                              const double* aVector,
+                                              const double* bVector,
+                                              unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int half_points = num_points / 2;
+    unsigned int number = 0;
+    const unsigned int half_points = num_points / 2;
  
-  double *cPtr = cVector;
-  const double *aPtr = aVector;
-  const double *bPtr = bVector;
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr = bVector;
  
-  __m128d aVal, bVal, cVal;
-  for (; number < half_points; number++) {
-    aVal = _mm_loadu_pd(aPtr);
-    bVal = _mm_loadu_pd(bPtr);
+    __m128d aVal, bVal, cVal;
+    for (; number < half_points; number++) {
+        aVal = _mm_loadu_pd(aPtr);
+        bVal = _mm_loadu_pd(bPtr);
  
-    cVal = _mm_add_pd(aVal, bVal);
+        cVal = _mm_add_pd(aVal, bVal);
  
-    _mm_storeu_pd(cPtr, cVal); // Store the results back into the C container
+        _mm_storeu_pd(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 2;
-    bPtr += 2;
-    cPtr += 2;
-  }
+        aPtr += 2;
+        bPtr += 2;
+        cPtr += 2;
+    }
  
-  number = half_points * 2;
-  for (; number < num_points; number++) {
-    *cPtr++ = (*aPtr++) + (*bPtr++);
-  }
+    number = half_points * 2;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
  }
  
  #endif /* LV_HAVE_SSE2 */
@@ -138,36 +140,37 @@ volk_64f_x2_add_64f_u_sse2(double *cVector, const double *aVector,
  
  #include <immintrin.h>
  
-static inline void
-volk_64f_x2_add_64f_u_avx(double *cVector, const double *aVector,
-                               const double *bVector, unsigned int num_points)
+static inline void volk_64f_x2_add_64f_u_avx(double* cVector,
+                                             const double* aVector,
+                                             const double* bVector,
+                                             unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarter_points = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarter_points = num_points / 4;
  
-  double *cPtr = cVector;
-  const double *aPtr = aVector;
-  const double *bPtr = bVector;
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr = bVector;
  
-  __m256d aVal, bVal, cVal;
-  for (; number < quarter_points; number++) {
+    __m256d aVal, bVal, cVal;
+    for (; number < quarter_points; number++) {
  
-    aVal = _mm256_loadu_pd(aPtr);
-    bVal = _mm256_loadu_pd(bPtr);
+        aVal = _mm256_loadu_pd(aPtr);
+        bVal = _mm256_loadu_pd(bPtr);
  
-    cVal = _mm256_add_pd(aVal, bVal);
+        cVal = _mm256_add_pd(aVal, bVal);
  
-    _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container
+        _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 4;
-    bPtr += 4;
-    cPtr += 4;
-  }
+        aPtr += 4;
+        bPtr += 4;
+        cPtr += 4;
+    }
  
-  number = quarter_points * 4;
-  for (; number < num_points; number++) {
-    *cPtr++ = (*aPtr++) + (*bPtr++);
-  }
+    number = quarter_points * 4;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX */
@@ -180,35 +183,36 @@ volk_64f_x2_add_64f_u_avx(double *cVector, const double *aVector,
  
  #include <emmintrin.h>
  
-static inline void
-volk_64f_x2_add_64f_a_sse2(double *cVector, const double *aVector,
-                                const double *bVector, unsigned int num_points)
+static inline void volk_64f_x2_add_64f_a_sse2(double* cVector,
+                                              const double* aVector,
+                                              const double* bVector,
+                                              unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int half_points = num_points / 2;
+    unsigned int number = 0;
+    const unsigned int half_points = num_points / 2;
  
-  double *cPtr = cVector;
-  const double *aPtr = aVector;
-  const double *bPtr = bVector;
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr = bVector;
  
-  __m128d aVal, bVal, cVal;
-  for (; number < half_points; number++) {
-    aVal = _mm_load_pd(aPtr);
-    bVal = _mm_load_pd(bPtr);
+    __m128d aVal, bVal, cVal;
+    for (; number < half_points; number++) {
+        aVal = _mm_load_pd(aPtr);
+        bVal = _mm_load_pd(bPtr);
  
-    cVal = _mm_add_pd(aVal, bVal);
+        cVal = _mm_add_pd(aVal, bVal);
  
-    _mm_store_pd(cPtr, cVal); // Store the results back into the C container
+        _mm_store_pd(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 2;
-    bPtr += 2;
-    cPtr += 2;
-  }
+        aPtr += 2;
+        bPtr += 2;
+        cPtr += 2;
+    }
  
-  number = half_points * 2;
-  for (; number < num_points; number++) {
-    *cPtr++ = (*aPtr++) + (*bPtr++);
-  }
+    number = half_points * 2;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
  }
  
  #endif /* LV_HAVE_SSE2 */
@@ -218,36 +222,37 @@ volk_64f_x2_add_64f_a_sse2(double *cVector, const double *aVector,
  
  #include <immintrin.h>
  
-static inline void
-volk_64f_x2_add_64f_a_avx(double *cVector, const double *aVector,
-                               const double *bVector, unsigned int num_points)
+static inline void volk_64f_x2_add_64f_a_avx(double* cVector,
+                                             const double* aVector,
+                                             const double* bVector,
+                                             unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarter_points = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarter_points = num_points / 4;
  
-  double *cPtr = cVector;
-  const double *aPtr = aVector;
-  const double *bPtr = bVector;
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr = bVector;
  
-  __m256d aVal, bVal, cVal;
-  for (; number < quarter_points; number++) {
+    __m256d aVal, bVal, cVal;
+    for (; number < quarter_points; number++) {
  
-    aVal = _mm256_load_pd(aPtr);
-    bVal = _mm256_load_pd(bPtr);
+        aVal = _mm256_load_pd(aPtr);
+        bVal = _mm256_load_pd(bPtr);
  
-    cVal = _mm256_add_pd(aVal, bVal);
+        cVal = _mm256_add_pd(aVal, bVal);
  
-    _mm256_store_pd(cPtr, cVal); // Store the results back into the C container
+        _mm256_store_pd(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 4;
-    bPtr += 4;
-    cPtr += 4;
-  }
+        aPtr += 4;
+        bPtr += 4;
+        cPtr += 4;
+    }
  
-  number = quarter_points * 4;
-  for (; number < num_points; number++) {
-    *cPtr++ = (*aPtr++) + (*bPtr++);
-  }
+    number = quarter_points * 4;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX */
diff --git a/kernels/volk/volk_64f_x2_max_64f.h b/kernels/volk/volk_64f_x2_max_64f.h

index d4464b77c38e5ba5b3dc61c4b679697bb55c3ca9..8f7f7430583051d803d69f95d7c9263a703f655b 100644 (file)
--- a/kernels/volk/volk_64f_x2_max_64f.h
+++ b/kernels/volk/volk_64f_x2_max_64f.h
@@ -32,8 +32,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_64f_x2_max_64f(double* cVector, const double* aVector, const double* bVector, unsigned int num_points)
- * \endcode
+ * void volk_64f_x2_max_64f(double* cVector, const double* aVector, const double* bVector,
+ * unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li aVector: First input vector.
@@ -77,38 +77,39 @@
  #ifdef LV_HAVE_AVX512F
  #include <immintrin.h>
  
-static inline void
-volk_64f_x2_max_64f_a_avx512f(double* cVector, const double* aVector,
-                           const double* bVector, unsigned int num_points)
+static inline void volk_64f_x2_max_64f_a_avx512f(double* cVector,
+                                                 const double* aVector,
+                                                 const double* bVector,
+                                                 unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eigthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eigthPoints = num_points / 8;
  
-  double* cPtr = cVector;
-  const double* aPtr = aVector;
-  const double* bPtr=  bVector;
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr = bVector;
  
-  __m512d aVal, bVal, cVal;
-  for(;number < eigthPoints; number++){
+    __m512d aVal, bVal, cVal;
+    for (; number < eigthPoints; number++) {
  
-    aVal = _mm512_load_pd(aPtr);
-    bVal = _mm512_load_pd(bPtr);
+        aVal = _mm512_load_pd(aPtr);
+        bVal = _mm512_load_pd(bPtr);
  
-    cVal = _mm512_max_pd(aVal, bVal);
+        cVal = _mm512_max_pd(aVal, bVal);
  
-    _mm512_store_pd(cPtr,cVal); // Store the results back into the C container
+        _mm512_store_pd(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 8;
-    bPtr += 8;
-    cPtr += 8;
-  }
+        aPtr += 8;
+        bPtr += 8;
+        cPtr += 8;
+    }
  
-  number = eigthPoints * 8;
-  for(;number < num_points; number++){
-    const double a = *aPtr++;
-    const double b = *bPtr++;
-    *cPtr++ = ( a > b ? a : b);
-  }
+    number = eigthPoints * 8;
+    for (; number < num_points; number++) {
+        const double a = *aPtr++;
+        const double b = *bPtr++;
+        *cPtr++ = (a > b ? a : b);
+    }
  }
  #endif /* LV_HAVE_AVX512F */
  
@@ -116,38 +117,39 @@ volk_64f_x2_max_64f_a_avx512f(double* cVector, const double* aVector,
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_64f_x2_max_64f_a_avx(double* cVector, const double* aVector,
-                           const double* bVector, unsigned int num_points)
+static inline void volk_64f_x2_max_64f_a_avx(double* cVector,
+                                             const double* aVector,
+                                             const double* bVector,
+                                             unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  double* cPtr = cVector;
-  const double* aPtr = aVector;
-  const double* bPtr=  bVector;
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr = bVector;
  
-  __m256d aVal, bVal, cVal;
-  for(;number < quarterPoints; number++){
+    __m256d aVal, bVal, cVal;
+    for (; number < quarterPoints; number++) {
  
-    aVal = _mm256_load_pd(aPtr);
-    bVal = _mm256_load_pd(bPtr);
+        aVal = _mm256_load_pd(aPtr);
+        bVal = _mm256_load_pd(bPtr);
  
-    cVal = _mm256_max_pd(aVal, bVal);
+        cVal = _mm256_max_pd(aVal, bVal);
  
-    _mm256_store_pd(cPtr,cVal); // Store the results back into the C container
+        _mm256_store_pd(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 4;
-    bPtr += 4;
-    cPtr += 4;
-  }
+        aPtr += 4;
+        bPtr += 4;
+        cPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    const double a = *aPtr++;
-    const double b = *bPtr++;
-    *cPtr++ = ( a > b ? a : b);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        const double a = *aPtr++;
+        const double b = *bPtr++;
+        *cPtr++ = (a > b ? a : b);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
@@ -155,58 +157,60 @@ volk_64f_x2_max_64f_a_avx(double* cVector, const double* aVector,
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void
-volk_64f_x2_max_64f_a_sse2(double* cVector, const double* aVector,
-                           const double* bVector, unsigned int num_points)
+static inline void volk_64f_x2_max_64f_a_sse2(double* cVector,
+                                              const double* aVector,
+                                              const double* bVector,
+                                              unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int halfPoints = num_points / 2;
+    unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 2;
  
-  double* cPtr = cVector;
-  const double* aPtr = aVector;
-  const double* bPtr=  bVector;
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr = bVector;
  
-  __m128d aVal, bVal, cVal;
-  for(;number < halfPoints; number++){
+    __m128d aVal, bVal, cVal;
+    for (; number < halfPoints; number++) {
  
-    aVal = _mm_load_pd(aPtr);
-    bVal = _mm_load_pd(bPtr);
+        aVal = _mm_load_pd(aPtr);
+        bVal = _mm_load_pd(bPtr);
  
-    cVal = _mm_max_pd(aVal, bVal);
+        cVal = _mm_max_pd(aVal, bVal);
  
-    _mm_store_pd(cPtr,cVal); // Store the results back into the C container
+        _mm_store_pd(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 2;
-    bPtr += 2;
-    cPtr += 2;
-  }
+        aPtr += 2;
+        bPtr += 2;
+        cPtr += 2;
+    }
  
-  number = halfPoints * 2;
-  for(;number < num_points; number++){
-    const double a = *aPtr++;
-    const double b = *bPtr++;
-    *cPtr++ = ( a > b ? a : b);
-  }
+    number = halfPoints * 2;
+    for (; number < num_points; number++) {
+        const double a = *aPtr++;
+        const double b = *bPtr++;
+        *cPtr++ = (a > b ? a : b);
+    }
  }
  #endif /* LV_HAVE_SSE2 */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_64f_x2_max_64f_generic(double* cVector, const double* aVector,
-                            const double* bVector, unsigned int num_points)
+static inline void volk_64f_x2_max_64f_generic(double* cVector,
+                                               const double* aVector,
+                                               const double* bVector,
+                                               unsigned int num_points)
  {
-  double* cPtr = cVector;
-  const double* aPtr = aVector;
-  const double* bPtr=  bVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    const double a = *aPtr++;
-    const double b = *bPtr++;
-    *cPtr++ = ( a > b ? a : b);
-  }
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr = bVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        const double a = *aPtr++;
+        const double b = *bPtr++;
+        *cPtr++ = (a > b ? a : b);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -223,38 +227,39 @@ volk_64f_x2_max_64f_generic(double* cVector, const double* aVector,
  #ifdef LV_HAVE_AVX512F
  #include <immintrin.h>
  
-static inline void
-volk_64f_x2_max_64f_u_avx512f(double* cVector, const double* aVector,
-                           const double* bVector, unsigned int num_points)
+static inline void volk_64f_x2_max_64f_u_avx512f(double* cVector,
+                                                 const double* aVector,
+                                                 const double* bVector,
+                                                 unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eigthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eigthPoints = num_points / 8;
  
-  double* cPtr = cVector;
-  const double* aPtr = aVector;
-  const double* bPtr=  bVector;
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr = bVector;
  
-  __m512d aVal, bVal, cVal;
-  for(;number < eigthPoints; number++){
+    __m512d aVal, bVal, cVal;
+    for (; number < eigthPoints; number++) {
  
-    aVal = _mm512_loadu_pd(aPtr);
-    bVal = _mm512_loadu_pd(bPtr);
+        aVal = _mm512_loadu_pd(aPtr);
+        bVal = _mm512_loadu_pd(bPtr);
  
-    cVal = _mm512_max_pd(aVal, bVal);
+        cVal = _mm512_max_pd(aVal, bVal);
  
-    _mm512_storeu_pd(cPtr,cVal); // Store the results back into the C container
+        _mm512_storeu_pd(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 8;
-    bPtr += 8;
-    cPtr += 8;
-  }
+        aPtr += 8;
+        bPtr += 8;
+        cPtr += 8;
+    }
  
-  number = eigthPoints * 8;
-  for(;number < num_points; number++){
-    const double a = *aPtr++;
-    const double b = *bPtr++;
-    *cPtr++ = ( a > b ? a : b);
-  }
+    number = eigthPoints * 8;
+    for (; number < num_points; number++) {
+        const double a = *aPtr++;
+        const double b = *bPtr++;
+        *cPtr++ = (a > b ? a : b);
+    }
  }
  #endif /* LV_HAVE_AVX512F */
  
@@ -262,38 +267,39 @@ volk_64f_x2_max_64f_u_avx512f(double* cVector, const double* aVector,
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_64f_x2_max_64f_u_avx(double* cVector, const double* aVector,
-                           const double* bVector, unsigned int num_points)
+static inline void volk_64f_x2_max_64f_u_avx(double* cVector,
+                                             const double* aVector,
+                                             const double* bVector,
+                                             unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  double* cPtr = cVector;
-  const double* aPtr = aVector;
-  const double* bPtr=  bVector;
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr = bVector;
  
-  __m256d aVal, bVal, cVal;
-  for(;number < quarterPoints; number++){
+    __m256d aVal, bVal, cVal;
+    for (; number < quarterPoints; number++) {
  
-    aVal = _mm256_loadu_pd(aPtr);
-    bVal = _mm256_loadu_pd(bPtr);
+        aVal = _mm256_loadu_pd(aPtr);
+        bVal = _mm256_loadu_pd(bPtr);
  
-    cVal = _mm256_max_pd(aVal, bVal);
+        cVal = _mm256_max_pd(aVal, bVal);
  
-    _mm256_storeu_pd(cPtr,cVal); // Store the results back into the C container
+        _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 4;
-    bPtr += 4;
-    cPtr += 4;
-  }
+        aPtr += 4;
+        bPtr += 4;
+        cPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    const double a = *aPtr++;
-    const double b = *bPtr++;
-    *cPtr++ = ( a > b ? a : b);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        const double a = *aPtr++;
+        const double b = *bPtr++;
+        *cPtr++ = (a > b ? a : b);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
diff --git a/kernels/volk/volk_64f_x2_min_64f.h b/kernels/volk/volk_64f_x2_min_64f.h

index 0ffa305227560796b2f27080abcad566296e8853..7dc4d593d9ea4d943685857f3f5c8ff02616d672 100644 (file)
--- a/kernels/volk/volk_64f_x2_min_64f.h
+++ b/kernels/volk/volk_64f_x2_min_64f.h
@@ -32,7 +32,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_64f_x2_min_64f(double* cVector, const double* aVector, const double* bVector, unsigned int num_points)
+ * void volk_64f_x2_min_64f(double* cVector, const double* aVector, const double* bVector,
+ unsigned int num_points)
   * \endcode
   *
   * \b Inputs
@@ -77,38 +78,39 @@
  #ifdef LV_HAVE_AVX512F
  #include <immintrin.h>
  
-static inline void
-volk_64f_x2_min_64f_a_avx512f(double* cVector, const double* aVector,
-                           const double* bVector, unsigned int num_points)
+static inline void volk_64f_x2_min_64f_a_avx512f(double* cVector,
+                                                 const double* aVector,
+                                                 const double* bVector,
+                                                 unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eigthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eigthPoints = num_points / 8;
  
-  double* cPtr = cVector;
-  const double* aPtr = aVector;
-  const double* bPtr=  bVector;
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr = bVector;
  
-  __m512d aVal, bVal, cVal;
-  for(;number < eigthPoints; number++){
+    __m512d aVal, bVal, cVal;
+    for (; number < eigthPoints; number++) {
  
-    aVal = _mm512_load_pd(aPtr);
-    bVal = _mm512_load_pd(bPtr);
+        aVal = _mm512_load_pd(aPtr);
+        bVal = _mm512_load_pd(bPtr);
  
-    cVal = _mm512_min_pd(aVal, bVal);
+        cVal = _mm512_min_pd(aVal, bVal);
  
-    _mm512_store_pd(cPtr,cVal); // Store the results back into the C container
+        _mm512_store_pd(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 8;
-    bPtr += 8;
-    cPtr += 8;
-  }
+        aPtr += 8;
+        bPtr += 8;
+        cPtr += 8;
+    }
  
-  number = eigthPoints * 8;
-  for(;number < num_points; number++){
-    const double a = *aPtr++;
-    const double b = *bPtr++;
-    *cPtr++ = ( a < b ? a : b);
-  }
+    number = eigthPoints * 8;
+    for (; number < num_points; number++) {
+        const double a = *aPtr++;
+        const double b = *bPtr++;
+        *cPtr++ = (a < b ? a : b);
+    }
  }
  #endif /* LV_HAVE_AVX512F */
  
@@ -116,38 +118,39 @@ volk_64f_x2_min_64f_a_avx512f(double* cVector, const double* aVector,
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_64f_x2_min_64f_a_avx(double* cVector, const double* aVector,
-                           const double* bVector, unsigned int num_points)
+static inline void volk_64f_x2_min_64f_a_avx(double* cVector,
+                                             const double* aVector,
+                                             const double* bVector,
+                                             unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  double* cPtr = cVector;
-  const double* aPtr = aVector;
-  const double* bPtr=  bVector;
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr = bVector;
  
-  __m256d aVal, bVal, cVal;
-  for(;number < quarterPoints; number++){
+    __m256d aVal, bVal, cVal;
+    for (; number < quarterPoints; number++) {
  
-    aVal = _mm256_load_pd(aPtr);
-    bVal = _mm256_load_pd(bPtr);
+        aVal = _mm256_load_pd(aPtr);
+        bVal = _mm256_load_pd(bPtr);
  
-    cVal = _mm256_min_pd(aVal, bVal);
+        cVal = _mm256_min_pd(aVal, bVal);
  
-    _mm256_store_pd(cPtr,cVal); // Store the results back into the C container
+        _mm256_store_pd(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 4;
-    bPtr += 4;
-    cPtr += 4;
-  }
+        aPtr += 4;
+        bPtr += 4;
+        cPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    const double a = *aPtr++;
-    const double b = *bPtr++;
-    *cPtr++ = ( a < b ? a : b);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        const double a = *aPtr++;
+        const double b = *bPtr++;
+        *cPtr++ = (a < b ? a : b);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
@@ -155,58 +158,60 @@ volk_64f_x2_min_64f_a_avx(double* cVector, const double* aVector,
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void
-volk_64f_x2_min_64f_a_sse2(double* cVector, const double* aVector,
-                           const double* bVector, unsigned int num_points)
+static inline void volk_64f_x2_min_64f_a_sse2(double* cVector,
+                                              const double* aVector,
+                                              const double* bVector,
+                                              unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int halfPoints = num_points / 2;
+    unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 2;
  
-  double* cPtr = cVector;
-  const double* aPtr = aVector;
-  const double* bPtr=  bVector;
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr = bVector;
  
-  __m128d aVal, bVal, cVal;
-  for(;number < halfPoints; number++){
+    __m128d aVal, bVal, cVal;
+    for (; number < halfPoints; number++) {
  
-    aVal = _mm_load_pd(aPtr);
-    bVal = _mm_load_pd(bPtr);
+        aVal = _mm_load_pd(aPtr);
+        bVal = _mm_load_pd(bPtr);
  
-    cVal = _mm_min_pd(aVal, bVal);
+        cVal = _mm_min_pd(aVal, bVal);
  
-    _mm_store_pd(cPtr,cVal); // Store the results back into the C container
+        _mm_store_pd(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 2;
-    bPtr += 2;
-    cPtr += 2;
-  }
+        aPtr += 2;
+        bPtr += 2;
+        cPtr += 2;
+    }
  
-  number = halfPoints * 2;
-  for(;number < num_points; number++){
-    const double a = *aPtr++;
-    const double b = *bPtr++;
-    *cPtr++ = ( a < b ? a : b);
-  }
+    number = halfPoints * 2;
+    for (; number < num_points; number++) {
+        const double a = *aPtr++;
+        const double b = *bPtr++;
+        *cPtr++ = (a < b ? a : b);
+    }
  }
  #endif /* LV_HAVE_SSE2 */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_64f_x2_min_64f_generic(double* cVector, const double* aVector,
-                            const double* bVector, unsigned int num_points)
+static inline void volk_64f_x2_min_64f_generic(double* cVector,
+                                               const double* aVector,
+                                               const double* bVector,
+                                               unsigned int num_points)
  {
-  double* cPtr = cVector;
-  const double* aPtr = aVector;
-  const double* bPtr=  bVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    const double a = *aPtr++;
-    const double b = *bPtr++;
-    *cPtr++ = ( a < b ? a : b);
-  }
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr = bVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        const double a = *aPtr++;
+        const double b = *bPtr++;
+        *cPtr++ = (a < b ? a : b);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -222,38 +227,39 @@ volk_64f_x2_min_64f_generic(double* cVector, const double* aVector,
  #ifdef LV_HAVE_AVX512F
  #include <immintrin.h>
  
-static inline void
-volk_64f_x2_min_64f_u_avx512f(double* cVector, const double* aVector,
-                           const double* bVector, unsigned int num_points)
+static inline void volk_64f_x2_min_64f_u_avx512f(double* cVector,
+                                                 const double* aVector,
+                                                 const double* bVector,
+                                                 unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int eigthPoints = num_points / 8;
+    unsigned int number = 0;
+    const unsigned int eigthPoints = num_points / 8;
  
-  double* cPtr = cVector;
-  const double* aPtr = aVector;
-  const double* bPtr=  bVector;
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr = bVector;
  
-  __m512d aVal, bVal, cVal;
-  for(;number < eigthPoints; number++){
+    __m512d aVal, bVal, cVal;
+    for (; number < eigthPoints; number++) {
  
-    aVal = _mm512_loadu_pd(aPtr);
-    bVal = _mm512_loadu_pd(bPtr);
+        aVal = _mm512_loadu_pd(aPtr);
+        bVal = _mm512_loadu_pd(bPtr);
  
-    cVal = _mm512_min_pd(aVal, bVal);
+        cVal = _mm512_min_pd(aVal, bVal);
  
-    _mm512_storeu_pd(cPtr,cVal); // Store the results back into the C container
+        _mm512_storeu_pd(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 8;
-    bPtr += 8;
-    cPtr += 8;
-  }
+        aPtr += 8;
+        bPtr += 8;
+        cPtr += 8;
+    }
  
-  number = eigthPoints * 8;
-  for(;number < num_points; number++){
-    const double a = *aPtr++;
-    const double b = *bPtr++;
-    *cPtr++ = ( a < b ? a : b);
-  }
+    number = eigthPoints * 8;
+    for (; number < num_points; number++) {
+        const double a = *aPtr++;
+        const double b = *bPtr++;
+        *cPtr++ = (a < b ? a : b);
+    }
  }
  #endif /* LV_HAVE_AVX512F */
  
@@ -261,38 +267,39 @@ volk_64f_x2_min_64f_u_avx512f(double* cVector, const double* aVector,
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_64f_x2_min_64f_u_avx(double* cVector, const double* aVector,
-                           const double* bVector, unsigned int num_points)
+static inline void volk_64f_x2_min_64f_u_avx(double* cVector,
+                                             const double* aVector,
+                                             const double* bVector,
+                                             unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
  
-  double* cPtr = cVector;
-  const double* aPtr = aVector;
-  const double* bPtr=  bVector;
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr = bVector;
  
-  __m256d aVal, bVal, cVal;
-  for(;number < quarterPoints; number++){
+    __m256d aVal, bVal, cVal;
+    for (; number < quarterPoints; number++) {
  
-    aVal = _mm256_loadu_pd(aPtr);
-    bVal = _mm256_loadu_pd(bPtr);
+        aVal = _mm256_loadu_pd(aPtr);
+        bVal = _mm256_loadu_pd(bPtr);
  
-    cVal = _mm256_min_pd(aVal, bVal);
+        cVal = _mm256_min_pd(aVal, bVal);
  
-    _mm256_storeu_pd(cPtr,cVal); // Store the results back into the C container
+        _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 4;
-    bPtr += 4;
-    cPtr += 4;
-  }
+        aPtr += 4;
+        bPtr += 4;
+        cPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    const double a = *aPtr++;
-    const double b = *bPtr++;
-    *cPtr++ = ( a < b ? a : b);
-  }
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        const double a = *aPtr++;
+        const double b = *bPtr++;
+        *cPtr++ = (a < b ? a : b);
+    }
  }
  #endif /* LV_HAVE_AVX */
  
diff --git a/kernels/volk/volk_64f_x2_multiply_64f.h b/kernels/volk/volk_64f_x2_multiply_64f.h

index 6fa9e8ec675ce1106e8399011350e656123cb7b0..39a155da7fc4168f229ba41d4c85a905b609757d 100644 (file)
--- a/kernels/volk/volk_64f_x2_multiply_64f.h
+++ b/kernels/volk/volk_64f_x2_multiply_64f.h
@@ -31,8 +31,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_64f_x2_multiply_64f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
- * \endcode
+ * void volk_64f_x2_multiply_64f(float* cVector, const float* aVector, const float*
+ * bVector, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li aVector: First input vector.
@@ -76,18 +76,19 @@
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_64f_x2_multiply_64f_generic(double *cVector, const double *aVector,
-                                 const double *bVector, unsigned int num_points)
+static inline void volk_64f_x2_multiply_64f_generic(double* cVector,
+                                                    const double* aVector,
+                                                    const double* bVector,
+                                                    unsigned int num_points)
  {
-  double *cPtr = cVector;
-  const double *aPtr = aVector;
-  const double *bPtr = bVector;
-  unsigned int number = 0;
-
-  for (number = 0; number < num_points; number++) {
-    *cPtr++ = (*aPtr++) * (*bPtr++);
-  }
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr = bVector;
+    unsigned int number = 0;
+
+    for (number = 0; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
  }
  
  #endif /* LV_HAVE_GENERIC */
@@ -100,35 +101,36 @@ volk_64f_x2_multiply_64f_generic(double *cVector, const double *aVector,
  
  #include <emmintrin.h>
  
-static inline void
-volk_64f_x2_multiply_64f_u_sse2(double *cVector, const double *aVector,
-                                const double *bVector, unsigned int num_points)
+static inline void volk_64f_x2_multiply_64f_u_sse2(double* cVector,
+                                                   const double* aVector,
+                                                   const double* bVector,
+                                                   unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int half_points = num_points / 2;
+    unsigned int number = 0;
+    const unsigned int half_points = num_points / 2;
  
-  double *cPtr = cVector;
-  const double *aPtr = aVector;
-  const double *bPtr = bVector;
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr = bVector;
  
-  __m128d aVal, bVal, cVal;
-  for (; number < half_points; number++) {
-    aVal = _mm_loadu_pd(aPtr);
-    bVal = _mm_loadu_pd(bPtr);
+    __m128d aVal, bVal, cVal;
+    for (; number < half_points; number++) {
+        aVal = _mm_loadu_pd(aPtr);
+        bVal = _mm_loadu_pd(bPtr);
  
-    cVal = _mm_mul_pd(aVal, bVal);
+        cVal = _mm_mul_pd(aVal, bVal);
  
-    _mm_storeu_pd(cPtr, cVal); // Store the results back into the C container
+        _mm_storeu_pd(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 2;
-    bPtr += 2;
-    cPtr += 2;
-  }
+        aPtr += 2;
+        bPtr += 2;
+        cPtr += 2;
+    }
  
-  number = half_points * 2;
-  for (; number < num_points; number++) {
-    *cPtr++ = (*aPtr++) * (*bPtr++);
-  }
+    number = half_points * 2;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
  }
  
  #endif /* LV_HAVE_SSE2 */
@@ -138,36 +140,37 @@ volk_64f_x2_multiply_64f_u_sse2(double *cVector, const double *aVector,
  
  #include <immintrin.h>
  
-static inline void
-volk_64f_x2_multiply_64f_u_avx(double *cVector, const double *aVector,
-                               const double *bVector, unsigned int num_points)
+static inline void volk_64f_x2_multiply_64f_u_avx(double* cVector,
+                                                  const double* aVector,
+                                                  const double* bVector,
+                                                  unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarter_points = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarter_points = num_points / 4;
  
-  double *cPtr = cVector;
-  const double *aPtr = aVector;
-  const double *bPtr = bVector;
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr = bVector;
  
-  __m256d aVal, bVal, cVal;
-  for (; number < quarter_points; number++) {
+    __m256d aVal, bVal, cVal;
+    for (; number < quarter_points; number++) {
  
-    aVal = _mm256_loadu_pd(aPtr);
-    bVal = _mm256_loadu_pd(bPtr);
+        aVal = _mm256_loadu_pd(aPtr);
+        bVal = _mm256_loadu_pd(bPtr);
  
-    cVal = _mm256_mul_pd(aVal, bVal);
+        cVal = _mm256_mul_pd(aVal, bVal);
  
-    _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container
+        _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 4;
-    bPtr += 4;
-    cPtr += 4;
-  }
+        aPtr += 4;
+        bPtr += 4;
+        cPtr += 4;
+    }
  
-  number = quarter_points * 4;
-  for (; number < num_points; number++) {
-    *cPtr++ = (*aPtr++) * (*bPtr++);
-  }
+    number = quarter_points * 4;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX */
@@ -180,35 +183,36 @@ volk_64f_x2_multiply_64f_u_avx(double *cVector, const double *aVector,
  
  #include <emmintrin.h>
  
-static inline void
-volk_64f_x2_multiply_64f_a_sse2(double *cVector, const double *aVector,
-                                const double *bVector, unsigned int num_points)
+static inline void volk_64f_x2_multiply_64f_a_sse2(double* cVector,
+                                                   const double* aVector,
+                                                   const double* bVector,
+                                                   unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int half_points = num_points / 2;
+    unsigned int number = 0;
+    const unsigned int half_points = num_points / 2;
  
-  double *cPtr = cVector;
-  const double *aPtr = aVector;
-  const double *bPtr = bVector;
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr = bVector;
  
-  __m128d aVal, bVal, cVal;
-  for (; number < half_points; number++) {
-    aVal = _mm_load_pd(aPtr);
-    bVal = _mm_load_pd(bPtr);
+    __m128d aVal, bVal, cVal;
+    for (; number < half_points; number++) {
+        aVal = _mm_load_pd(aPtr);
+        bVal = _mm_load_pd(bPtr);
  
-    cVal = _mm_mul_pd(aVal, bVal);
+        cVal = _mm_mul_pd(aVal, bVal);
  
-    _mm_store_pd(cPtr, cVal); // Store the results back into the C container
+        _mm_store_pd(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 2;
-    bPtr += 2;
-    cPtr += 2;
-  }
+        aPtr += 2;
+        bPtr += 2;
+        cPtr += 2;
+    }
  
-  number = half_points * 2;
-  for (; number < num_points; number++) {
-    *cPtr++ = (*aPtr++) * (*bPtr++);
-  }
+    number = half_points * 2;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
  }
  
  #endif /* LV_HAVE_SSE2 */
@@ -218,36 +222,37 @@ volk_64f_x2_multiply_64f_a_sse2(double *cVector, const double *aVector,
  
  #include <immintrin.h>
  
-static inline void
-volk_64f_x2_multiply_64f_a_avx(double *cVector, const double *aVector,
-                               const double *bVector, unsigned int num_points)
+static inline void volk_64f_x2_multiply_64f_a_avx(double* cVector,
+                                                  const double* aVector,
+                                                  const double* bVector,
+                                                  unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarter_points = num_points / 4;
+    unsigned int number = 0;
+    const unsigned int quarter_points = num_points / 4;
  
-  double *cPtr = cVector;
-  const double *aPtr = aVector;
-  const double *bPtr = bVector;
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr = bVector;
  
-  __m256d aVal, bVal, cVal;
-  for (; number < quarter_points; number++) {
+    __m256d aVal, bVal, cVal;
+    for (; number < quarter_points; number++) {
  
-    aVal = _mm256_load_pd(aPtr);
-    bVal = _mm256_load_pd(bPtr);
+        aVal = _mm256_load_pd(aPtr);
+        bVal = _mm256_load_pd(bPtr);
  
-    cVal = _mm256_mul_pd(aVal, bVal);
+        cVal = _mm256_mul_pd(aVal, bVal);
  
-    _mm256_store_pd(cPtr, cVal); // Store the results back into the C container
+        _mm256_store_pd(cPtr, cVal); // Store the results back into the C container
  
-    aPtr += 4;
-    bPtr += 4;
-    cPtr += 4;
-  }
+        aPtr += 4;
+        bPtr += 4;
+        cPtr += 4;
+    }
  
-  number = quarter_points * 4;
-  for (; number < num_points; number++) {
-    *cPtr++ = (*aPtr++) * (*bPtr++);
-  }
+    number = quarter_points * 4;
+    for (; number < num_points; number++) {
+        *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
  }
  
  #endif /* LV_HAVE_AVX */
diff --git a/kernels/volk/volk_64u_byteswap.h b/kernels/volk/volk_64u_byteswap.h

index 96e06617702ec5d49bf49b032f5883d3bf52b5dc..38621a43d563eeffb2a0544738eae793c3a13c09 100644 (file)
--- a/kernels/volk/volk_64u_byteswap.h
+++ b/kernels/volk/volk_64u_byteswap.h
@@ -72,71 +72,77 @@
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points){
+static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points)
+{
      uint32_t* inputPtr = (uint32_t*)intsToSwap;
      __m128i input, byte1, byte2, byte3, byte4, output;
      __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
      __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
      uint64_t number = 0;
      const unsigned int halfPoints = num_points / 2;
-    for(;number < halfPoints; number++){
-      // Load the 32t values, increment inputPtr later since we're doing it in-place.
-      input = _mm_loadu_si128((__m128i*)inputPtr);
-
-      // Do the four shifts
-      byte1 = _mm_slli_epi32(input, 24);
-      byte2 = _mm_slli_epi32(input, 8);
-      byte3 = _mm_srli_epi32(input, 8);
-      byte4 = _mm_srli_epi32(input, 24);
-      // Or bytes together
-      output = _mm_or_si128(byte1, byte4);
-      byte2 = _mm_and_si128(byte2, byte2mask);
-      output = _mm_or_si128(output, byte2);
-      byte3 = _mm_and_si128(byte3, byte3mask);
-      output = _mm_or_si128(output, byte3);
-
-      // Reorder the two words
-      output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
-
-      // Store the results
-      _mm_storeu_si128((__m128i*)inputPtr, output);
-      inputPtr += 4;
+    for (; number < halfPoints; number++) {
+        // Load the 32t values, increment inputPtr later since we're doing it in-place.
+        input = _mm_loadu_si128((__m128i*)inputPtr);
+
+        // Do the four shifts
+        byte1 = _mm_slli_epi32(input, 24);
+        byte2 = _mm_slli_epi32(input, 8);
+        byte3 = _mm_srli_epi32(input, 8);
+        byte4 = _mm_srli_epi32(input, 24);
+        // Or bytes together
+        output = _mm_or_si128(byte1, byte4);
+        byte2 = _mm_and_si128(byte2, byte2mask);
+        output = _mm_or_si128(output, byte2);
+        byte3 = _mm_and_si128(byte3, byte3mask);
+        output = _mm_or_si128(output, byte3);
+
+        // Reorder the two words
+        output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
+
+        // Store the results
+        _mm_storeu_si128((__m128i*)inputPtr, output);
+        inputPtr += 4;
      }
  
      // Byteswap any remaining points:
-    number = halfPoints*2;
-    for(; number < num_points; number++){
-    uint32_t output1 = *inputPtr;
-    uint32_t output2 = inputPtr[1];
+    number = halfPoints * 2;
+    for (; number < num_points; number++) {
+        uint32_t output1 = *inputPtr;
+        uint32_t output2 = inputPtr[1];
  
-    output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+        output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
+                   ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
  
-    output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+        output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
+                   ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
  
-    *inputPtr++ = output2;
-    *inputPtr++ = output1;
+        *inputPtr++ = output2;
+        *inputPtr++ = output1;
      }
  }
  #endif /* LV_HAVE_SSE2 */
  
  
-
  #ifdef LV_HAVE_GENERIC
  
-static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap, unsigned int num_points){
-  uint32_t* inputPtr = (uint32_t*)intsToSwap;
-  unsigned int point;
-  for(point = 0; point < num_points; point++){
-    uint32_t output1 = *inputPtr;
-    uint32_t output2 = inputPtr[1];
+static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap,
+                                             unsigned int num_points)
+{
+    uint32_t* inputPtr = (uint32_t*)intsToSwap;
+    unsigned int point;
+    for (point = 0; point < num_points; point++) {
+        uint32_t output1 = *inputPtr;
+        uint32_t output2 = inputPtr[1];
  
-    output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+        output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
+                   ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
  
-    output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+        output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
+                   ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
  
-    *inputPtr++ = output2;
-    *inputPtr++ = output1;
-  }
+        *inputPtr++ = output2;
+        *inputPtr++ = output1;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -144,47 +150,47 @@ static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap, unsigned int
  #include <immintrin.h>
  static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap, unsigned int num_points)
  {
-  unsigned int number = 0;
-
-  const unsigned int nPerSet = 4;
-  const uint64_t     nSets   = num_points / nPerSet;
+    unsigned int number = 0;
  
-  uint32_t* inputPtr = (uint32_t*)intsToSwap;
+    const unsigned int nPerSet = 4;
+    const uint64_t nSets = num_points / nPerSet;
  
-  const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
+    uint32_t* inputPtr = (uint32_t*)intsToSwap;
  
-  const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector[0]);
+    const uint8_t shuffleVector[32] = { 7,  6,  5,  4,  3,  2,  1,  0,  15, 14, 13,
+                                        12, 11, 10, 9,  8,  23, 22, 21, 20, 19, 18,
+                                        17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
  
-  for ( ;number < nSets; number++ ) {
+    const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
  
-    // Load the 32t values, increment inputPtr later since we're doing it in-place.
-    const __m256i input  = _mm256_load_si256((__m256i*)inputPtr);
-    const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
+    for (; number < nSets; number++) {
  
-    // Store the results
-    _mm256_store_si256((__m256i*)inputPtr, output);
+        // Load the 32t values, increment inputPtr later since we're doing it in-place.
+        const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
+        const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
  
-    /*  inputPtr is 32bit so increment twice  */
-    inputPtr += 2 * nPerSet;
-  }
-  _mm256_zeroupper();
+        // Store the results
+        _mm256_store_si256((__m256i*)inputPtr, output);
  
-  // Byteswap any remaining points:
-  for(number = nSets * nPerSet; number < num_points; ++number ) {
-    uint32_t output1 = *inputPtr;
-    uint32_t output2 = inputPtr[1];
-    uint32_t out1 = ((((output1) >> 24) & 0x000000ff) |
-                    (((output1) >>  8) & 0x0000ff00) |
-                    (((output1) <<  8) & 0x00ff0000) |
-                    (((output1) << 24) & 0xff000000)   );
+        /*  inputPtr is 32bit so increment twice  */
+        inputPtr += 2 * nPerSet;
+    }
+    _mm256_zeroupper();
  
-    uint32_t out2 = ((((output2) >> 24) & 0x000000ff) |
-                    (((output2) >>  8) & 0x0000ff00) |
-                    (((output2) <<  8) & 0x00ff0000) |
-                    (((output2) << 24) & 0xff000000)   );
-    *inputPtr++ = out2;
-    *inputPtr++ = out1;
-  }
+    // Byteswap any remaining points:
+    for (number = nSets * nPerSet; number < num_points; ++number) {
+        uint32_t output1 = *inputPtr;
+        uint32_t output2 = inputPtr[1];
+        uint32_t out1 =
+            ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
+             (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
+
+        uint32_t out2 =
+            ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
+             (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
+        *inputPtr++ = out2;
+        *inputPtr++ = out1;
+    }
  }
  
  #endif /* LV_HAVE_AVX2 */
@@ -192,48 +198,47 @@ static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap, unsigned int n
  
  #if LV_HAVE_SSSE3
  #include <tmmintrin.h>
-static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap, unsigned int num_points)
+static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap,
+                                             unsigned int num_points)
  {
-  unsigned int number = 0;
+    unsigned int number = 0;
  
-  const unsigned int nPerSet = 2;
-  const uint64_t     nSets   = num_points / nPerSet;
+    const unsigned int nPerSet = 2;
+    const uint64_t nSets = num_points / nPerSet;
  
-  uint32_t* inputPtr = (uint32_t*)intsToSwap;
-
-  uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+    uint32_t* inputPtr = (uint32_t*)intsToSwap;
  
-  const __m128i myShuffle = _mm_loadu_si128((__m128i*) &shuffleVector);
+    uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
  
-  for ( ;number < nSets; number++ ) {
+    const __m128i myShuffle = _mm_loadu_si128((__m128i*)&shuffleVector);
  
-    // Load the 32t values, increment inputPtr later since we're doing it in-place.
-    const __m128i input  = _mm_load_si128((__m128i*)inputPtr);
-    const __m128i output = _mm_shuffle_epi8(input,myShuffle);
+    for (; number < nSets; number++) {
  
-    // Store the results
-    _mm_store_si128((__m128i*)inputPtr, output);
+        // Load the 32t values, increment inputPtr later since we're doing it in-place.
+        const __m128i input = _mm_load_si128((__m128i*)inputPtr);
+        const __m128i output = _mm_shuffle_epi8(input, myShuffle);
  
-    /*  inputPtr is 32bit so increment twice  */
-    inputPtr += 2 * nPerSet;
-  }
+        // Store the results
+        _mm_store_si128((__m128i*)inputPtr, output);
  
-  // Byteswap any remaining points:
-  for(number = nSets * nPerSet; number < num_points; ++number ) {
-    uint32_t output1 = *inputPtr;
-    uint32_t output2 = inputPtr[1];
-    uint32_t out1 = ((((output1) >> 24) & 0x000000ff) |
-                    (((output1) >>  8) & 0x0000ff00) |
-                    (((output1) <<  8) & 0x00ff0000) |
-                    (((output1) << 24) & 0xff000000)   );
+        /*  inputPtr is 32bit so increment twice  */
+        inputPtr += 2 * nPerSet;
+    }
  
-    uint32_t out2 = ((((output2) >> 24) & 0x000000ff) |
-                    (((output2) >>  8) & 0x0000ff00) |
-                    (((output2) <<  8) & 0x00ff0000) |
-                    (((output2) << 24) & 0xff000000)   );
-    *inputPtr++ = out2;
-    *inputPtr++ = out1;
-  }
+    // Byteswap any remaining points:
+    for (number = nSets * nPerSet; number < num_points; ++number) {
+        uint32_t output1 = *inputPtr;
+        uint32_t output2 = inputPtr[1];
+        uint32_t out1 =
+            ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
+             (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
+
+        uint32_t out2 =
+            ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
+             (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
+        *inputPtr++ = out2;
+        *inputPtr++ = out1;
+    }
  }
  #endif /* LV_HAVE_SSSE3 */
  
@@ -241,86 +246,90 @@ static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap, unsigned int
  #ifdef LV_HAVE_NEONV8
  #include <arm_neon.h>
  
-static inline void volk_64u_byteswap_neonv8(uint64_t* intsToSwap, unsigned int num_points){
-  uint32_t* inputPtr = (uint32_t*)intsToSwap;
-  const unsigned int n4points = num_points / 4;
-  uint8x16x2_t input;
-  uint8x16_t idx = { 7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8 };
-
-  unsigned int number = 0;
-  for(number = 0; number < n4points; ++number){
-    __VOLK_PREFETCH(inputPtr+8);
-    input = vld2q_u8((uint8_t*) inputPtr);
-    input.val[0] = vqtbl1q_u8(input.val[0], idx);
-    input.val[1] = vqtbl1q_u8(input.val[1], idx);
-    vst2q_u8((uint8_t*) inputPtr, input);
-
-    inputPtr += 8;
-  }
-
-  for(number = n4points * 4; number < num_points; ++number){
-    uint32_t output1 = *inputPtr;
-    uint32_t output2 =  inputPtr[1];
+static inline void volk_64u_byteswap_neonv8(uint64_t* intsToSwap, unsigned int num_points)
+{
+    uint32_t* inputPtr = (uint32_t*)intsToSwap;
+    const unsigned int n4points = num_points / 4;
+    uint8x16x2_t input;
+    uint8x16_t idx = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+
+    unsigned int number = 0;
+    for (number = 0; number < n4points; ++number) {
+        __VOLK_PREFETCH(inputPtr + 8);
+        input = vld2q_u8((uint8_t*)inputPtr);
+        input.val[0] = vqtbl1q_u8(input.val[0], idx);
+        input.val[1] = vqtbl1q_u8(input.val[1], idx);
+        vst2q_u8((uint8_t*)inputPtr, input);
+
+        inputPtr += 8;
+    }
  
-    output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
-    output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+    for (number = n4points * 4; number < num_points; ++number) {
+        uint32_t output1 = *inputPtr;
+        uint32_t output2 = inputPtr[1];
  
-    *inputPtr++ = output2;
-    *inputPtr++ = output1;
-  }
+        output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
+                   ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+        output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
+                   ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
  
+        *inputPtr++ = output2;
+        *inputPtr++ = output1;
+    }
  }
  #else
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num_points){
-  uint32_t* inputPtr = (uint32_t*)intsToSwap;
-  unsigned int number = 0;
-  unsigned int n8points = num_points / 4;
-
-  uint8x8x4_t input_table;
-  uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
-  uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
-
-  /* these magic numbers are used as byte-indices in the LUT.
-     they are pre-computed to save time. A simple C program
-     can calculate them; for example for lookup01:
-    uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1};
-    for(ii=0; ii < 8; ++ii) {
-        index += ((uint64_t)(*(chars+ii))) << (ii*8);
+static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num_points)
+{
+    uint32_t* inputPtr = (uint32_t*)intsToSwap;
+    unsigned int number = 0;
+    unsigned int n8points = num_points / 4;
+
+    uint8x8x4_t input_table;
+    uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
+    uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
+
+    /* these magic numbers are used as byte-indices in the LUT.
+       they are pre-computed to save time. A simple C program
+       can calculate them; for example for lookup01:
+      uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1};
+      for(ii=0; ii < 8; ++ii) {
+          index += ((uint64_t)(*(chars+ii))) << (ii*8);
+      }
+    */
+    int_lookup01 = vcreate_u8(2269495096316185);
+    int_lookup23 = vcreate_u8(146949840772469531);
+    int_lookup45 = vcreate_u8(291630186448622877);
+    int_lookup67 = vcreate_u8(436310532124776223);
+
+    for (number = 0; number < n8points; ++number) {
+        input_table = vld4_u8((uint8_t*)inputPtr);
+        swapped_int01 = vtbl4_u8(input_table, int_lookup01);
+        swapped_int23 = vtbl4_u8(input_table, int_lookup23);
+        swapped_int45 = vtbl4_u8(input_table, int_lookup45);
+        swapped_int67 = vtbl4_u8(input_table, int_lookup67);
+        vst1_u8((uint8_t*)inputPtr, swapped_int01);
+        vst1_u8((uint8_t*)(inputPtr + 2), swapped_int23);
+        vst1_u8((uint8_t*)(inputPtr + 4), swapped_int45);
+        vst1_u8((uint8_t*)(inputPtr + 6), swapped_int67);
+
+        inputPtr += 4;
      }
-  */
-  int_lookup01 = vcreate_u8(2269495096316185);
-  int_lookup23 = vcreate_u8(146949840772469531);
-  int_lookup45 = vcreate_u8(291630186448622877);
-  int_lookup67 = vcreate_u8(436310532124776223);
-
-  for(number = 0; number < n8points; ++number){
-    input_table = vld4_u8((uint8_t*) inputPtr);
-    swapped_int01 = vtbl4_u8(input_table, int_lookup01);
-    swapped_int23 = vtbl4_u8(input_table, int_lookup23);
-    swapped_int45 = vtbl4_u8(input_table, int_lookup45);
-    swapped_int67 = vtbl4_u8(input_table, int_lookup67);
-    vst1_u8((uint8_t*) inputPtr, swapped_int01);
-    vst1_u8((uint8_t*) (inputPtr+2), swapped_int23);
-    vst1_u8((uint8_t*) (inputPtr+4), swapped_int45);
-    vst1_u8((uint8_t*) (inputPtr+6), swapped_int67);
-
-    inputPtr += 4;
-  }
-
-  for(number = n8points * 4; number < num_points; ++number){
-    uint32_t output1 = *inputPtr;
-    uint32_t output2 = inputPtr[1];
-
-    output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
-    output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
-
-    *inputPtr++ = output2;
-    *inputPtr++ = output1;
-  }
  
+    for (number = n8points * 4; number < num_points; ++number) {
+        uint32_t output1 = *inputPtr;
+        uint32_t output2 = inputPtr[1];
+
+        output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
+                   ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+        output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
+                   ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+
+        *inputPtr++ = output2;
+        *inputPtr++ = output1;
+    }
  }
  #endif /* LV_HAVE_NEON */
  #endif
@@ -336,49 +345,52 @@ static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num
  #ifdef LV_HAVE_SSE2
  #include <emmintrin.h>
  
-static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int num_points){
+static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int num_points)
+{
      uint32_t* inputPtr = (uint32_t*)intsToSwap;
      __m128i input, byte1, byte2, byte3, byte4, output;
      __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
      __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
      uint64_t number = 0;
      const unsigned int halfPoints = num_points / 2;
-    for(;number < halfPoints; number++){
-      // Load the 32t values, increment inputPtr later since we're doing it in-place.
-      input = _mm_load_si128((__m128i*)inputPtr);
-
-      // Do the four shifts
-      byte1 = _mm_slli_epi32(input, 24);
-      byte2 = _mm_slli_epi32(input, 8);
-      byte3 = _mm_srli_epi32(input, 8);
-      byte4 = _mm_srli_epi32(input, 24);
-      // Or bytes together
-      output = _mm_or_si128(byte1, byte4);
-      byte2 = _mm_and_si128(byte2, byte2mask);
-      output = _mm_or_si128(output, byte2);
-      byte3 = _mm_and_si128(byte3, byte3mask);
-      output = _mm_or_si128(output, byte3);
-
-      // Reorder the two words
-      output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
-
-      // Store the results
-      _mm_store_si128((__m128i*)inputPtr, output);
-      inputPtr += 4;
+    for (; number < halfPoints; number++) {
+        // Load the 32t values, increment inputPtr later since we're doing it in-place.
+        input = _mm_load_si128((__m128i*)inputPtr);
+
+        // Do the four shifts
+        byte1 = _mm_slli_epi32(input, 24);
+        byte2 = _mm_slli_epi32(input, 8);
+        byte3 = _mm_srli_epi32(input, 8);
+        byte4 = _mm_srli_epi32(input, 24);
+        // Or bytes together
+        output = _mm_or_si128(byte1, byte4);
+        byte2 = _mm_and_si128(byte2, byte2mask);
+        output = _mm_or_si128(output, byte2);
+        byte3 = _mm_and_si128(byte3, byte3mask);
+        output = _mm_or_si128(output, byte3);
+
+        // Reorder the two words
+        output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
+
+        // Store the results
+        _mm_store_si128((__m128i*)inputPtr, output);
+        inputPtr += 4;
      }
  
      // Byteswap any remaining points:
-    number = halfPoints*2;
-    for(; number < num_points; number++){
-      uint32_t output1 = *inputPtr;
-      uint32_t output2 = inputPtr[1];
+    number = halfPoints * 2;
+    for (; number < num_points; number++) {
+        uint32_t output1 = *inputPtr;
+        uint32_t output2 = inputPtr[1];
  
-      output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+        output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
+                   ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
  
-      output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+        output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
+                   ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
  
-      *inputPtr++ = output2;
-      *inputPtr++ = output1;
+        *inputPtr++ = output2;
+        *inputPtr++ = output1;
      }
  }
  #endif /* LV_HAVE_SSE2 */
@@ -387,46 +399,46 @@ static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int n
  #include <immintrin.h>
  static inline void volk_64u_byteswap_u_avx2(uint64_t* intsToSwap, unsigned int num_points)
  {
-  unsigned int number = 0;
-
-  const unsigned int nPerSet = 4;
-  const uint64_t     nSets   = num_points / nPerSet;
-
-  uint32_t* inputPtr = (uint32_t*)intsToSwap;
-
-  const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
-
-  const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector[0]);
-
-  for ( ;number < nSets; number++ ) {
-    // Load the 32t values, increment inputPtr later since we're doing it in-place.
-    const __m256i input  = _mm256_loadu_si256((__m256i*)inputPtr);
-    const __m256i output = _mm256_shuffle_epi8(input,myShuffle);
-
-    // Store the results
-    _mm256_storeu_si256((__m256i*)inputPtr, output);
-
-    /*  inputPtr is 32bit so increment twice  */
-    inputPtr += 2 * nPerSet;
-  }
-  _mm256_zeroupper();
-
-  // Byteswap any remaining points:
-  for(number = nSets * nPerSet; number < num_points; ++number ) {
-    uint32_t output1 = *inputPtr;
-    uint32_t output2 = inputPtr[1];
-    uint32_t out1 = ((((output1) >> 24) & 0x000000ff) |
-                    (((output1) >>  8) & 0x0000ff00) |
-                    (((output1) <<  8) & 0x00ff0000) |
-                    (((output1) << 24) & 0xff000000)   );
-
-    uint32_t out2 = ((((output2) >> 24) & 0x000000ff) |
-                    (((output2) >>  8) & 0x0000ff00) |
-                    (((output2) <<  8) & 0x00ff0000) |
-                    (((output2) << 24) & 0xff000000)   );
-    *inputPtr++ = out2;
-    *inputPtr++ = out1;
-  }
+    unsigned int number = 0;
+
+    const unsigned int nPerSet = 4;
+    const uint64_t nSets = num_points / nPerSet;
+
+    uint32_t* inputPtr = (uint32_t*)intsToSwap;
+
+    const uint8_t shuffleVector[32] = { 7,  6,  5,  4,  3,  2,  1,  0,  15, 14, 13,
+                                        12, 11, 10, 9,  8,  23, 22, 21, 20, 19, 18,
+                                        17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
+
+    const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
+
+    for (; number < nSets; number++) {
+        // Load the 32t values, increment inputPtr later since we're doing it in-place.
+        const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
+        const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
+
+        // Store the results
+        _mm256_storeu_si256((__m256i*)inputPtr, output);
+
+        /*  inputPtr is 32bit so increment twice  */
+        inputPtr += 2 * nPerSet;
+    }
+    _mm256_zeroupper();
+
+    // Byteswap any remaining points:
+    for (number = nSets * nPerSet; number < num_points; ++number) {
+        uint32_t output1 = *inputPtr;
+        uint32_t output2 = inputPtr[1];
+        uint32_t out1 =
+            ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
+             (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
+
+        uint32_t out2 =
+            ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
+             (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
+        *inputPtr++ = out2;
+        *inputPtr++ = out1;
+    }
  }
  
  #endif /* LV_HAVE_AVX2 */
@@ -434,70 +446,71 @@ static inline void volk_64u_byteswap_u_avx2(uint64_t* intsToSwap, unsigned int n
  
  #if LV_HAVE_SSSE3
  #include <tmmintrin.h>
-static inline void volk_64u_byteswap_u_ssse3(uint64_t* intsToSwap, unsigned int num_points)
+static inline void volk_64u_byteswap_u_ssse3(uint64_t* intsToSwap,
+                                             unsigned int num_points)
  {
-  unsigned int number = 0;
-
-  const unsigned int nPerSet = 2;
-  const uint64_t     nSets   = num_points / nPerSet;
+    unsigned int number = 0;
  
-  uint32_t* inputPtr = (uint32_t*)intsToSwap;
+    const unsigned int nPerSet = 2;
+    const uint64_t nSets = num_points / nPerSet;
  
-  uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+    uint32_t* inputPtr = (uint32_t*)intsToSwap;
  
-  const __m128i myShuffle = _mm_loadu_si128((__m128i*) &shuffleVector);
+    uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
  
-  for ( ;number < nSets; number++ ) {
-    // Load the 32t values, increment inputPtr later since we're doing it in-place.
-    const __m128i input  = _mm_loadu_si128((__m128i*)inputPtr);
-    const __m128i output = _mm_shuffle_epi8(input,myShuffle);
+    const __m128i myShuffle = _mm_loadu_si128((__m128i*)&shuffleVector);
  
-    // Store the results
-    _mm_storeu_si128((__m128i*)inputPtr, output);
+    for (; number < nSets; number++) {
+        // Load the 32t values, increment inputPtr later since we're doing it in-place.
+        const __m128i input = _mm_loadu_si128((__m128i*)inputPtr);
+        const __m128i output = _mm_shuffle_epi8(input, myShuffle);
  
-    /*  inputPtr is 32bit so increment twice  */
-    inputPtr += 2 * nPerSet;
-  }
+        // Store the results
+        _mm_storeu_si128((__m128i*)inputPtr, output);
  
-  // Byteswap any remaining points:
-  for(number = nSets * nPerSet; number < num_points; ++number ) {
-    uint32_t output1 = *inputPtr;
-    uint32_t output2 = inputPtr[1];
-    uint32_t out1 = ((((output1) >> 24) & 0x000000ff) |
-                    (((output1) >>  8) & 0x0000ff00) |
-                    (((output1) <<  8) & 0x00ff0000) |
-                    (((output1) << 24) & 0xff000000)   );
+        /*  inputPtr is 32bit so increment twice  */
+        inputPtr += 2 * nPerSet;
+    }
  
-    uint32_t out2 = ((((output2) >> 24) & 0x000000ff) |
-                    (((output2) >>  8) & 0x0000ff00) |
-                    (((output2) <<  8) & 0x00ff0000) |
-                    (((output2) << 24) & 0xff000000)   );
-    *inputPtr++ = out2;
-    *inputPtr++ = out1;
-  }
+    // Byteswap any remaining points:
+    for (number = nSets * nPerSet; number < num_points; ++number) {
+        uint32_t output1 = *inputPtr;
+        uint32_t output2 = inputPtr[1];
+        uint32_t out1 =
+            ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
+             (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
+
+        uint32_t out2 =
+            ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
+             (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
+        *inputPtr++ = out2;
+        *inputPtr++ = out1;
+    }
  }
  #endif /* LV_HAVE_SSSE3 */
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap, unsigned int num_points){
-  uint32_t* inputPtr = (uint32_t*)intsToSwap;
-  unsigned int point;
-  for(point = 0; point < num_points; point++){
-    uint32_t output1 = *inputPtr;
-    uint32_t output2 = inputPtr[1];
+static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap,
+                                               unsigned int num_points)
+{
+    uint32_t* inputPtr = (uint32_t*)intsToSwap;
+    unsigned int point;
+    for (point = 0; point < num_points; point++) {
+        uint32_t output1 = *inputPtr;
+        uint32_t output2 = inputPtr[1];
  
-    output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+        output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
+                   ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
  
-    output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+        output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
+                   ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
  
-    *inputPtr++ = output2;
-    *inputPtr++ = output1;
-  }
+        *inputPtr++ = output2;
+        *inputPtr++ = output1;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  
-
-
  #endif /* INCLUDED_volk_64u_byteswap_a_H */
diff --git a/kernels/volk/volk_64u_byteswappuppet_64u.h b/kernels/volk/volk_64u_byteswappuppet_64u.h

index 2db0171c8348d22a5fbe80056e2181c1ebac15ee..ded54ee1b2faee51f4b8244601b74631a4acb2a1 100644 (file)
--- a/kernels/volk/volk_64u_byteswappuppet_64u.h
+++ b/kernels/volk/volk_64u_byteswappuppet_64u.h
@@ -3,87 +3,105 @@
  
  
  #include <stdint.h>
-#include <volk/volk_64u_byteswap.h>
  #include <string.h>
+#include <volk/volk_64u_byteswap.h>
  
  #ifdef LV_HAVE_GENERIC
-static inline void volk_64u_byteswappuppet_64u_generic(uint64_t*output, uint64_t* intsToSwap, unsigned int num_points){
+static inline void volk_64u_byteswappuppet_64u_generic(uint64_t* output,
+                                                       uint64_t* intsToSwap,
+                                                       unsigned int num_points)
+{
  
      volk_64u_byteswap_generic((uint64_t*)intsToSwap, num_points);
      memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
-
  }
  #endif
  
  #ifdef LV_HAVE_NEONV8
-static inline void volk_64u_byteswappuppet_64u_neonv8(uint64_t*output, uint64_t* intsToSwap, unsigned int num_points){
+static inline void volk_64u_byteswappuppet_64u_neonv8(uint64_t* output,
+                                                      uint64_t* intsToSwap,
+                                                      unsigned int num_points)
+{
  
      volk_64u_byteswap_neonv8((uint64_t*)intsToSwap, num_points);
      memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
-
  }
  #else
  #ifdef LV_HAVE_NEON
-static inline void volk_64u_byteswappuppet_64u_neon(uint64_t*output, uint64_t* intsToSwap, unsigned int num_points){
+static inline void volk_64u_byteswappuppet_64u_neon(uint64_t* output,
+                                                    uint64_t* intsToSwap,
+                                                    unsigned int num_points)
+{
  
      volk_64u_byteswap_neon((uint64_t*)intsToSwap, num_points);
      memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
-
  }
  #endif
  #endif
  
  #ifdef LV_HAVE_SSE2
-static inline void volk_64u_byteswappuppet_64u_u_sse2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){
+static inline void volk_64u_byteswappuppet_64u_u_sse2(uint64_t* output,
+                                                      uint64_t* intsToSwap,
+                                                      unsigned int num_points)
+{
  
      volk_64u_byteswap_u_sse2((uint64_t*)intsToSwap, num_points);
      memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
-
  }
  #endif
  
  #ifdef LV_HAVE_SSE2
-static inline void volk_64u_byteswappuppet_64u_a_sse2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){
+static inline void volk_64u_byteswappuppet_64u_a_sse2(uint64_t* output,
+                                                      uint64_t* intsToSwap,
+                                                      unsigned int num_points)
+{
  
      volk_64u_byteswap_a_sse2((uint64_t*)intsToSwap, num_points);
      memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
-
  }
  #endif
  
  #ifdef LV_HAVE_SSSE3
-static inline void volk_64u_byteswappuppet_64u_u_ssse3(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){
+static inline void volk_64u_byteswappuppet_64u_u_ssse3(uint64_t* output,
+                                                       uint64_t* intsToSwap,
+                                                       unsigned int num_points)
+{
  
      volk_64u_byteswap_u_ssse3((uint64_t*)intsToSwap, num_points);
      memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
-
  }
  #endif
  
  #ifdef LV_HAVE_SSSE3
-static inline void volk_64u_byteswappuppet_64u_a_ssse3(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){
+static inline void volk_64u_byteswappuppet_64u_a_ssse3(uint64_t* output,
+                                                       uint64_t* intsToSwap,
+                                                       unsigned int num_points)
+{
  
      volk_64u_byteswap_a_ssse3((uint64_t*)intsToSwap, num_points);
      memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
-
  }
  #endif
  
  #ifdef LV_HAVE_AVX2
-static inline void volk_64u_byteswappuppet_64u_u_avx2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){
+static inline void volk_64u_byteswappuppet_64u_u_avx2(uint64_t* output,
+                                                      uint64_t* intsToSwap,
+                                                      unsigned int num_points)
+{
  
      volk_64u_byteswap_u_avx2((uint64_t*)intsToSwap, num_points);
      memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
-
  }
  #endif
  
  #ifdef LV_HAVE_AVX2
-static inline void volk_64u_byteswappuppet_64u_a_avx2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){
+static inline void volk_64u_byteswappuppet_64u_a_avx2(uint64_t* output,
+                                                      uint64_t* intsToSwap,
+                                                      unsigned int num_points)
+{
  
      volk_64u_byteswap_a_avx2((uint64_t*)intsToSwap, num_points);
      memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
-
  }
  #endif
  
diff --git a/kernels/volk/volk_64u_popcnt.h b/kernels/volk/volk_64u_popcnt.h

index cbce2ec7c7d62c92df8e7115c0ab84b65a7bdabd..43c2ae0770c872945d12d2e255cb474b2ab452d3 100644 (file)
--- a/kernels/volk/volk_64u_popcnt.h
+++ b/kernels/volk/volk_64u_popcnt.h
@@ -60,39 +60,38 @@
  #ifndef INCLUDED_volk_64u_popcnt_a_H
  #define INCLUDED_volk_64u_popcnt_a_H
  
-#include <stdio.h>
  #include <inttypes.h>
+#include <stdio.h>
  
  
  #ifdef LV_HAVE_GENERIC
  
  
-static inline void
-volk_64u_popcnt_generic(uint64_t* ret, const uint64_t value)
+static inline void volk_64u_popcnt_generic(uint64_t* ret, const uint64_t value)
  {
-  //const uint32_t* valueVector = (const uint32_t*)&value;
-
-  // This is faster than a lookup table
-  //uint32_t retVal = valueVector[0];
-  uint32_t retVal = (uint32_t)(value & 0x00000000FFFFFFFFull);
-
-  retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
-  retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
-  retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
-  retVal = (retVal + (retVal >> 8));
-  retVal = (retVal + (retVal >> 16)) & 0x0000003F;
-  uint64_t retVal64  = retVal;
-
-  //retVal = valueVector[1];
-  retVal = (uint32_t)((value & 0xFFFFFFFF00000000ull) >> 32);
-  retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
-  retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
-  retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
-  retVal = (retVal + (retVal >> 8));
-  retVal = (retVal + (retVal >> 16)) & 0x0000003F;
-  retVal64 += retVal;
-
-  *ret = retVal64;
+    // const uint32_t* valueVector = (const uint32_t*)&value;
+
+    // This is faster than a lookup table
+    // uint32_t retVal = valueVector[0];
+    uint32_t retVal = (uint32_t)(value & 0x00000000FFFFFFFFull);
+
+    retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
+    retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
+    retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
+    retVal = (retVal + (retVal >> 8));
+    retVal = (retVal + (retVal >> 16)) & 0x0000003F;
+    uint64_t retVal64 = retVal;
+
+    // retVal = valueVector[1];
+    retVal = (uint32_t)((value & 0xFFFFFFFF00000000ull) >> 32);
+    retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
+    retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
+    retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
+    retVal = (retVal + (retVal >> 8));
+    retVal = (retVal + (retVal >> 16)) & 0x0000003F;
+    retVal64 += retVal;
+
+    *ret = retVal64;
  }
  
  #endif /*LV_HAVE_GENERIC*/
@@ -104,7 +103,7 @@ volk_64u_popcnt_generic(uint64_t* ret, const uint64_t value)
  
  static inline void volk_64u_popcnt_a_sse4_2(uint64_t* ret, const uint64_t value)
  {
-  *ret = _mm_popcnt_u64(value);
+    *ret = _mm_popcnt_u64(value);
  }
  
  #endif /*LV_HAVE_SSE4_2*/
@@ -114,19 +113,19 @@ static inline void volk_64u_popcnt_a_sse4_2(uint64_t* ret, const uint64_t value)
  #include <arm_neon.h>
  static inline void volk_64u_popcnt_neon(uint64_t* ret, const uint64_t value)
  {
-  uint8x8_t input_val, count8x8_val;
-  uint16x4_t count16x4_val;
-  uint32x2_t count32x2_val;
-  uint64x1_t count64x1_val;
-
-  input_val = vld1_u8((unsigned char *) &value);
-  count8x8_val = vcnt_u8(input_val);
-  count16x4_val = vpaddl_u8(count8x8_val);
-  count32x2_val = vpaddl_u16(count16x4_val);
-  count64x1_val = vpaddl_u32(count32x2_val);
-  vst1_u64(ret, count64x1_val);
-
-  //*ret = _mm_popcnt_u64(value);
+    uint8x8_t input_val, count8x8_val;
+    uint16x4_t count16x4_val;
+    uint32x2_t count32x2_val;
+    uint64x1_t count64x1_val;
+
+    input_val = vld1_u8((unsigned char*)&value);
+    count8x8_val = vcnt_u8(input_val);
+    count16x4_val = vpaddl_u8(count8x8_val);
+    count32x2_val = vpaddl_u16(count16x4_val);
+    count64x1_val = vpaddl_u32(count32x2_val);
+    vst1_u64(ret, count64x1_val);
+
+    //*ret = _mm_popcnt_u64(value);
  }
  #endif /*LV_HAVE_NEON*/
  
diff --git a/kernels/volk/volk_64u_popcntpuppet_64u.h b/kernels/volk/volk_64u_popcntpuppet_64u.h

index e38ebb3204462facf48476448bf2570b5f051321..688281ad0165635e28313e428010e7dca923e84c 100644 (file)
--- a/kernels/volk/volk_64u_popcntpuppet_64u.h
+++ b/kernels/volk/volk_64u_popcntpuppet_64u.h
@@ -23,35 +23,44 @@
  #ifndef INCLUDED_volk_64u_popcntpuppet_64u_H
  #define INCLUDED_volk_64u_popcntpuppet_64u_H
  
-#include <volk/volk_64u_popcnt.h>
  #include <stdint.h>
  #include <string.h>
+#include <volk/volk_64u_popcnt.h>
  
  #ifdef LV_HAVE_GENERIC
-static inline void volk_64u_popcntpuppet_64u_generic(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points){
+static inline void volk_64u_popcntpuppet_64u_generic(uint64_t* outVector,
+                                                     const uint64_t* inVector,
+                                                     unsigned int num_points)
+{
      unsigned int ii;
-    for(ii=0; ii < num_points; ++ii) {
-        volk_64u_popcnt_generic(outVector+ii, num_points );
+    for (ii = 0; ii < num_points; ++ii) {
+        volk_64u_popcnt_generic(outVector + ii, num_points);
      }
      memcpy((void*)outVector, (void*)inVector, num_points * sizeof(uint64_t));
  }
  #endif /* LV_HAVE_GENERIC */
  
  #if LV_HAVE_SSE4_2 && LV_HAVE_64
-static inline void volk_64u_popcntpuppet_64u_a_sse4_2(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points){
+static inline void volk_64u_popcntpuppet_64u_a_sse4_2(uint64_t* outVector,
+                                                      const uint64_t* inVector,
+                                                      unsigned int num_points)
+{
      unsigned int ii;
-    for(ii=0; ii < num_points; ++ii) {
-        volk_64u_popcnt_a_sse4_2(outVector+ii, num_points );
+    for (ii = 0; ii < num_points; ++ii) {
+        volk_64u_popcnt_a_sse4_2(outVector + ii, num_points);
      }
      memcpy((void*)outVector, (void*)inVector, num_points * sizeof(uint64_t));
  }
  #endif /* LV_HAVE_SSE4_2 */
  
  #ifdef LV_HAVE_NEON
-static inline void volk_64u_popcntpuppet_64u_neon(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points){
+static inline void volk_64u_popcntpuppet_64u_neon(uint64_t* outVector,
+                                                  const uint64_t* inVector,
+                                                  unsigned int num_points)
+{
      unsigned int ii;
-    for(ii=0; ii < num_points; ++ii) {
-        volk_64u_popcnt_neon(outVector+ii, num_points );
+    for (ii = 0; ii < num_points; ++ii) {
+        volk_64u_popcnt_neon(outVector + ii, num_points);
      }
      memcpy((void*)outVector, (void*)inVector, num_points * sizeof(uint64_t));
  }
diff --git a/kernels/volk/volk_8i_convert_16i.h b/kernels/volk/volk_8i_convert_16i.h

index 40400c3c770d910131792c7ccf954f63ccd6ad7f..69d8f6ae9da90b9df9f35167dd9df640a66908bd 100644 (file)
--- a/kernels/volk/volk_8i_convert_16i.h
+++ b/kernels/volk/volk_8i_convert_16i.h
@@ -30,8 +30,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_8i_convert_16i(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points)
- * \endcode
+ * void volk_8i_convert_16i(int16_t* outputVector, const int8_t* inputVector, unsigned int
+ * num_points) \endcode
   *
   * \b Inputs
   * \li inputVector: The input vector of 8-bit chars.
@@ -59,32 +59,32 @@
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_8i_convert_16i_u_avx2(int16_t* outputVector, const int8_t* inputVector,
-                             unsigned int num_points)
+static inline void volk_8i_convert_16i_u_avx2(int16_t* outputVector,
+                                              const int8_t* inputVector,
+                                              unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  const __m128i* inputVectorPtr = (const __m128i*)inputVector;
-  __m256i* outputVectorPtr = (__m256i*)outputVector;
-  __m128i inputVal;
-  __m256i ret;
-
-  for(;number < sixteenthPoints; number++){
-    inputVal = _mm_loadu_si128(inputVectorPtr);
-    ret = _mm256_cvtepi8_epi16(inputVal);
-    ret = _mm256_slli_epi16(ret, 8); // Multiply by 256
-    _mm256_storeu_si256(outputVectorPtr, ret);
-
-    outputVectorPtr++;
-    inputVectorPtr++;
-  }
-
-  number = sixteenthPoints * 16;
-  for(; number < num_points; number++){
-    outputVector[number] = (int16_t)(inputVector[number])*256;
-  }
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    const __m128i* inputVectorPtr = (const __m128i*)inputVector;
+    __m256i* outputVectorPtr = (__m256i*)outputVector;
+    __m128i inputVal;
+    __m256i ret;
+
+    for (; number < sixteenthPoints; number++) {
+        inputVal = _mm_loadu_si128(inputVectorPtr);
+        ret = _mm256_cvtepi8_epi16(inputVal);
+        ret = _mm256_slli_epi16(ret, 8); // Multiply by 256
+        _mm256_storeu_si256(outputVectorPtr, ret);
+
+        outputVectorPtr++;
+        inputVectorPtr++;
+    }
+
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        outputVector[number] = (int16_t)(inputVector[number]) * 256;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
@@ -92,57 +92,57 @@ volk_8i_convert_16i_u_avx2(int16_t* outputVector, const int8_t* inputVector,
  #ifdef LV_HAVE_SSE4_1
  #include <smmintrin.h>
  
-static inline void
-volk_8i_convert_16i_u_sse4_1(int16_t* outputVector, const int8_t* inputVector,
-                             unsigned int num_points)
+static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector,
+                                                const int8_t* inputVector,
+                                                unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
  
-  const __m128i* inputVectorPtr = (const __m128i*)inputVector;
-  __m128i* outputVectorPtr = (__m128i*)outputVector;
-  __m128i inputVal;
-  __m128i ret;
+    const __m128i* inputVectorPtr = (const __m128i*)inputVector;
+    __m128i* outputVectorPtr = (__m128i*)outputVector;
+    __m128i inputVal;
+    __m128i ret;
  
-  for(;number < sixteenthPoints; number++){
-    inputVal = _mm_loadu_si128(inputVectorPtr);
-    ret = _mm_cvtepi8_epi16(inputVal);
-    ret = _mm_slli_epi16(ret, 8); // Multiply by 256
-    _mm_storeu_si128(outputVectorPtr, ret);
+    for (; number < sixteenthPoints; number++) {
+        inputVal = _mm_loadu_si128(inputVectorPtr);
+        ret = _mm_cvtepi8_epi16(inputVal);
+        ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+        _mm_storeu_si128(outputVectorPtr, ret);
  
-    outputVectorPtr++;
+        outputVectorPtr++;
  
-    inputVal = _mm_srli_si128(inputVal, 8);
-    ret = _mm_cvtepi8_epi16(inputVal);
-    ret = _mm_slli_epi16(ret, 8); // Multiply by 256
-    _mm_storeu_si128(outputVectorPtr, ret);
+        inputVal = _mm_srli_si128(inputVal, 8);
+        ret = _mm_cvtepi8_epi16(inputVal);
+        ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+        _mm_storeu_si128(outputVectorPtr, ret);
  
-    outputVectorPtr++;
+        outputVectorPtr++;
  
-    inputVectorPtr++;
-  }
+        inputVectorPtr++;
+    }
  
-  number = sixteenthPoints * 16;
-  for(; number < num_points; number++){
-    outputVector[number] = (int16_t)(inputVector[number])*256;
-  }
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        outputVector[number] = (int16_t)(inputVector[number]) * 256;
+    }
  }
  #endif /* LV_HAVE_SSE4_1 */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_8i_convert_16i_generic(int16_t* outputVector, const int8_t* inputVector,
-                            unsigned int num_points)
+static inline void volk_8i_convert_16i_generic(int16_t* outputVector,
+                                               const int8_t* inputVector,
+                                               unsigned int num_points)
  {
-  int16_t* outputVectorPtr = outputVector;
-  const int8_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
+    int16_t* outputVectorPtr = outputVector;
+    const int8_t* inputVectorPtr = inputVector;
+    unsigned int number = 0;
  
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
-  }
+    for (number = 0; number < num_points; number++) {
+        *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -150,7 +150,6 @@ volk_8i_convert_16i_generic(int16_t* outputVector, const int8_t* inputVector,
  #endif /* INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED8_H */
  
  
-
  #ifndef INCLUDED_volk_8i_convert_16i_a_H
  #define INCLUDED_volk_8i_convert_16i_a_H
  
@@ -160,32 +159,32 @@ volk_8i_convert_16i_generic(int16_t* outputVector, const int8_t* inputVector,
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_8i_convert_16i_a_avx2(int16_t* outputVector, const int8_t* inputVector,
-                             unsigned int num_points)
+static inline void volk_8i_convert_16i_a_avx2(int16_t* outputVector,
+                                              const int8_t* inputVector,
+                                              unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  const __m128i* inputVectorPtr = (const __m128i*)inputVector;
-  __m256i* outputVectorPtr = (__m256i*)outputVector;
-  __m128i inputVal;
-  __m256i ret;
-
-  for(;number < sixteenthPoints; number++){
-    inputVal = _mm_load_si128(inputVectorPtr);
-    ret = _mm256_cvtepi8_epi16(inputVal);
-    ret = _mm256_slli_epi16(ret, 8); // Multiply by 256
-    _mm256_store_si256(outputVectorPtr, ret);
-
-    outputVectorPtr++;
-    inputVectorPtr++;
-  }
-
-  number = sixteenthPoints * 16;
-  for(; number < num_points; number++){
-    outputVector[number] = (int16_t)(inputVector[number])*256;
-  }
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    const __m128i* inputVectorPtr = (const __m128i*)inputVector;
+    __m256i* outputVectorPtr = (__m256i*)outputVector;
+    __m128i inputVal;
+    __m256i ret;
+
+    for (; number < sixteenthPoints; number++) {
+        inputVal = _mm_load_si128(inputVectorPtr);
+        ret = _mm256_cvtepi8_epi16(inputVal);
+        ret = _mm256_slli_epi16(ret, 8); // Multiply by 256
+        _mm256_store_si256(outputVectorPtr, ret);
+
+        outputVectorPtr++;
+        inputVectorPtr++;
+    }
+
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        outputVector[number] = (int16_t)(inputVector[number]) * 256;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
@@ -193,57 +192,57 @@ volk_8i_convert_16i_a_avx2(int16_t* outputVector, const int8_t* inputVector,
  #ifdef LV_HAVE_SSE4_1
  #include <smmintrin.h>
  
-static inline void
-volk_8i_convert_16i_a_sse4_1(int16_t* outputVector, const int8_t* inputVector,
-                             unsigned int num_points)
+static inline void volk_8i_convert_16i_a_sse4_1(int16_t* outputVector,
+                                                const int8_t* inputVector,
+                                                unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
  
-  const __m128i* inputVectorPtr = (const __m128i*)inputVector;
-  __m128i* outputVectorPtr = (__m128i*)outputVector;
-  __m128i inputVal;
-  __m128i ret;
+    const __m128i* inputVectorPtr = (const __m128i*)inputVector;
+    __m128i* outputVectorPtr = (__m128i*)outputVector;
+    __m128i inputVal;
+    __m128i ret;
  
-  for(;number < sixteenthPoints; number++){
-    inputVal = _mm_load_si128(inputVectorPtr);
-    ret = _mm_cvtepi8_epi16(inputVal);
-    ret = _mm_slli_epi16(ret, 8); // Multiply by 256
-    _mm_store_si128(outputVectorPtr, ret);
+    for (; number < sixteenthPoints; number++) {
+        inputVal = _mm_load_si128(inputVectorPtr);
+        ret = _mm_cvtepi8_epi16(inputVal);
+        ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+        _mm_store_si128(outputVectorPtr, ret);
  
-    outputVectorPtr++;
+        outputVectorPtr++;
  
-    inputVal = _mm_srli_si128(inputVal, 8);
-    ret = _mm_cvtepi8_epi16(inputVal);
-    ret = _mm_slli_epi16(ret, 8); // Multiply by 256
-    _mm_store_si128(outputVectorPtr, ret);
+        inputVal = _mm_srli_si128(inputVal, 8);
+        ret = _mm_cvtepi8_epi16(inputVal);
+        ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+        _mm_store_si128(outputVectorPtr, ret);
  
-    outputVectorPtr++;
+        outputVectorPtr++;
  
-    inputVectorPtr++;
-  }
+        inputVectorPtr++;
+    }
  
-  number = sixteenthPoints * 16;
-  for(; number < num_points; number++){
-    outputVector[number] = (int16_t)(inputVector[number])*256;
-  }
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        outputVector[number] = (int16_t)(inputVector[number]) * 256;
+    }
  }
  #endif /* LV_HAVE_SSE4_1 */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_8i_convert_16i_a_generic(int16_t* outputVector, const int8_t* inputVector,
-                              unsigned int num_points)
+static inline void volk_8i_convert_16i_a_generic(int16_t* outputVector,
+                                                 const int8_t* inputVector,
+                                                 unsigned int num_points)
  {
-  int16_t* outputVectorPtr = outputVector;
-  const int8_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
+    int16_t* outputVectorPtr = outputVector;
+    const int8_t* inputVectorPtr = inputVector;
+    unsigned int number = 0;
  
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
-  }
+    for (number = 0; number < num_points; number++) {
+        *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -251,51 +250,51 @@ volk_8i_convert_16i_a_generic(int16_t* outputVector, const int8_t* inputVector,
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void
-volk_8i_convert_16i_neon(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points)
+static inline void volk_8i_convert_16i_neon(int16_t* outputVector,
+                                            const int8_t* inputVector,
+                                            unsigned int num_points)
  {
-  int16_t* outputVectorPtr = outputVector;
-  const int8_t* inputVectorPtr = inputVector;
-  unsigned int number;
-  const unsigned int eighth_points = num_points / 8;
-
-  int8x8_t input_vec ;
-  int16x8_t converted_vec;
-
-  // NEON doesn't have a concept of 8 bit registers, so we are really
-  // dealing with the low half of 16-bit registers. Since this requires
-  // a move instruction we likely do better with ASM here.
-  for(number = 0; number < eighth_points; ++number) {
-    input_vec = vld1_s8(inputVectorPtr);
-    converted_vec = vmovl_s8(input_vec);
-    //converted_vec = vmulq_s16(converted_vec, scale_factor);
-    converted_vec = vshlq_n_s16(converted_vec, 8);
-    vst1q_s16( outputVectorPtr, converted_vec);
-
-    inputVectorPtr += 8;
-    outputVectorPtr += 8;
-  }
-
-  for(number = eighth_points * 8; number < num_points; number++){
-    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
-  }
+    int16_t* outputVectorPtr = outputVector;
+    const int8_t* inputVectorPtr = inputVector;
+    unsigned int number;
+    const unsigned int eighth_points = num_points / 8;
+
+    int8x8_t input_vec;
+    int16x8_t converted_vec;
+
+    // NEON doesn't have a concept of 8 bit registers, so we are really
+    // dealing with the low half of 16-bit registers. Since this requires
+    // a move instruction we likely do better with ASM here.
+    for (number = 0; number < eighth_points; ++number) {
+        input_vec = vld1_s8(inputVectorPtr);
+        converted_vec = vmovl_s8(input_vec);
+        // converted_vec = vmulq_s16(converted_vec, scale_factor);
+        converted_vec = vshlq_n_s16(converted_vec, 8);
+        vst1q_s16(outputVectorPtr, converted_vec);
+
+        inputVectorPtr += 8;
+        outputVectorPtr += 8;
+    }
+
+    for (number = eighth_points * 8; number < num_points; number++) {
+        *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
+    }
  }
  #endif /* LV_HAVE_NEON */
  
  
  #ifdef LV_HAVE_ORC
-extern void
-volk_8i_convert_16i_a_orc_impl(int16_t* outputVector, const int8_t* inputVector,
-                               unsigned int num_points);
+extern void volk_8i_convert_16i_a_orc_impl(int16_t* outputVector,
+                                           const int8_t* inputVector,
+                                           unsigned int num_points);
  
-static inline void
-volk_8i_convert_16i_u_orc(int16_t* outputVector, const int8_t* inputVector,
-                          unsigned int num_points)
+static inline void volk_8i_convert_16i_u_orc(int16_t* outputVector,
+                                             const int8_t* inputVector,
+                                             unsigned int num_points)
  {
-  volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, num_points);
+    volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, num_points);
  }
  #endif /* LV_HAVE_ORC */
  
  
-
  #endif /* INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED8_H */
diff --git a/kernels/volk/volk_8i_s32f_convert_32f.h b/kernels/volk/volk_8i_s32f_convert_32f.h

index 97d160bd73a0f236184645cea6f6a7c17f53517c..c3d56660cb6e7955c58ad9b9874927ff2884250f 100644 (file)
--- a/kernels/volk/volk_8i_s32f_convert_32f.h
+++ b/kernels/volk/volk_8i_s32f_convert_32f.h
@@ -30,8 +30,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_8i_s32f_convert_32f(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points)
- * \endcode
+ * void volk_8i_s32f_convert_32f(float* outputVector, const int8_t* inputVector, const
+ * float scalar, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li inputVector: The input vector of 8-bit chars.
@@ -60,44 +60,45 @@
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_8i_s32f_convert_32f_u_avx2(float* outputVector, const int8_t* inputVector,
-                                  const float scalar, unsigned int num_points)
+static inline void volk_8i_s32f_convert_32f_u_avx2(float* outputVector,
+                                                   const int8_t* inputVector,
+                                                   const float scalar,
+                                                   unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float* outputVectorPtr = outputVector;
-  const float iScalar = 1.0 / scalar;
-  __m256 invScalar = _mm256_set1_ps( iScalar );
-  const int8_t* inputVectorPtr = inputVector;
-  __m256 ret;
-  __m128i inputVal128;
-  __m256i interimVal;
-
-  for(;number < sixteenthPoints; number++){
-    inputVal128 = _mm_loadu_si128((__m128i*)inputVectorPtr);
-
-    interimVal = _mm256_cvtepi8_epi32(inputVal128);
-    ret = _mm256_cvtepi32_ps(interimVal);
-    ret = _mm256_mul_ps(ret, invScalar);
-    _mm256_storeu_ps(outputVectorPtr, ret);
-    outputVectorPtr += 8;
-
-    inputVal128 = _mm_srli_si128(inputVal128, 8);
-    interimVal = _mm256_cvtepi8_epi32(inputVal128);
-    ret = _mm256_cvtepi32_ps(interimVal);
-    ret = _mm256_mul_ps(ret, invScalar);
-    _mm256_storeu_ps(outputVectorPtr, ret);
-    outputVectorPtr += 8;
-
-    inputVectorPtr += 16;
-  }
-
-  number = sixteenthPoints * 16;
-  for(; number < num_points; number++){
-    outputVector[number] = (float)(inputVector[number]) * iScalar;
-  }
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    float* outputVectorPtr = outputVector;
+    const float iScalar = 1.0 / scalar;
+    __m256 invScalar = _mm256_set1_ps(iScalar);
+    const int8_t* inputVectorPtr = inputVector;
+    __m256 ret;
+    __m128i inputVal128;
+    __m256i interimVal;
+
+    for (; number < sixteenthPoints; number++) {
+        inputVal128 = _mm_loadu_si128((__m128i*)inputVectorPtr);
+
+        interimVal = _mm256_cvtepi8_epi32(inputVal128);
+        ret = _mm256_cvtepi32_ps(interimVal);
+        ret = _mm256_mul_ps(ret, invScalar);
+        _mm256_storeu_ps(outputVectorPtr, ret);
+        outputVectorPtr += 8;
+
+        inputVal128 = _mm_srli_si128(inputVal128, 8);
+        interimVal = _mm256_cvtepi8_epi32(inputVal128);
+        ret = _mm256_cvtepi32_ps(interimVal);
+        ret = _mm256_mul_ps(ret, invScalar);
+        _mm256_storeu_ps(outputVectorPtr, ret);
+        outputVectorPtr += 8;
+
+        inputVectorPtr += 16;
+    }
+
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        outputVector[number] = (float)(inputVector[number]) * iScalar;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
@@ -105,80 +106,81 @@ volk_8i_s32f_convert_32f_u_avx2(float* outputVector, const int8_t* inputVector,
  #ifdef LV_HAVE_SSE4_1
  #include <smmintrin.h>
  
-static inline void
-volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector, const int8_t* inputVector,
-                                  const float scalar, unsigned int num_points)
+static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector,
+                                                     const int8_t* inputVector,
+                                                     const float scalar,
+                                                     unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float* outputVectorPtr = outputVector;
-  const float iScalar = 1.0 / scalar;
-  __m128 invScalar = _mm_set_ps1( iScalar );
-  const int8_t* inputVectorPtr = inputVector;
-  __m128 ret;
-  __m128i inputVal;
-  __m128i interimVal;
-
-  for(;number < sixteenthPoints; number++){
-    inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr);
-
-    interimVal = _mm_cvtepi8_epi32(inputVal);
-    ret = _mm_cvtepi32_ps(interimVal);
-    ret = _mm_mul_ps(ret, invScalar);
-    _mm_storeu_ps(outputVectorPtr, ret);
-    outputVectorPtr += 4;
-
-    inputVal = _mm_srli_si128(inputVal, 4);
-    interimVal = _mm_cvtepi8_epi32(inputVal);
-    ret = _mm_cvtepi32_ps(interimVal);
-    ret = _mm_mul_ps(ret, invScalar);
-    _mm_storeu_ps(outputVectorPtr, ret);
-    outputVectorPtr += 4;
-
-    inputVal = _mm_srli_si128(inputVal, 4);
-    interimVal = _mm_cvtepi8_epi32(inputVal);
-    ret = _mm_cvtepi32_ps(interimVal);
-    ret = _mm_mul_ps(ret, invScalar);
-    _mm_storeu_ps(outputVectorPtr, ret);
-    outputVectorPtr += 4;
-
-    inputVal = _mm_srli_si128(inputVal, 4);
-    interimVal = _mm_cvtepi8_epi32(inputVal);
-    ret = _mm_cvtepi32_ps(interimVal);
-    ret = _mm_mul_ps(ret, invScalar);
-    _mm_storeu_ps(outputVectorPtr, ret);
-    outputVectorPtr += 4;
-
-    inputVectorPtr += 16;
-  }
-
-  number = sixteenthPoints * 16;
-  for(; number < num_points; number++){
-    outputVector[number] = (float)(inputVector[number]) * iScalar;
-  }
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    float* outputVectorPtr = outputVector;
+    const float iScalar = 1.0 / scalar;
+    __m128 invScalar = _mm_set_ps1(iScalar);
+    const int8_t* inputVectorPtr = inputVector;
+    __m128 ret;
+    __m128i inputVal;
+    __m128i interimVal;
+
+    for (; number < sixteenthPoints; number++) {
+        inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr);
+
+        interimVal = _mm_cvtepi8_epi32(inputVal);
+        ret = _mm_cvtepi32_ps(interimVal);
+        ret = _mm_mul_ps(ret, invScalar);
+        _mm_storeu_ps(outputVectorPtr, ret);
+        outputVectorPtr += 4;
+
+        inputVal = _mm_srli_si128(inputVal, 4);
+        interimVal = _mm_cvtepi8_epi32(inputVal);
+        ret = _mm_cvtepi32_ps(interimVal);
+        ret = _mm_mul_ps(ret, invScalar);
+        _mm_storeu_ps(outputVectorPtr, ret);
+        outputVectorPtr += 4;
+
+        inputVal = _mm_srli_si128(inputVal, 4);
+        interimVal = _mm_cvtepi8_epi32(inputVal);
+        ret = _mm_cvtepi32_ps(interimVal);
+        ret = _mm_mul_ps(ret, invScalar);
+        _mm_storeu_ps(outputVectorPtr, ret);
+        outputVectorPtr += 4;
+
+        inputVal = _mm_srli_si128(inputVal, 4);
+        interimVal = _mm_cvtepi8_epi32(inputVal);
+        ret = _mm_cvtepi32_ps(interimVal);
+        ret = _mm_mul_ps(ret, invScalar);
+        _mm_storeu_ps(outputVectorPtr, ret);
+        outputVectorPtr += 4;
+
+        inputVectorPtr += 16;
+    }
+
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        outputVector[number] = (float)(inputVector[number]) * iScalar;
+    }
  }
  #endif /* LV_HAVE_SSE4_1 */
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_8i_s32f_convert_32f_generic(float* outputVector, const int8_t* inputVector,
-                                 const float scalar, unsigned int num_points)
+static inline void volk_8i_s32f_convert_32f_generic(float* outputVector,
+                                                    const int8_t* inputVector,
+                                                    const float scalar,
+                                                    unsigned int num_points)
  {
-  float* outputVectorPtr = outputVector;
-  const int8_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-  const float iScalar = 1.0 / scalar;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
-  }
+    float* outputVectorPtr = outputVector;
+    const int8_t* inputVectorPtr = inputVector;
+    unsigned int number = 0;
+    const float iScalar = 1.0 / scalar;
+
+    for (number = 0; number < num_points; number++) {
+        *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  
-
  #endif /* INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED8_H */
  
  #ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H
@@ -190,195 +192,199 @@ volk_8i_s32f_convert_32f_generic(float* outputVector, const int8_t* inputVector,
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_8i_s32f_convert_32f_a_avx2(float* outputVector, const int8_t* inputVector,
-                                  const float scalar, unsigned int num_points)
+static inline void volk_8i_s32f_convert_32f_a_avx2(float* outputVector,
+                                                   const int8_t* inputVector,
+                                                   const float scalar,
+                                                   unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float* outputVectorPtr = outputVector;
-  const float iScalar = 1.0 / scalar;
-  __m256 invScalar = _mm256_set1_ps( iScalar );
-  const int8_t* inputVectorPtr = inputVector;
-  __m256 ret;
-  __m128i inputVal128;
-  __m256i interimVal;
-
-  for(;number < sixteenthPoints; number++){
-    inputVal128 = _mm_load_si128((__m128i*)inputVectorPtr);
-
-    interimVal = _mm256_cvtepi8_epi32(inputVal128);
-    ret = _mm256_cvtepi32_ps(interimVal);
-    ret = _mm256_mul_ps(ret, invScalar);
-    _mm256_store_ps(outputVectorPtr, ret);
-    outputVectorPtr += 8;
-
-    inputVal128 = _mm_srli_si128(inputVal128, 8);
-    interimVal = _mm256_cvtepi8_epi32(inputVal128);
-    ret = _mm256_cvtepi32_ps(interimVal);
-    ret = _mm256_mul_ps(ret, invScalar);
-    _mm256_store_ps(outputVectorPtr, ret);
-    outputVectorPtr += 8;
-
-    inputVectorPtr += 16;
-  }
-
-  number = sixteenthPoints * 16;
-  for(; number < num_points; number++){
-    outputVector[number] = (float)(inputVector[number]) * iScalar;
-  }
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    float* outputVectorPtr = outputVector;
+    const float iScalar = 1.0 / scalar;
+    __m256 invScalar = _mm256_set1_ps(iScalar);
+    const int8_t* inputVectorPtr = inputVector;
+    __m256 ret;
+    __m128i inputVal128;
+    __m256i interimVal;
+
+    for (; number < sixteenthPoints; number++) {
+        inputVal128 = _mm_load_si128((__m128i*)inputVectorPtr);
+
+        interimVal = _mm256_cvtepi8_epi32(inputVal128);
+        ret = _mm256_cvtepi32_ps(interimVal);
+        ret = _mm256_mul_ps(ret, invScalar);
+        _mm256_store_ps(outputVectorPtr, ret);
+        outputVectorPtr += 8;
+
+        inputVal128 = _mm_srli_si128(inputVal128, 8);
+        interimVal = _mm256_cvtepi8_epi32(inputVal128);
+        ret = _mm256_cvtepi32_ps(interimVal);
+        ret = _mm256_mul_ps(ret, invScalar);
+        _mm256_store_ps(outputVectorPtr, ret);
+        outputVectorPtr += 8;
+
+        inputVectorPtr += 16;
+    }
+
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        outputVector[number] = (float)(inputVector[number]) * iScalar;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
  #ifdef LV_HAVE_SSE4_1
  #include <smmintrin.h>
  
-static inline void
-volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector, const int8_t* inputVector,
-                                  const float scalar, unsigned int num_points)
+static inline void volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector,
+                                                     const int8_t* inputVector,
+                                                     const float scalar,
+                                                     unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float* outputVectorPtr = outputVector;
-  const float iScalar = 1.0 / scalar;
-  __m128 invScalar = _mm_set_ps1(iScalar);
-  const int8_t* inputVectorPtr = inputVector;
-  __m128 ret;
-  __m128i inputVal;
-  __m128i interimVal;
-
-  for(;number < sixteenthPoints; number++){
-    inputVal = _mm_load_si128((__m128i*)inputVectorPtr);
-
-    interimVal = _mm_cvtepi8_epi32(inputVal);
-    ret = _mm_cvtepi32_ps(interimVal);
-    ret = _mm_mul_ps(ret, invScalar);
-    _mm_store_ps(outputVectorPtr, ret);
-    outputVectorPtr += 4;
-
-    inputVal = _mm_srli_si128(inputVal, 4);
-    interimVal = _mm_cvtepi8_epi32(inputVal);
-    ret = _mm_cvtepi32_ps(interimVal);
-    ret = _mm_mul_ps(ret, invScalar);
-    _mm_store_ps(outputVectorPtr, ret);
-    outputVectorPtr += 4;
-
-    inputVal = _mm_srli_si128(inputVal, 4);
-    interimVal = _mm_cvtepi8_epi32(inputVal);
-    ret = _mm_cvtepi32_ps(interimVal);
-    ret = _mm_mul_ps(ret, invScalar);
-    _mm_store_ps(outputVectorPtr, ret);
-    outputVectorPtr += 4;
-
-    inputVal = _mm_srli_si128(inputVal, 4);
-    interimVal = _mm_cvtepi8_epi32(inputVal);
-    ret = _mm_cvtepi32_ps(interimVal);
-    ret = _mm_mul_ps(ret, invScalar);
-    _mm_store_ps(outputVectorPtr, ret);
-    outputVectorPtr += 4;
-
-    inputVectorPtr += 16;
-  }
-
-  number = sixteenthPoints * 16;
-  for(; number < num_points; number++){
-    outputVector[number] = (float)(inputVector[number]) * iScalar;
-  }
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    float* outputVectorPtr = outputVector;
+    const float iScalar = 1.0 / scalar;
+    __m128 invScalar = _mm_set_ps1(iScalar);
+    const int8_t* inputVectorPtr = inputVector;
+    __m128 ret;
+    __m128i inputVal;
+    __m128i interimVal;
+
+    for (; number < sixteenthPoints; number++) {
+        inputVal = _mm_load_si128((__m128i*)inputVectorPtr);
+
+        interimVal = _mm_cvtepi8_epi32(inputVal);
+        ret = _mm_cvtepi32_ps(interimVal);
+        ret = _mm_mul_ps(ret, invScalar);
+        _mm_store_ps(outputVectorPtr, ret);
+        outputVectorPtr += 4;
+
+        inputVal = _mm_srli_si128(inputVal, 4);
+        interimVal = _mm_cvtepi8_epi32(inputVal);
+        ret = _mm_cvtepi32_ps(interimVal);
+        ret = _mm_mul_ps(ret, invScalar);
+        _mm_store_ps(outputVectorPtr, ret);
+        outputVectorPtr += 4;
+
+        inputVal = _mm_srli_si128(inputVal, 4);
+        interimVal = _mm_cvtepi8_epi32(inputVal);
+        ret = _mm_cvtepi32_ps(interimVal);
+        ret = _mm_mul_ps(ret, invScalar);
+        _mm_store_ps(outputVectorPtr, ret);
+        outputVectorPtr += 4;
+
+        inputVal = _mm_srli_si128(inputVal, 4);
+        interimVal = _mm_cvtepi8_epi32(inputVal);
+        ret = _mm_cvtepi32_ps(interimVal);
+        ret = _mm_mul_ps(ret, invScalar);
+        _mm_store_ps(outputVectorPtr, ret);
+        outputVectorPtr += 4;
+
+        inputVectorPtr += 16;
+    }
+
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        outputVector[number] = (float)(inputVector[number]) * iScalar;
+    }
  }
  #endif /* LV_HAVE_SSE4_1 */
  
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void
-volk_8i_s32f_convert_32f_neon(float* outputVector, const int8_t* inputVector,
-                              const float scalar, unsigned int num_points)
+static inline void volk_8i_s32f_convert_32f_neon(float* outputVector,
+                                                 const int8_t* inputVector,
+                                                 const float scalar,
+                                                 unsigned int num_points)
  {
-  float* outputVectorPtr = outputVector;
-  const int8_t* inputVectorPtr = inputVector;
-
-  const float iScalar = 1.0 / scalar;
-  const float32x4_t qiScalar = vdupq_n_f32(iScalar);
-
-  int8x8x2_t inputVal;
-  float32x4x2_t outputFloat;
-  int16x8_t tmp;
-  
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-  for(;number < sixteenthPoints; number++){
-      __VOLK_PREFETCH(inputVectorPtr+16);
-
-         inputVal = vld2_s8(inputVectorPtr);
-         inputVal = vzip_s8(inputVal.val[0], inputVal.val[1]);
-         inputVectorPtr += 16;
-
-      tmp = vmovl_s8(inputVal.val[0]);
-
-      outputFloat.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp)));
-      outputFloat.val[0] = vmulq_f32(outputFloat.val[0], qiScalar);
-      vst1q_f32(outputVectorPtr, outputFloat.val[0]);
-      outputVectorPtr += 4;
-
-      outputFloat.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp)));
-      outputFloat.val[1] = vmulq_f32(outputFloat.val[1], qiScalar);
-      vst1q_f32(outputVectorPtr, outputFloat.val[1]);
-      outputVectorPtr += 4;
-
-      tmp = vmovl_s8(inputVal.val[1]);
-
-      outputFloat.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp)));
-      outputFloat.val[0] = vmulq_f32(outputFloat.val[0], qiScalar);
-      vst1q_f32(outputVectorPtr, outputFloat.val[0]);
-      outputVectorPtr += 4;
-
-      outputFloat.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp)));
-      outputFloat.val[1] = vmulq_f32(outputFloat.val[1], qiScalar);
-      vst1q_f32(outputVectorPtr, outputFloat.val[1]);
-      outputVectorPtr += 4;
-  }
-  for(number = sixteenthPoints * 16; number < num_points; number++){
-      *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
-  }
+    float* outputVectorPtr = outputVector;
+    const int8_t* inputVectorPtr = inputVector;
+
+    const float iScalar = 1.0 / scalar;
+    const float32x4_t qiScalar = vdupq_n_f32(iScalar);
+
+    int8x8x2_t inputVal;
+    float32x4x2_t outputFloat;
+    int16x8_t tmp;
+
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+    for (; number < sixteenthPoints; number++) {
+        __VOLK_PREFETCH(inputVectorPtr + 16);
+
+        inputVal = vld2_s8(inputVectorPtr);
+        inputVal = vzip_s8(inputVal.val[0], inputVal.val[1]);
+        inputVectorPtr += 16;
+
+        tmp = vmovl_s8(inputVal.val[0]);
+
+        outputFloat.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp)));
+        outputFloat.val[0] = vmulq_f32(outputFloat.val[0], qiScalar);
+        vst1q_f32(outputVectorPtr, outputFloat.val[0]);
+        outputVectorPtr += 4;
+
+        outputFloat.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp)));
+        outputFloat.val[1] = vmulq_f32(outputFloat.val[1], qiScalar);
+        vst1q_f32(outputVectorPtr, outputFloat.val[1]);
+        outputVectorPtr += 4;
+
+        tmp = vmovl_s8(inputVal.val[1]);
+
+        outputFloat.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp)));
+        outputFloat.val[0] = vmulq_f32(outputFloat.val[0], qiScalar);
+        vst1q_f32(outputVectorPtr, outputFloat.val[0]);
+        outputVectorPtr += 4;
+
+        outputFloat.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp)));
+        outputFloat.val[1] = vmulq_f32(outputFloat.val[1], qiScalar);
+        vst1q_f32(outputVectorPtr, outputFloat.val[1]);
+        outputVectorPtr += 4;
+    }
+    for (number = sixteenthPoints * 16; number < num_points; number++) {
+        *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+    }
  }
  
  #endif /* LV_HAVE_NEON */
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_8i_s32f_convert_32f_a_generic(float* outputVector, const int8_t* inputVector,
-                                   const float scalar, unsigned int num_points)
+static inline void volk_8i_s32f_convert_32f_a_generic(float* outputVector,
+                                                      const int8_t* inputVector,
+                                                      const float scalar,
+                                                      unsigned int num_points)
  {
-  float* outputVectorPtr = outputVector;
-  const int8_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-  const float iScalar = 1.0 / scalar;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
-  }
+    float* outputVectorPtr = outputVector;
+    const int8_t* inputVectorPtr = inputVector;
+    unsigned int number = 0;
+    const float iScalar = 1.0 / scalar;
+
+    for (number = 0; number < num_points; number++) {
+        *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  
  #ifdef LV_HAVE_ORC
-extern void
-volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector, const int8_t* inputVector,
-                                    const float scalar, unsigned int num_points);
-
-static inline void
-volk_8i_s32f_convert_32f_u_orc(float* outputVector, const int8_t* inputVector,
-                               const float scalar, unsigned int num_points)
+extern void volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector,
+                                                const int8_t* inputVector,
+                                                const float scalar,
+                                                unsigned int num_points);
+
+static inline void volk_8i_s32f_convert_32f_u_orc(float* outputVector,
+                                                  const int8_t* inputVector,
+                                                  const float scalar,
+                                                  unsigned int num_points)
  {
-  float invscalar = 1.0 / scalar;
-  volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, invscalar, num_points);
+    float invscalar = 1.0 / scalar;
+    volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, invscalar, num_points);
  }
  #endif /* LV_HAVE_ORC */
  
  
-
  #endif /* INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED8_H */
-
diff --git a/kernels/volk/volk_8ic_deinterleave_16i_x2.h b/kernels/volk/volk_8ic_deinterleave_16i_x2.h

index b4cf2510b69d58a02a7578a08d4ebf991a53eb9a..fa998a0eb1cb38ca82bd541a43f22ad73058084f 100644 (file)
--- a/kernels/volk/volk_8ic_deinterleave_16i_x2.h
+++ b/kernels/volk/volk_8ic_deinterleave_16i_x2.h
@@ -30,8 +30,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_8ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points)
- * \endcode
+ * void volk_8ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t*
+ * complexVector, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li complexVector: The complex input vector.
@@ -60,91 +60,150 @@
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_8ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer, int16_t* qBuffer,
-                                   const lv_8sc_t* complexVector, unsigned int num_points)
+static inline void volk_8ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer,
+                                                       int16_t* qBuffer,
+                                                       const lv_8sc_t* complexVector,
+                                                       unsigned int num_points)
  {
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  int16_t* qBufferPtr = qBuffer;
-  __m256i MoveMask = _mm256_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);  
-  __m256i complexVal, iOutputVal, qOutputVal;
-  __m128i iOutputVal0, qOutputVal0;
-
-  unsigned int sixteenthPoints = num_points / 16;
-
-  for(number = 0; number < sixteenthPoints; number++){
-    complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);  complexVectorPtr += 32;    
-
-    complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
-    complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
-
-    iOutputVal0 = _mm256_extracti128_si256(complexVal, 0);
-    qOutputVal0 = _mm256_extracti128_si256(complexVal, 1);
-
-    iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0);
-    iOutputVal = _mm256_slli_epi16(iOutputVal, 8);
-
-    qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0);
-    qOutputVal = _mm256_slli_epi16(qOutputVal, 8);
-
-    _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);   
-    _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
-
-    iBufferPtr += 16;
-    qBufferPtr += 16;
-  }
-
-  number = sixteenthPoints * 16;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;   // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
-    *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
-  }
+    unsigned int number = 0;
+    const int8_t* complexVectorPtr = (int8_t*)complexVector;
+    int16_t* iBufferPtr = iBuffer;
+    int16_t* qBufferPtr = qBuffer;
+    __m256i MoveMask = _mm256_set_epi8(15,
+                                       13,
+                                       11,
+                                       9,
+                                       7,
+                                       5,
+                                       3,
+                                       1,
+                                       14,
+                                       12,
+                                       10,
+                                       8,
+                                       6,
+                                       4,
+                                       2,
+                                       0,
+                                       15,
+                                       13,
+                                       11,
+                                       9,
+                                       7,
+                                       5,
+                                       3,
+                                       1,
+                                       14,
+                                       12,
+                                       10,
+                                       8,
+                                       6,
+                                       4,
+                                       2,
+                                       0);
+    __m256i complexVal, iOutputVal, qOutputVal;
+    __m128i iOutputVal0, qOutputVal0;
+
+    unsigned int sixteenthPoints = num_points / 16;
+
+    for (number = 0; number < sixteenthPoints; number++) {
+        complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 32;
+
+        complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
+        complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
+
+        iOutputVal0 = _mm256_extracti128_si256(complexVal, 0);
+        qOutputVal0 = _mm256_extracti128_si256(complexVal, 1);
+
+        iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0);
+        iOutputVal = _mm256_slli_epi16(iOutputVal, 8);
+
+        qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0);
+        qOutputVal = _mm256_slli_epi16(qOutputVal, 8);
+
+        _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
+        _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
+
+        iBufferPtr += 16;
+        qBufferPtr += 16;
+    }
+
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ =
+            ((int16_t)*complexVectorPtr++) *
+            256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
+        *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
  #ifdef LV_HAVE_SSE4_1
  #include <smmintrin.h>
  
-static inline void
-volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, int16_t* qBuffer,
-                                      const lv_8sc_t* complexVector, unsigned int num_points)
+static inline void volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer,
+                                                         int16_t* qBuffer,
+                                                         const lv_8sc_t* complexVector,
+                                                         unsigned int num_points)
  {
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  int16_t* qBufferPtr = qBuffer;
-  __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);  // set 16 byte values
-  __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
-  __m128i complexVal, iOutputVal, qOutputVal;
-
-  unsigned int eighthPoints = num_points / 8;
-
-  for(number = 0; number < eighthPoints; number++){
-    complexVal = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;   // aligned load
-
-    iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask);   // shuffle 16 bytes of 128bit complexVal
-    qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask);
-
-    iOutputVal = _mm_cvtepi8_epi16(iOutputVal);     // fills 2-byte sign extended versions of lower 8 bytes of input to output
-    iOutputVal = _mm_slli_epi16(iOutputVal, 8);     // shift in left by 8 bits, each of the 8 16-bit integers, shift in with zeros
-
-    qOutputVal = _mm_cvtepi8_epi16(qOutputVal);
-    qOutputVal = _mm_slli_epi16(qOutputVal, 8);
-
-    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);  // aligned store
-    _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
-
-    iBufferPtr += 8;
-    qBufferPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;   // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
-    *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
-  }
+    unsigned int number = 0;
+    const int8_t* complexVectorPtr = (int8_t*)complexVector;
+    int16_t* iBufferPtr = iBuffer;
+    int16_t* qBufferPtr = qBuffer;
+    __m128i iMoveMask = _mm_set_epi8(0x80,
+                                     0x80,
+                                     0x80,
+                                     0x80,
+                                     0x80,
+                                     0x80,
+                                     0x80,
+                                     0x80,
+                                     14,
+                                     12,
+                                     10,
+                                     8,
+                                     6,
+                                     4,
+                                     2,
+                                     0); // set 16 byte values
+    __m128i qMoveMask = _mm_set_epi8(
+        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+    __m128i complexVal, iOutputVal, qOutputVal;
+
+    unsigned int eighthPoints = num_points / 8;
+
+    for (number = 0; number < eighthPoints; number++) {
+        complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
+        complexVectorPtr += 16; // aligned load
+
+        iOutputVal = _mm_shuffle_epi8(complexVal,
+                                      iMoveMask); // shuffle 16 bytes of 128bit complexVal
+        qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask);
+
+        iOutputVal = _mm_cvtepi8_epi16(iOutputVal); // fills 2-byte sign extended versions
+                                                    // of lower 8 bytes of input to output
+        iOutputVal =
+            _mm_slli_epi16(iOutputVal, 8); // shift in left by 8 bits, each of the 8
+                                           // 16-bit integers, shift in with zeros
+
+        qOutputVal = _mm_cvtepi8_epi16(qOutputVal);
+        qOutputVal = _mm_slli_epi16(qOutputVal, 8);
+
+        _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); // aligned store
+        _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+
+        iBufferPtr += 8;
+        qBufferPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ =
+            ((int16_t)*complexVectorPtr++) *
+            256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
+        *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
+    }
  }
  #endif /* LV_HAVE_SSE4_1 */
  
@@ -152,86 +211,111 @@ volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, int16_t* qBuffer,
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer, int16_t* qBuffer,
-                                   const lv_8sc_t* complexVector, unsigned int num_points)
+static inline void volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer,
+                                                      int16_t* qBuffer,
+                                                      const lv_8sc_t* complexVector,
+                                                      unsigned int num_points)
  {
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  int16_t* qBufferPtr = qBuffer;
-  __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);  // set 16 byte values
-  __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
-  __m256i complexVal, iOutputVal, qOutputVal;
-  __m128i complexVal1, complexVal0;
-  __m128i iOutputVal1, iOutputVal0, qOutputVal1, qOutputVal0;
-
-  unsigned int sixteenthPoints = num_points / 16;
-
-  for(number = 0; number < sixteenthPoints; number++){
-    complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);  complexVectorPtr += 32;    // aligned load
-
-    // Extract from complexVal to iOutputVal and qOutputVal
-    complexVal1 = _mm256_extractf128_si256(complexVal, 1);
-    complexVal0 = _mm256_extractf128_si256(complexVal, 0);
-
-    iOutputVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask);     // shuffle 16 bytes of 128bit complexVal
-    iOutputVal0 = _mm_shuffle_epi8(complexVal0, iMoveMask);
-    qOutputVal1 = _mm_shuffle_epi8(complexVal1, qMoveMask);
-    qOutputVal0 = _mm_shuffle_epi8(complexVal0, qMoveMask);
-
-    iOutputVal1 = _mm_cvtepi8_epi16(iOutputVal1);   // fills 2-byte sign extended versions of lower 8 bytes of input to output
-    iOutputVal1 = _mm_slli_epi16(iOutputVal1, 8);   // shift in left by 8 bits, each of the 8 16-bit integers, shift in with zeros
-    iOutputVal0 = _mm_cvtepi8_epi16(iOutputVal0);
-    iOutputVal0 = _mm_slli_epi16(iOutputVal0, 8);
-
-    qOutputVal1 = _mm_cvtepi8_epi16(qOutputVal1);
-    qOutputVal1 = _mm_slli_epi16(qOutputVal1, 8);
-    qOutputVal0 = _mm_cvtepi8_epi16(qOutputVal0);
-    qOutputVal0 = _mm_slli_epi16(qOutputVal0, 8);
-
-    // Pack iOutputVal0,1 to iOutputVal
-    __m256i dummy = _mm256_setzero_si256();
-    iOutputVal = _mm256_insertf128_si256(dummy, iOutputVal0, 0);
-    iOutputVal = _mm256_insertf128_si256(iOutputVal, iOutputVal1, 1);
-    qOutputVal = _mm256_insertf128_si256(dummy, qOutputVal0, 0);
-    qOutputVal = _mm256_insertf128_si256(qOutputVal, qOutputVal1, 1);
-
-    _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);   // aligned store
-    _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
-
-    iBufferPtr += 16;
-    qBufferPtr += 16;
-  }
-
-  number = sixteenthPoints * 16;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;   // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
-    *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
-  }
+    unsigned int number = 0;
+    const int8_t* complexVectorPtr = (int8_t*)complexVector;
+    int16_t* iBufferPtr = iBuffer;
+    int16_t* qBufferPtr = qBuffer;
+    __m128i iMoveMask = _mm_set_epi8(0x80,
+                                     0x80,
+                                     0x80,
+                                     0x80,
+                                     0x80,
+                                     0x80,
+                                     0x80,
+                                     0x80,
+                                     14,
+                                     12,
+                                     10,
+                                     8,
+                                     6,
+                                     4,
+                                     2,
+                                     0); // set 16 byte values
+    __m128i qMoveMask = _mm_set_epi8(
+        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+    __m256i complexVal, iOutputVal, qOutputVal;
+    __m128i complexVal1, complexVal0;
+    __m128i iOutputVal1, iOutputVal0, qOutputVal1, qOutputVal0;
+
+    unsigned int sixteenthPoints = num_points / 16;
+
+    for (number = 0; number < sixteenthPoints; number++) {
+        complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 32; // aligned load
+
+        // Extract from complexVal to iOutputVal and qOutputVal
+        complexVal1 = _mm256_extractf128_si256(complexVal, 1);
+        complexVal0 = _mm256_extractf128_si256(complexVal, 0);
+
+        iOutputVal1 = _mm_shuffle_epi8(
+            complexVal1, iMoveMask); // shuffle 16 bytes of 128bit complexVal
+        iOutputVal0 = _mm_shuffle_epi8(complexVal0, iMoveMask);
+        qOutputVal1 = _mm_shuffle_epi8(complexVal1, qMoveMask);
+        qOutputVal0 = _mm_shuffle_epi8(complexVal0, qMoveMask);
+
+        iOutputVal1 =
+            _mm_cvtepi8_epi16(iOutputVal1); // fills 2-byte sign extended versions of
+                                            // lower 8 bytes of input to output
+        iOutputVal1 =
+            _mm_slli_epi16(iOutputVal1, 8); // shift in left by 8 bits, each of the 8
+                                            // 16-bit integers, shift in with zeros
+        iOutputVal0 = _mm_cvtepi8_epi16(iOutputVal0);
+        iOutputVal0 = _mm_slli_epi16(iOutputVal0, 8);
+
+        qOutputVal1 = _mm_cvtepi8_epi16(qOutputVal1);
+        qOutputVal1 = _mm_slli_epi16(qOutputVal1, 8);
+        qOutputVal0 = _mm_cvtepi8_epi16(qOutputVal0);
+        qOutputVal0 = _mm_slli_epi16(qOutputVal0, 8);
+
+        // Pack iOutputVal0,1 to iOutputVal
+        __m256i dummy = _mm256_setzero_si256();
+        iOutputVal = _mm256_insertf128_si256(dummy, iOutputVal0, 0);
+        iOutputVal = _mm256_insertf128_si256(iOutputVal, iOutputVal1, 1);
+        qOutputVal = _mm256_insertf128_si256(dummy, qOutputVal0, 0);
+        qOutputVal = _mm256_insertf128_si256(qOutputVal, qOutputVal1, 1);
+
+        _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); // aligned store
+        _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
+
+        iBufferPtr += 16;
+        qBufferPtr += 16;
+    }
+
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ =
+            ((int16_t)*complexVectorPtr++) *
+            256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
+        *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
+    }
  }
  #endif /* LV_HAVE_AVX */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer, int16_t* qBuffer,
-                                     const lv_8sc_t* complexVector, unsigned int num_points)
+static inline void volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer,
+                                                        int16_t* qBuffer,
+                                                        const lv_8sc_t* complexVector,
+                                                        unsigned int num_points)
  {
-  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  int16_t* qBufferPtr = qBuffer;
-  unsigned int number;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = (int16_t)(*complexVectorPtr++)*256;
-    *qBufferPtr++ = (int16_t)(*complexVectorPtr++)*256;
-  }
+    const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+    int16_t* iBufferPtr = iBuffer;
+    int16_t* qBufferPtr = qBuffer;
+    unsigned int number;
+    for (number = 0; number < num_points; number++) {
+        *iBufferPtr++ = (int16_t)(*complexVectorPtr++) * 256;
+        *qBufferPtr++ = (int16_t)(*complexVectorPtr++) * 256;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  
-
  #endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_a_H */
  
  #ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_u_H
@@ -243,47 +327,82 @@ volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer, int16_t* qBuffer,
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_8ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer, int16_t* qBuffer,
-                                   const lv_8sc_t* complexVector, unsigned int num_points)
+static inline void volk_8ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer,
+                                                       int16_t* qBuffer,
+                                                       const lv_8sc_t* complexVector,
+                                                       unsigned int num_points)
  {
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  int16_t* qBufferPtr = qBuffer;
-  __m256i MoveMask = _mm256_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);  
-  __m256i complexVal, iOutputVal, qOutputVal;
-  __m128i iOutputVal0, qOutputVal0;
-
-  unsigned int sixteenthPoints = num_points / 16;
-
-  for(number = 0; number < sixteenthPoints; number++){
-    complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);  complexVectorPtr += 32;    
-
-    complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
-    complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
-
-    iOutputVal0 = _mm256_extracti128_si256(complexVal, 0);
-    qOutputVal0 = _mm256_extracti128_si256(complexVal, 1);
-
-    iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0);
-    iOutputVal = _mm256_slli_epi16(iOutputVal, 8);
-
-    qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0);
-    qOutputVal = _mm256_slli_epi16(qOutputVal, 8);
-
-    _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);   
-    _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal);
-
-    iBufferPtr += 16;
-    qBufferPtr += 16;
-  }
-
-  number = sixteenthPoints * 16;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;   // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
-    *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
-  }
+    unsigned int number = 0;
+    const int8_t* complexVectorPtr = (int8_t*)complexVector;
+    int16_t* iBufferPtr = iBuffer;
+    int16_t* qBufferPtr = qBuffer;
+    __m256i MoveMask = _mm256_set_epi8(15,
+                                       13,
+                                       11,
+                                       9,
+                                       7,
+                                       5,
+                                       3,
+                                       1,
+                                       14,
+                                       12,
+                                       10,
+                                       8,
+                                       6,
+                                       4,
+                                       2,
+                                       0,
+                                       15,
+                                       13,
+                                       11,
+                                       9,
+                                       7,
+                                       5,
+                                       3,
+                                       1,
+                                       14,
+                                       12,
+                                       10,
+                                       8,
+                                       6,
+                                       4,
+                                       2,
+                                       0);
+    __m256i complexVal, iOutputVal, qOutputVal;
+    __m128i iOutputVal0, qOutputVal0;
+
+    unsigned int sixteenthPoints = num_points / 16;
+
+    for (number = 0; number < sixteenthPoints; number++) {
+        complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 32;
+
+        complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
+        complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
+
+        iOutputVal0 = _mm256_extracti128_si256(complexVal, 0);
+        qOutputVal0 = _mm256_extracti128_si256(complexVal, 1);
+
+        iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0);
+        iOutputVal = _mm256_slli_epi16(iOutputVal, 8);
+
+        qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0);
+        qOutputVal = _mm256_slli_epi16(qOutputVal, 8);
+
+        _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
+        _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal);
+
+        iBufferPtr += 16;
+        qBufferPtr += 16;
+    }
+
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ =
+            ((int16_t)*complexVectorPtr++) *
+            256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
+        *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  #endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_u_H */
diff --git a/kernels/volk/volk_8ic_deinterleave_real_16i.h b/kernels/volk/volk_8ic_deinterleave_real_16i.h

index f15879a3b550a8002555dd08bf30d1013076c353..aaebb472d603e3a32e95250cadfc78cb847e8875 100644 (file)
--- a/kernels/volk/volk_8ic_deinterleave_real_16i.h
+++ b/kernels/volk/volk_8ic_deinterleave_real_16i.h
@@ -30,8 +30,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_8ic_deinterleave_real_16i(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points)
- * \endcode
+ * void volk_8ic_deinterleave_real_16i(int16_t* iBuffer, const lv_8sc_t* complexVector,
+ * unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li complexVector: The complex input vector.
@@ -60,75 +60,109 @@
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_8ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer, const lv_8sc_t* complexVector,
-                                     unsigned int num_points)
+static inline void volk_8ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer,
+                                                         const lv_8sc_t* complexVector,
+                                                         unsigned int num_points)
  {
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
-  __m256i complexVal, outputVal;
-  __m128i outputVal0;
-
-  unsigned int sixteenthPoints = num_points / 16;
-
-  for(number = 0; number < sixteenthPoints; number++){
-    complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);  complexVectorPtr += 32;
-
-    complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
-    complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
-
-    outputVal0 = _mm256_extractf128_si256(complexVal, 0);
-
-    outputVal = _mm256_cvtepi8_epi16(outputVal0);
-    outputVal = _mm256_slli_epi16(outputVal, 7);
-
-    _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
-
-    iBufferPtr += 16;
-  }
-
-  number = sixteenthPoints * 16;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
-    complexVectorPtr++;
-  }
+    unsigned int number = 0;
+    const int8_t* complexVectorPtr = (int8_t*)complexVector;
+    int16_t* iBufferPtr = iBuffer;
+    __m256i moveMask = _mm256_set_epi8(0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       14,
+                                       12,
+                                       10,
+                                       8,
+                                       6,
+                                       4,
+                                       2,
+                                       0,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       14,
+                                       12,
+                                       10,
+                                       8,
+                                       6,
+                                       4,
+                                       2,
+                                       0);
+    __m256i complexVal, outputVal;
+    __m128i outputVal0;
+
+    unsigned int sixteenthPoints = num_points / 16;
+
+    for (number = 0; number < sixteenthPoints; number++) {
+        complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 32;
+
+        complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
+        complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
+
+        outputVal0 = _mm256_extractf128_si256(complexVal, 0);
+
+        outputVal = _mm256_cvtepi8_epi16(outputVal0);
+        outputVal = _mm256_slli_epi16(outputVal, 7);
+
+        _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
+
+        iBufferPtr += 16;
+    }
+
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
  #ifdef LV_HAVE_SSE4_1
  #include <smmintrin.h>
  
-static inline void
-volk_8ic_deinterleave_real_16i_a_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector,
-                                        unsigned int num_points)
+static inline void volk_8ic_deinterleave_real_16i_a_sse4_1(int16_t* iBuffer,
+                                                           const lv_8sc_t* complexVector,
+                                                           unsigned int num_points)
  {
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
-  __m128i complexVal, outputVal;
+    unsigned int number = 0;
+    const int8_t* complexVectorPtr = (int8_t*)complexVector;
+    int16_t* iBufferPtr = iBuffer;
+    __m128i moveMask = _mm_set_epi8(
+        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+    __m128i complexVal, outputVal;
  
-  unsigned int eighthPoints = num_points / 8;
+    unsigned int eighthPoints = num_points / 8;
  
-  for(number = 0; number < eighthPoints; number++){
-    complexVal = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+    for (number = 0; number < eighthPoints; number++) {
+        complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
+        complexVectorPtr += 16;
  
-    complexVal = _mm_shuffle_epi8(complexVal, moveMask);
+        complexVal = _mm_shuffle_epi8(complexVal, moveMask);
  
-    outputVal = _mm_cvtepi8_epi16(complexVal);
-    outputVal = _mm_slli_epi16(outputVal, 7);
+        outputVal = _mm_cvtepi8_epi16(complexVal);
+        outputVal = _mm_slli_epi16(outputVal, 7);
  
-    _mm_store_si128((__m128i*)iBufferPtr, outputVal);
-    iBufferPtr += 8;
-  }
+        _mm_store_si128((__m128i*)iBufferPtr, outputVal);
+        iBufferPtr += 8;
+    }
  
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
-    complexVectorPtr++;
-  }
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_SSE4_1 */
  
@@ -136,63 +170,65 @@ volk_8ic_deinterleave_real_16i_a_sse4_1(int16_t* iBuffer, const lv_8sc_t* comple
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_8ic_deinterleave_real_16i_a_avx(int16_t* iBuffer, const lv_8sc_t* complexVector,
-                                     unsigned int num_points)
+static inline void volk_8ic_deinterleave_real_16i_a_avx(int16_t* iBuffer,
+                                                        const lv_8sc_t* complexVector,
+                                                        unsigned int num_points)
  {
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
-  __m256i complexVal, outputVal;
-  __m128i complexVal1, complexVal0, outputVal1, outputVal0;
-
-  unsigned int sixteenthPoints = num_points / 16;
-
-  for(number = 0; number < sixteenthPoints; number++){
-    complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);  complexVectorPtr += 32;
-
-    complexVal1 = _mm256_extractf128_si256(complexVal, 1);
-    complexVal0 = _mm256_extractf128_si256(complexVal, 0);
-
-    outputVal1 = _mm_shuffle_epi8(complexVal1, moveMask);
-    outputVal0 = _mm_shuffle_epi8(complexVal0, moveMask);
-
-    outputVal1 = _mm_cvtepi8_epi16(outputVal1);
-    outputVal1 = _mm_slli_epi16(outputVal1, 7);
-    outputVal0 = _mm_cvtepi8_epi16(outputVal0);
-    outputVal0 = _mm_slli_epi16(outputVal0, 7);
-
-    __m256i dummy = _mm256_setzero_si256();
-    outputVal = _mm256_insertf128_si256(dummy, outputVal0, 0);
-    outputVal = _mm256_insertf128_si256(outputVal, outputVal1, 1);
-    _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
-
-    iBufferPtr += 16;
-  }
-
-  number = sixteenthPoints * 16;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
-    complexVectorPtr++;
-  }
+    unsigned int number = 0;
+    const int8_t* complexVectorPtr = (int8_t*)complexVector;
+    int16_t* iBufferPtr = iBuffer;
+    __m128i moveMask = _mm_set_epi8(
+        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+    __m256i complexVal, outputVal;
+    __m128i complexVal1, complexVal0, outputVal1, outputVal0;
+
+    unsigned int sixteenthPoints = num_points / 16;
+
+    for (number = 0; number < sixteenthPoints; number++) {
+        complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 32;
+
+        complexVal1 = _mm256_extractf128_si256(complexVal, 1);
+        complexVal0 = _mm256_extractf128_si256(complexVal, 0);
+
+        outputVal1 = _mm_shuffle_epi8(complexVal1, moveMask);
+        outputVal0 = _mm_shuffle_epi8(complexVal0, moveMask);
+
+        outputVal1 = _mm_cvtepi8_epi16(outputVal1);
+        outputVal1 = _mm_slli_epi16(outputVal1, 7);
+        outputVal0 = _mm_cvtepi8_epi16(outputVal0);
+        outputVal0 = _mm_slli_epi16(outputVal0, 7);
+
+        __m256i dummy = _mm256_setzero_si256();
+        outputVal = _mm256_insertf128_si256(dummy, outputVal0, 0);
+        outputVal = _mm256_insertf128_si256(outputVal, outputVal1, 1);
+        _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
+
+        iBufferPtr += 16;
+    }
+
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_8ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_8sc_t* complexVector,
-                                       unsigned int num_points)
+static inline void volk_8ic_deinterleave_real_16i_generic(int16_t* iBuffer,
+                                                          const lv_8sc_t* complexVector,
+                                                          unsigned int num_points)
  {
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128;
-    complexVectorPtr++;
-  }
+    unsigned int number = 0;
+    const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+    int16_t* iBufferPtr = iBuffer;
+    for (number = 0; number < num_points; number++) {
+        *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -209,40 +245,72 @@ volk_8ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_8sc_t* complex
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_8ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer, const lv_8sc_t* complexVector,
-                                     unsigned int num_points)
+static inline void volk_8ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer,
+                                                         const lv_8sc_t* complexVector,
+                                                         unsigned int num_points)
  {
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
-  __m256i complexVal, outputVal;
-  __m128i outputVal0;
-
-  unsigned int sixteenthPoints = num_points / 16;
-
-  for(number = 0; number < sixteenthPoints; number++){
-    complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);  complexVectorPtr += 32;
-
-    complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
-    complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
-
-    outputVal0 = _mm256_extractf128_si256(complexVal, 0);
-
-    outputVal = _mm256_cvtepi8_epi16(outputVal0);
-    outputVal = _mm256_slli_epi16(outputVal, 7);
-
-    _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal);
-
-    iBufferPtr += 16;
-  }
-
-  number = sixteenthPoints * 16;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
-    complexVectorPtr++;
-  }
+    unsigned int number = 0;
+    const int8_t* complexVectorPtr = (int8_t*)complexVector;
+    int16_t* iBufferPtr = iBuffer;
+    __m256i moveMask = _mm256_set_epi8(0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       14,
+                                       12,
+                                       10,
+                                       8,
+                                       6,
+                                       4,
+                                       2,
+                                       0,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       14,
+                                       12,
+                                       10,
+                                       8,
+                                       6,
+                                       4,
+                                       2,
+                                       0);
+    __m256i complexVal, outputVal;
+    __m128i outputVal0;
+
+    unsigned int sixteenthPoints = num_points / 16;
+
+    for (number = 0; number < sixteenthPoints; number++) {
+        complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 32;
+
+        complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
+        complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
+
+        outputVal0 = _mm256_extractf128_si256(complexVal, 0);
+
+        outputVal = _mm256_cvtepi8_epi16(outputVal0);
+        outputVal = _mm256_slli_epi16(outputVal, 7);
+
+        _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal);
+
+        iBufferPtr += 16;
+    }
+
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  #endif /* INCLUDED_volk_8ic_deinterleave_real_16i_u_H */
diff --git a/kernels/volk/volk_8ic_deinterleave_real_8i.h b/kernels/volk/volk_8ic_deinterleave_real_8i.h

index 6cc3f15f2ca3ce5912224c32296bff916e98e8be..a1a835d54310cf3b1390abf55b54e9f638c93608 100644 (file)
--- a/kernels/volk/volk_8ic_deinterleave_real_8i.h
+++ b/kernels/volk/volk_8ic_deinterleave_real_8i.h
@@ -30,8 +30,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_8ic_deinterleave_real_8i(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points)
- * \endcode
+ * void volk_8ic_deinterleave_real_8i(int8_t* iBuffer, const lv_8sc_t* complexVector,
+ * unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li complexVector: The complex input vector.
@@ -59,40 +59,102 @@
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_8ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, const lv_8sc_t* complexVector,
-                                    unsigned int num_points)
+static inline void volk_8ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer,
+                                                        const lv_8sc_t* complexVector,
+                                                        unsigned int num_points)
  {
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int8_t* iBufferPtr = iBuffer;
-  __m256i moveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
-  __m256i moveMask2 = _mm256_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-  __m256i complexVal1, complexVal2, outputVal;
-
-  unsigned int thirtysecondPoints = num_points / 32;
-
-  for(number = 0; number < thirtysecondPoints; number++){
-
-    complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
-    complexVectorPtr += 32;
-    complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
-    complexVectorPtr += 32;
-
-    complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1);
-    complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2);
-    outputVal = _mm256_or_si256(complexVal1, complexVal2);
-    outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8);
-
-    _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
-    iBufferPtr += 32;
-  }
-
-  number = thirtysecondPoints * 32;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++;
-  }
+    unsigned int number = 0;
+    const int8_t* complexVectorPtr = (int8_t*)complexVector;
+    int8_t* iBufferPtr = iBuffer;
+    __m256i moveMask1 = _mm256_set_epi8(0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        14,
+                                        12,
+                                        10,
+                                        8,
+                                        6,
+                                        4,
+                                        2,
+                                        0,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        14,
+                                        12,
+                                        10,
+                                        8,
+                                        6,
+                                        4,
+                                        2,
+                                        0);
+    __m256i moveMask2 = _mm256_set_epi8(14,
+                                        12,
+                                        10,
+                                        8,
+                                        6,
+                                        4,
+                                        2,
+                                        0,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        14,
+                                        12,
+                                        10,
+                                        8,
+                                        6,
+                                        4,
+                                        2,
+                                        0,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80);
+    __m256i complexVal1, complexVal2, outputVal;
+
+    unsigned int thirtysecondPoints = num_points / 32;
+
+    for (number = 0; number < thirtysecondPoints; number++) {
+
+        complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 32;
+        complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 32;
+
+        complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1);
+        complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2);
+        outputVal = _mm256_or_si256(complexVal1, complexVal2);
+        outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8);
+
+        _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
+        iBufferPtr += 32;
+    }
+
+    number = thirtysecondPoints * 32;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = *complexVectorPtr++;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
@@ -100,37 +162,41 @@ volk_8ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, const lv_8sc_t* complexVec
  #ifdef LV_HAVE_SSSE3
  #include <tmmintrin.h>
  
-static inline void
-volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector,
-                                      unsigned int num_points)
+static inline void volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer,
+                                                         const lv_8sc_t* complexVector,
+                                                         unsigned int num_points)
  {
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int8_t* iBufferPtr = iBuffer;
-  __m128i moveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
-  __m128i moveMask2 = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-  __m128i complexVal1, complexVal2, outputVal;
-
-  unsigned int sixteenthPoints = num_points / 16;
-
-  for(number = 0; number < sixteenthPoints; number++){
-    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-
-    complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1);
-    complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2);
-
-    outputVal = _mm_or_si128(complexVal1, complexVal2);
-
-    _mm_store_si128((__m128i*)iBufferPtr, outputVal);
-    iBufferPtr += 16;
-  }
-
-  number = sixteenthPoints * 16;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++;
-  }
+    unsigned int number = 0;
+    const int8_t* complexVectorPtr = (int8_t*)complexVector;
+    int8_t* iBufferPtr = iBuffer;
+    __m128i moveMask1 = _mm_set_epi8(
+        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+    __m128i moveMask2 = _mm_set_epi8(
+        14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+    __m128i complexVal1, complexVal2, outputVal;
+
+    unsigned int sixteenthPoints = num_points / 16;
+
+    for (number = 0; number < sixteenthPoints; number++) {
+        complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
+        complexVectorPtr += 16;
+        complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
+        complexVectorPtr += 16;
+
+        complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1);
+        complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2);
+
+        outputVal = _mm_or_si128(complexVal1, complexVal2);
+
+        _mm_store_si128((__m128i*)iBufferPtr, outputVal);
+        iBufferPtr += 16;
+    }
+
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = *complexVectorPtr++;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_SSSE3 */
  
@@ -138,72 +204,75 @@ volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVe
  #ifdef LV_HAVE_AVX
  #include <immintrin.h>
  
-static inline void
-volk_8ic_deinterleave_real_8i_a_avx(int8_t* iBuffer, const lv_8sc_t* complexVector,
-                                    unsigned int num_points)
+static inline void volk_8ic_deinterleave_real_8i_a_avx(int8_t* iBuffer,
+                                                       const lv_8sc_t* complexVector,
+                                                       unsigned int num_points)
  {
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int8_t* iBufferPtr = iBuffer;
-  __m128i moveMaskL = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
-  __m128i moveMaskH = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-  __m256i complexVal1, complexVal2, outputVal;
-  __m128i complexVal1H, complexVal1L, complexVal2H, complexVal2L, outputVal1, outputVal2;
-
-  unsigned int thirtysecondPoints = num_points / 32;
-
-  for(number = 0; number < thirtysecondPoints; number++){
-
-    complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
-    complexVectorPtr += 32;
-    complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
-    complexVectorPtr += 32;
-
-    complexVal1H = _mm256_extractf128_si256(complexVal1, 1);
-    complexVal1L = _mm256_extractf128_si256(complexVal1, 0);
-    complexVal2H = _mm256_extractf128_si256(complexVal2, 1);
-    complexVal2L = _mm256_extractf128_si256(complexVal2, 0);
-
-    complexVal1H = _mm_shuffle_epi8(complexVal1H, moveMaskH);
-    complexVal1L = _mm_shuffle_epi8(complexVal1L, moveMaskL);
-    outputVal1 = _mm_or_si128(complexVal1H, complexVal1L);
-
-
-    complexVal2H = _mm_shuffle_epi8(complexVal2H, moveMaskH);
-    complexVal2L = _mm_shuffle_epi8(complexVal2L, moveMaskL);
-    outputVal2 = _mm_or_si128(complexVal2H, complexVal2L);
-
-    __m256i dummy = _mm256_setzero_si256();
-    outputVal = _mm256_insertf128_si256(dummy, outputVal1, 0);
-    outputVal = _mm256_insertf128_si256(outputVal, outputVal2, 1);
-
-
-    _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
-    iBufferPtr += 32;
-  }
-
-  number = thirtysecondPoints * 32;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++;
-  }
+    unsigned int number = 0;
+    const int8_t* complexVectorPtr = (int8_t*)complexVector;
+    int8_t* iBufferPtr = iBuffer;
+    __m128i moveMaskL = _mm_set_epi8(
+        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+    __m128i moveMaskH = _mm_set_epi8(
+        14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+    __m256i complexVal1, complexVal2, outputVal;
+    __m128i complexVal1H, complexVal1L, complexVal2H, complexVal2L, outputVal1,
+        outputVal2;
+
+    unsigned int thirtysecondPoints = num_points / 32;
+
+    for (number = 0; number < thirtysecondPoints; number++) {
+
+        complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 32;
+        complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 32;
+
+        complexVal1H = _mm256_extractf128_si256(complexVal1, 1);
+        complexVal1L = _mm256_extractf128_si256(complexVal1, 0);
+        complexVal2H = _mm256_extractf128_si256(complexVal2, 1);
+        complexVal2L = _mm256_extractf128_si256(complexVal2, 0);
+
+        complexVal1H = _mm_shuffle_epi8(complexVal1H, moveMaskH);
+        complexVal1L = _mm_shuffle_epi8(complexVal1L, moveMaskL);
+        outputVal1 = _mm_or_si128(complexVal1H, complexVal1L);
+
+
+        complexVal2H = _mm_shuffle_epi8(complexVal2H, moveMaskH);
+        complexVal2L = _mm_shuffle_epi8(complexVal2L, moveMaskL);
+        outputVal2 = _mm_or_si128(complexVal2H, complexVal2L);
+
+        __m256i dummy = _mm256_setzero_si256();
+        outputVal = _mm256_insertf128_si256(dummy, outputVal1, 0);
+        outputVal = _mm256_insertf128_si256(outputVal, outputVal2, 1);
+
+
+        _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
+        iBufferPtr += 32;
+    }
+
+    number = thirtysecondPoints * 32;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = *complexVectorPtr++;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX */
  
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer, const lv_8sc_t* complexVector,
-                                      unsigned int num_points)
+static inline void volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer,
+                                                         const lv_8sc_t* complexVector,
+                                                         unsigned int num_points)
  {
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int8_t* iBufferPtr = iBuffer;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++;
-  }
+    unsigned int number = 0;
+    const int8_t* complexVectorPtr = (int8_t*)complexVector;
+    int8_t* iBufferPtr = iBuffer;
+    for (number = 0; number < num_points; number++) {
+        *iBufferPtr++ = *complexVectorPtr++;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -211,26 +280,27 @@ volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer, const lv_8sc_t* complexVe
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>
  
-static inline void
-volk_8ic_deinterleave_real_8i_neon(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points)
+static inline void volk_8ic_deinterleave_real_8i_neon(int8_t* iBuffer,
+                                                      const lv_8sc_t* complexVector,
+                                                      unsigned int num_points)
  {
-  unsigned int number;
-  unsigned int sixteenth_points = num_points / 16;
-
-  int8x16x2_t input_vector;
-  for(number=0; number < sixteenth_points; ++number) {
-    input_vector = vld2q_s8((int8_t*) complexVector );
-    vst1q_s8(iBuffer, input_vector.val[0]);
-    iBuffer += 16;
-    complexVector += 16;
-  }
-
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int8_t* iBufferPtr = iBuffer;
-  for(number = sixteenth_points*16; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++;
-  }
+    unsigned int number;
+    unsigned int sixteenth_points = num_points / 16;
+
+    int8x16x2_t input_vector;
+    for (number = 0; number < sixteenth_points; ++number) {
+        input_vector = vld2q_s8((int8_t*)complexVector);
+        vst1q_s8(iBuffer, input_vector.val[0]);
+        iBuffer += 16;
+        complexVector += 16;
+    }
+
+    const int8_t* complexVectorPtr = (int8_t*)complexVector;
+    int8_t* iBufferPtr = iBuffer;
+    for (number = sixteenth_points * 16; number < num_points; number++) {
+        *iBufferPtr++ = *complexVectorPtr++;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_NEON */
  
@@ -246,40 +316,102 @@ volk_8ic_deinterleave_real_8i_neon(int8_t* iBuffer, const lv_8sc_t* complexVecto
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_8ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer, const lv_8sc_t* complexVector,
-                                    unsigned int num_points)
+static inline void volk_8ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer,
+                                                        const lv_8sc_t* complexVector,
+                                                        unsigned int num_points)
  {
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int8_t* iBufferPtr = iBuffer;
-  __m256i moveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
-  __m256i moveMask2 = _mm256_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-  __m256i complexVal1, complexVal2, outputVal;
-
-  unsigned int thirtysecondPoints = num_points / 32;
-
-  for(number = 0; number < thirtysecondPoints; number++){
-
-    complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
-    complexVectorPtr += 32;
-    complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
-    complexVectorPtr += 32;
-
-    complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1);
-    complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2);
-    outputVal = _mm256_or_si256(complexVal1, complexVal2);
-    outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8);
-
-    _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal);
-    iBufferPtr += 32;
-  }
-
-  number = thirtysecondPoints * 32;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++;
-  }
+    unsigned int number = 0;
+    const int8_t* complexVectorPtr = (int8_t*)complexVector;
+    int8_t* iBufferPtr = iBuffer;
+    __m256i moveMask1 = _mm256_set_epi8(0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        14,
+                                        12,
+                                        10,
+                                        8,
+                                        6,
+                                        4,
+                                        2,
+                                        0,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        14,
+                                        12,
+                                        10,
+                                        8,
+                                        6,
+                                        4,
+                                        2,
+                                        0);
+    __m256i moveMask2 = _mm256_set_epi8(14,
+                                        12,
+                                        10,
+                                        8,
+                                        6,
+                                        4,
+                                        2,
+                                        0,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        14,
+                                        12,
+                                        10,
+                                        8,
+                                        6,
+                                        4,
+                                        2,
+                                        0,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80);
+    __m256i complexVal1, complexVal2, outputVal;
+
+    unsigned int thirtysecondPoints = num_points / 32;
+
+    for (number = 0; number < thirtysecondPoints; number++) {
+
+        complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 32;
+        complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 32;
+
+        complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1);
+        complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2);
+        outputVal = _mm256_or_si256(complexVal1, complexVal2);
+        outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8);
+
+        _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal);
+        iBufferPtr += 32;
+    }
+
+    number = thirtysecondPoints * 32;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = *complexVectorPtr++;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
diff --git a/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h b/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h

index 736f7c03345d957cb749526145db6e3e4f61df63..f62275207f38da558715316e81f79d8ecbbb2f0e 100644 (file)
--- a/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h
+++ b/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h
@@ -31,8 +31,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_8ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points)
- * \endcode
+ * void volk_8ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_8sc_t*
+ * complexVector, const float scalar, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li complexVector: The complex input vector.
@@ -56,74 +56,79 @@
  #ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H
  #define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H
  
-#include <volk/volk_common.h>
  #include <inttypes.h>
  #include <stdio.h>
+#include <volk/volk_common.h>
  
  
  #ifdef LV_HAVE_SSE4_1
  #include <smmintrin.h>
  
  static inline void
-volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector,
-                                           const float scalar, unsigned int num_points)
+volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer,
+                                           float* qBuffer,
+                                           const lv_8sc_t* complexVector,
+                                           const float scalar,
+                                           unsigned int num_points)
  {
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
-
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-  __m128 iFloatValue, qFloatValue;
-
-  const float iScalar= 1.0 / scalar;
-  __m128 invScalar = _mm_set_ps1(iScalar);
-  __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
-  int8_t* complexVectorPtr = (int8_t*)complexVector;
-
-  __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
-  __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
-
-  for(;number < eighthPoints; number++){
-    complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
-    iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask);
-    qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask);
-
-    iIntVal = _mm_cvtepi8_epi32(iComplexVal);
-    iFloatValue = _mm_cvtepi32_ps(iIntVal);
-    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
-    _mm_store_ps(iBufferPtr, iFloatValue);
-    iBufferPtr += 4;
-
-    iComplexVal = _mm_srli_si128(iComplexVal, 4);
-
-    iIntVal = _mm_cvtepi8_epi32(iComplexVal);
-    iFloatValue = _mm_cvtepi32_ps(iIntVal);
-    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
-    _mm_store_ps(iBufferPtr, iFloatValue);
-    iBufferPtr += 4;
-
-    qIntVal = _mm_cvtepi8_epi32(qComplexVal);
-    qFloatValue = _mm_cvtepi32_ps(qIntVal);
-    qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
-    _mm_store_ps(qBufferPtr, qFloatValue);
-    qBufferPtr += 4;
-
-    qComplexVal = _mm_srli_si128(qComplexVal, 4);
-
-    qIntVal = _mm_cvtepi8_epi32(qComplexVal);
-    qFloatValue = _mm_cvtepi32_ps(qIntVal);
-    qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
-    _mm_store_ps(qBufferPtr, qFloatValue);
-
-    qBufferPtr += 4;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
-    *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
-  }
-
+    float* iBufferPtr = iBuffer;
+    float* qBufferPtr = qBuffer;
+
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+    __m128 iFloatValue, qFloatValue;
+
+    const float iScalar = 1.0 / scalar;
+    __m128 invScalar = _mm_set_ps1(iScalar);
+    __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
+    int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+    __m128i iMoveMask = _mm_set_epi8(
+        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+    __m128i qMoveMask = _mm_set_epi8(
+        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+
+    for (; number < eighthPoints; number++) {
+        complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
+        complexVectorPtr += 16;
+        iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask);
+        qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask);
+
+        iIntVal = _mm_cvtepi8_epi32(iComplexVal);
+        iFloatValue = _mm_cvtepi32_ps(iIntVal);
+        iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+        _mm_store_ps(iBufferPtr, iFloatValue);
+        iBufferPtr += 4;
+
+        iComplexVal = _mm_srli_si128(iComplexVal, 4);
+
+        iIntVal = _mm_cvtepi8_epi32(iComplexVal);
+        iFloatValue = _mm_cvtepi32_ps(iIntVal);
+        iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+        _mm_store_ps(iBufferPtr, iFloatValue);
+        iBufferPtr += 4;
+
+        qIntVal = _mm_cvtepi8_epi32(qComplexVal);
+        qFloatValue = _mm_cvtepi32_ps(qIntVal);
+        qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
+        _mm_store_ps(qBufferPtr, qFloatValue);
+        qBufferPtr += 4;
+
+        qComplexVal = _mm_srli_si128(qComplexVal, 4);
+
+        qIntVal = _mm_cvtepi8_epi32(qComplexVal);
+        qFloatValue = _mm_cvtepi32_ps(qIntVal);
+        qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
+        _mm_store_ps(qBufferPtr, qFloatValue);
+
+        qBufferPtr += 4;
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+        *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+    }
  }
  #endif /* LV_HAVE_SSE4_1 */
  
@@ -131,59 +136,60 @@ volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer, float* qBuffer, const
  #ifdef LV_HAVE_SSE
  #include <xmmintrin.h>
  
-static inline void
-volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer,
-                                        const lv_8sc_t* complexVector,
-                                        const float scalar, unsigned int num_points)
+static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer,
+                                                           float* qBuffer,
+                                                           const lv_8sc_t* complexVector,
+                                                           const float scalar,
+                                                           unsigned int num_points)
  {
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
+    float* iBufferPtr = iBuffer;
+    float* qBufferPtr = qBuffer;
  
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-  __m128 cplxValue1, cplxValue2, iValue, qValue;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+    __m128 cplxValue1, cplxValue2, iValue, qValue;
  
-  __m128 invScalar = _mm_set_ps1(1.0/scalar);
-  int8_t* complexVectorPtr = (int8_t*)complexVector;
+    __m128 invScalar = _mm_set_ps1(1.0 / scalar);
+    int8_t* complexVectorPtr = (int8_t*)complexVector;
  
-  __VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
+    __VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
  
-  for(;number < quarterPoints; number++){
-    floatBuffer[0] = (float)(complexVectorPtr[0]);
-    floatBuffer[1] = (float)(complexVectorPtr[1]);
-    floatBuffer[2] = (float)(complexVectorPtr[2]);
-    floatBuffer[3] = (float)(complexVectorPtr[3]);
+    for (; number < quarterPoints; number++) {
+        floatBuffer[0] = (float)(complexVectorPtr[0]);
+        floatBuffer[1] = (float)(complexVectorPtr[1]);
+        floatBuffer[2] = (float)(complexVectorPtr[2]);
+        floatBuffer[3] = (float)(complexVectorPtr[3]);
  
-    floatBuffer[4] = (float)(complexVectorPtr[4]);
-    floatBuffer[5] = (float)(complexVectorPtr[5]);
-    floatBuffer[6] = (float)(complexVectorPtr[6]);
-    floatBuffer[7] = (float)(complexVectorPtr[7]);
+        floatBuffer[4] = (float)(complexVectorPtr[4]);
+        floatBuffer[5] = (float)(complexVectorPtr[5]);
+        floatBuffer[6] = (float)(complexVectorPtr[6]);
+        floatBuffer[7] = (float)(complexVectorPtr[7]);
  
-    cplxValue1 = _mm_load_ps(&floatBuffer[0]);
-    cplxValue2 = _mm_load_ps(&floatBuffer[4]);
+        cplxValue1 = _mm_load_ps(&floatBuffer[0]);
+        cplxValue2 = _mm_load_ps(&floatBuffer[4]);
  
-    complexVectorPtr += 8;
+        complexVectorPtr += 8;
  
-    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
-    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+        cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+        cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
  
-    // Arrange in i1i2i3i4 format
-    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+        // Arrange in i1i2i3i4 format
+        iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+        qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
  
-    _mm_store_ps(iBufferPtr, iValue);
-    _mm_store_ps(qBufferPtr, qValue);
+        _mm_store_ps(iBufferPtr, iValue);
+        _mm_store_ps(qBufferPtr, qValue);
  
-    iBufferPtr += 4;
-    qBufferPtr += 4;
-  }
+        iBufferPtr += 4;
+        qBufferPtr += 4;
+    }
  
-  number = quarterPoints * 4;
-  complexVectorPtr = (int8_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
-    *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
-  }
+    number = quarterPoints * 4;
+    complexVectorPtr = (int8_t*)&complexVector[number];
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+        *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+    }
  }
  #endif /* LV_HAVE_SSE */
  
@@ -191,70 +197,127 @@ volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer,
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_8ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector,
-                                           const float scalar, unsigned int num_points)
+static inline void volk_8ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer,
+                                                            float* qBuffer,
+                                                            const lv_8sc_t* complexVector,
+                                                            const float scalar,
+                                                            unsigned int num_points)
  {
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
-
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-  __m256 iFloatValue, qFloatValue;
-
-  const float iScalar= 1.0 / scalar;
-  __m256 invScalar = _mm256_set1_ps(iScalar);
-  __m256i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
-  int8_t* complexVectorPtr = (int8_t*)complexVector;
-
-  __m256i iMoveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-                                      14, 12, 10, 8, 6, 4, 2, 0,
-                                      0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-                                      14, 12, 10, 8, 6, 4, 2, 0);
-  __m256i qMoveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-                                      15, 13, 11, 9, 7, 5, 3, 1,
-                                      0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-                                      15, 13, 11, 9, 7, 5, 3, 1);
-
-  for(;number < sixteenthPoints; number++){
-    complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
-    complexVectorPtr += 32;
-    iComplexVal = _mm256_shuffle_epi8(complexVal, iMoveMask);
-    qComplexVal = _mm256_shuffle_epi8(complexVal, qMoveMask);
-
-    iIntVal     = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
-    iFloatValue = _mm256_cvtepi32_ps(iIntVal);
-    iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
-    _mm256_store_ps(iBufferPtr, iFloatValue);
-    iBufferPtr += 8;
-
-    iComplexVal = _mm256_permute4x64_epi64(iComplexVal, 0b11000110);
-    iIntVal     = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
-    iFloatValue = _mm256_cvtepi32_ps(iIntVal);
-    iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
-    _mm256_store_ps(iBufferPtr, iFloatValue);
-    iBufferPtr += 8;
-
-    qIntVal     = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
-    qFloatValue = _mm256_cvtepi32_ps(qIntVal);
-    qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
-    _mm256_store_ps(qBufferPtr, qFloatValue);
-    qBufferPtr += 8;
-
-    qComplexVal = _mm256_permute4x64_epi64(qComplexVal, 0b11000110);
-    qIntVal     = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
-    qFloatValue = _mm256_cvtepi32_ps(qIntVal);
-    qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
-    _mm256_store_ps(qBufferPtr, qFloatValue);
-    qBufferPtr += 8;
-  }
-
-  number = sixteenthPoints * 16;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
-    *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
-  }
-
+    float* iBufferPtr = iBuffer;
+    float* qBufferPtr = qBuffer;
+
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+    __m256 iFloatValue, qFloatValue;
+
+    const float iScalar = 1.0 / scalar;
+    __m256 invScalar = _mm256_set1_ps(iScalar);
+    __m256i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
+    int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+    __m256i iMoveMask = _mm256_set_epi8(0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        14,
+                                        12,
+                                        10,
+                                        8,
+                                        6,
+                                        4,
+                                        2,
+                                        0,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        14,
+                                        12,
+                                        10,
+                                        8,
+                                        6,
+                                        4,
+                                        2,
+                                        0);
+    __m256i qMoveMask = _mm256_set_epi8(0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        15,
+                                        13,
+                                        11,
+                                        9,
+                                        7,
+                                        5,
+                                        3,
+                                        1,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        0x80,
+                                        15,
+                                        13,
+                                        11,
+                                        9,
+                                        7,
+                                        5,
+                                        3,
+                                        1);
+
+    for (; number < sixteenthPoints; number++) {
+        complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 32;
+        iComplexVal = _mm256_shuffle_epi8(complexVal, iMoveMask);
+        qComplexVal = _mm256_shuffle_epi8(complexVal, qMoveMask);
+
+        iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
+        iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+        iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+        _mm256_store_ps(iBufferPtr, iFloatValue);
+        iBufferPtr += 8;
+
+        iComplexVal = _mm256_permute4x64_epi64(iComplexVal, 0b11000110);
+        iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
+        iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+        iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+        _mm256_store_ps(iBufferPtr, iFloatValue);
+        iBufferPtr += 8;
+
+        qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
+        qFloatValue = _mm256_cvtepi32_ps(qIntVal);
+        qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
+        _mm256_store_ps(qBufferPtr, qFloatValue);
+        qBufferPtr += 8;
+
+        qComplexVal = _mm256_permute4x64_epi64(qComplexVal, 0b11000110);
+        qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
+        qFloatValue = _mm256_cvtepi32_ps(qIntVal);
+        qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
+        _mm256_store_ps(qBufferPtr, qFloatValue);
+        qBufferPtr += 8;
+    }
+
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+        *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
@@ -262,19 +325,21 @@ volk_8ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer, float* qBuffer, const l
  #ifdef LV_HAVE_GENERIC
  
  static inline void
-volk_8ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer,
+volk_8ic_s32f_deinterleave_32f_x2_generic(float* iBuffer,
+                                          float* qBuffer,
                                            const lv_8sc_t* complexVector,
-                                          const float scalar, unsigned int num_points)
+                                          const float scalar,
+                                          unsigned int num_points)
  {
-  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
-  unsigned int number;
-  const float invScalar = 1.0 / scalar;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++)*invScalar;
-    *qBufferPtr++ = (float)(*complexVectorPtr++)*invScalar;
-  }
+    const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+    float* iBufferPtr = iBuffer;
+    float* qBufferPtr = qBuffer;
+    unsigned int number;
+    const float invScalar = 1.0 / scalar;
+    for (number = 0; number < num_points; number++) {
+        *iBufferPtr++ = (float)(*complexVectorPtr++) * invScalar;
+        *qBufferPtr++ = (float)(*complexVectorPtr++) * invScalar;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -285,75 +350,107 @@ volk_8ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer,
  #ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H
  #define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H
  
-#include <volk/volk_common.h>
  #include <inttypes.h>
  #include <stdio.h>
+#include <volk/volk_common.h>
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_8ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector,
-                                           const float scalar, unsigned int num_points)
+static inline void volk_8ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer,
+                                                            float* qBuffer,
+                                                            const lv_8sc_t* complexVector,
+                                                            const float scalar,
+                                                            unsigned int num_points)
  {
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
-
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-  __m256 iFloatValue, qFloatValue;
-
-  const float iScalar= 1.0 / scalar;
-  __m256 invScalar = _mm256_set1_ps(iScalar);
-  __m256i complexVal, iIntVal, qIntVal;
-  __m128i iComplexVal, qComplexVal;
-  int8_t* complexVectorPtr = (int8_t*)complexVector;
-
-  __m256i MoveMask = _mm256_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8,
-      6, 4, 2, 0,15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
-
-  for(;number < sixteenthPoints; number++){
-    complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
-    complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
-    complexVal = _mm256_permute4x64_epi64(complexVal,0xd8);
-    iComplexVal = _mm256_extractf128_si256(complexVal,0);
-    qComplexVal = _mm256_extractf128_si256(complexVal,1);
-
-    iIntVal = _mm256_cvtepi8_epi32(iComplexVal);
-    iFloatValue = _mm256_cvtepi32_ps(iIntVal);
-    iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
-    _mm256_storeu_ps(iBufferPtr, iFloatValue);
-    iBufferPtr += 8;
-
-    qIntVal = _mm256_cvtepi8_epi32(qComplexVal);
-    qFloatValue = _mm256_cvtepi32_ps(qIntVal);
-    qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
-    _mm256_storeu_ps(qBufferPtr, qFloatValue);
-    qBufferPtr += 8;
-
-    complexVal = _mm256_srli_si256(complexVal, 8);
-    iComplexVal = _mm256_extractf128_si256(complexVal,0);
-    qComplexVal = _mm256_extractf128_si256(complexVal,1);
-
-    iIntVal = _mm256_cvtepi8_epi32(iComplexVal);
-    iFloatValue = _mm256_cvtepi32_ps(iIntVal);
-    iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
-    _mm256_storeu_ps(iBufferPtr, iFloatValue);
-    iBufferPtr += 8;
-
-    qIntVal = _mm256_cvtepi8_epi32(qComplexVal);
-    qFloatValue = _mm256_cvtepi32_ps(qIntVal);
-    qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
-    _mm256_storeu_ps(qBufferPtr, qFloatValue);
-    qBufferPtr += 8;
-  }
-
-  number = sixteenthPoints * 16;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
-    *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
-  }
-
+    float* iBufferPtr = iBuffer;
+    float* qBufferPtr = qBuffer;
+
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+    __m256 iFloatValue, qFloatValue;
+
+    const float iScalar = 1.0 / scalar;
+    __m256 invScalar = _mm256_set1_ps(iScalar);
+    __m256i complexVal, iIntVal, qIntVal;
+    __m128i iComplexVal, qComplexVal;
+    int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+    __m256i MoveMask = _mm256_set_epi8(15,
+                                       13,
+                                       11,
+                                       9,
+                                       7,
+                                       5,
+                                       3,
+                                       1,
+                                       14,
+                                       12,
+                                       10,
+                                       8,
+                                       6,
+                                       4,
+                                       2,
+                                       0,
+                                       15,
+                                       13,
+                                       11,
+                                       9,
+                                       7,
+                                       5,
+                                       3,
+                                       1,
+                                       14,
+                                       12,
+                                       10,
+                                       8,
+                                       6,
+                                       4,
+                                       2,
+                                       0);
+
+    for (; number < sixteenthPoints; number++) {
+        complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 32;
+        complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
+        complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
+        iComplexVal = _mm256_extractf128_si256(complexVal, 0);
+        qComplexVal = _mm256_extractf128_si256(complexVal, 1);
+
+        iIntVal = _mm256_cvtepi8_epi32(iComplexVal);
+        iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+        iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+        _mm256_storeu_ps(iBufferPtr, iFloatValue);
+        iBufferPtr += 8;
+
+        qIntVal = _mm256_cvtepi8_epi32(qComplexVal);
+        qFloatValue = _mm256_cvtepi32_ps(qIntVal);
+        qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
+        _mm256_storeu_ps(qBufferPtr, qFloatValue);
+        qBufferPtr += 8;
+
+        complexVal = _mm256_srli_si256(complexVal, 8);
+        iComplexVal = _mm256_extractf128_si256(complexVal, 0);
+        qComplexVal = _mm256_extractf128_si256(complexVal, 1);
+
+        iIntVal = _mm256_cvtepi8_epi32(iComplexVal);
+        iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+        iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+        _mm256_storeu_ps(iBufferPtr, iFloatValue);
+        iBufferPtr += 8;
+
+        qIntVal = _mm256_cvtepi8_epi32(qComplexVal);
+        qFloatValue = _mm256_cvtepi32_ps(qIntVal);
+        qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
+        _mm256_storeu_ps(qBufferPtr, qFloatValue);
+        qBufferPtr += 8;
+    }
+
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+        *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
diff --git a/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h b/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h

index 0c85ee9aa71f4f8a03a010b93fbf8e11a5412a24..4c1afe7c752edf458b1038379cccd0ba5d12c76b 100644 (file)
--- a/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h
+++ b/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h
@@ -31,8 +31,8 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_8ic_s32f_deinterleave_real_32f(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points)
- * \endcode
+ * void volk_8ic_s32f_deinterleave_real_32f(float* iBuffer, const lv_8sc_t* complexVector,
+ * const float scalar, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li complexVector: The complex input vector.
@@ -55,57 +55,86 @@
  #ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H
  #define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H
  
-#include <volk/volk_common.h>
  #include <inttypes.h>
  #include <stdio.h>
+#include <volk/volk_common.h>
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
  static inline void
-volk_8ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer, const lv_8sc_t* complexVector,
-                                             const float scalar, unsigned int num_points)
+volk_8ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer,
+                                           const lv_8sc_t* complexVector,
+                                           const float scalar,
+                                           unsigned int num_points)
  {
-  float* iBufferPtr = iBuffer;
-
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-  __m256 iFloatValue;
-
-  const float iScalar= 1.0 / scalar;
-  __m256 invScalar = _mm256_set1_ps(iScalar);
-  __m256i complexVal, iIntVal;
-  int8_t* complexVectorPtr = (int8_t*)complexVector;
-
-  __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-                                     14, 12, 10, 8, 6, 4, 2, 0,
-                                     0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-                                     14, 12, 10, 8, 6, 4, 2, 0);
-  for(;number < sixteenthPoints; number++){
-    complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
-    complexVectorPtr += 32;
-    complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
-
-    iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(complexVal));
-    iFloatValue = _mm256_cvtepi32_ps(iIntVal);
-    iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
-    _mm256_store_ps(iBufferPtr, iFloatValue);
-    iBufferPtr += 8;
-
-    complexVal = _mm256_permute4x64_epi64(complexVal, 0b11000110);
-    iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(complexVal));
-    iFloatValue = _mm256_cvtepi32_ps(iIntVal);
-    iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
-    _mm256_store_ps(iBufferPtr, iFloatValue);
-    iBufferPtr += 8;
-  }
-
-  number = sixteenthPoints * 16;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
-    complexVectorPtr++;
-  }
-
+    float* iBufferPtr = iBuffer;
+
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+    __m256 iFloatValue;
+
+    const float iScalar = 1.0 / scalar;
+    __m256 invScalar = _mm256_set1_ps(iScalar);
+    __m256i complexVal, iIntVal;
+    int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+    __m256i moveMask = _mm256_set_epi8(0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       14,
+                                       12,
+                                       10,
+                                       8,
+                                       6,
+                                       4,
+                                       2,
+                                       0,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       14,
+                                       12,
+                                       10,
+                                       8,
+                                       6,
+                                       4,
+                                       2,
+                                       0);
+    for (; number < sixteenthPoints; number++) {
+        complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 32;
+        complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
+
+        iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(complexVal));
+        iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+        iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+        _mm256_store_ps(iBufferPtr, iFloatValue);
+        iBufferPtr += 8;
+
+        complexVal = _mm256_permute4x64_epi64(complexVal, 0b11000110);
+        iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(complexVal));
+        iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+        iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+        _mm256_store_ps(iBufferPtr, iFloatValue);
+        iBufferPtr += 8;
+    }
+
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
@@ -114,52 +143,55 @@ volk_8ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer, const lv_8sc_t* compl
  #include <smmintrin.h>
  
  static inline void
-volk_8ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_8sc_t* complexVector,
-                                             const float scalar, unsigned int num_points)
+volk_8ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer,
+                                             const lv_8sc_t* complexVector,
+                                             const float scalar,
+                                             unsigned int num_points)
  {
-  float* iBufferPtr = iBuffer;
-
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;
-  __m128 iFloatValue;
+    float* iBufferPtr = iBuffer;
  
-  const float iScalar= 1.0 / scalar;
-  __m128 invScalar = _mm_set_ps1(iScalar);
-  __m128i complexVal, iIntVal;
-  int8_t* complexVectorPtr = (int8_t*)complexVector;
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+    __m128 iFloatValue;
  
-  __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+    const float iScalar = 1.0 / scalar;
+    __m128 invScalar = _mm_set_ps1(iScalar);
+    __m128i complexVal, iIntVal;
+    int8_t* complexVectorPtr = (int8_t*)complexVector;
  
-  for(;number < eighthPoints; number++){
-    complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
-    complexVal = _mm_shuffle_epi8(complexVal, moveMask);
+    __m128i moveMask = _mm_set_epi8(
+        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
  
-    iIntVal = _mm_cvtepi8_epi32(complexVal);
-    iFloatValue = _mm_cvtepi32_ps(iIntVal);
+    for (; number < eighthPoints; number++) {
+        complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
+        complexVectorPtr += 16;
+        complexVal = _mm_shuffle_epi8(complexVal, moveMask);
  
-    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+        iIntVal = _mm_cvtepi8_epi32(complexVal);
+        iFloatValue = _mm_cvtepi32_ps(iIntVal);
  
-    _mm_store_ps(iBufferPtr, iFloatValue);
+        iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
  
-    iBufferPtr += 4;
+        _mm_store_ps(iBufferPtr, iFloatValue);
  
-    complexVal = _mm_srli_si128(complexVal, 4);
-    iIntVal = _mm_cvtepi8_epi32(complexVal);
-    iFloatValue = _mm_cvtepi32_ps(iIntVal);
+        iBufferPtr += 4;
  
-    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+        complexVal = _mm_srli_si128(complexVal, 4);
+        iIntVal = _mm_cvtepi8_epi32(complexVal);
+        iFloatValue = _mm_cvtepi32_ps(iIntVal);
  
-    _mm_store_ps(iBufferPtr, iFloatValue);
+        iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
  
-    iBufferPtr += 4;
-  }
+        _mm_store_ps(iBufferPtr, iFloatValue);
  
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
-    complexVectorPtr++;
-  }
+        iBufferPtr += 4;
+    }
  
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_SSE4_1 */
  
@@ -168,42 +200,47 @@ volk_8ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_8sc_t* com
  #include <xmmintrin.h>
  
  static inline void
-volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, const lv_8sc_t* complexVector,
-                                          const float scalar, unsigned int num_points)
+volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer,
+                                          const lv_8sc_t* complexVector,
+                                          const float scalar,
+                                          unsigned int num_points)
  {
-  float* iBufferPtr = iBuffer;
-
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-  __m128 iValue;
+    float* iBufferPtr = iBuffer;
  
-  const float iScalar= 1.0 / scalar;
-  __m128 invScalar = _mm_set_ps1(iScalar);
-  int8_t* complexVectorPtr = (int8_t*)complexVector;
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+    __m128 iValue;
  
-  __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+    const float iScalar = 1.0 / scalar;
+    __m128 invScalar = _mm_set_ps1(iScalar);
+    int8_t* complexVectorPtr = (int8_t*)complexVector;
  
-  for(;number < quarterPoints; number++){
-    floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2;
-    floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2;
-    floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2;
-    floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+    __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
  
-    iValue = _mm_load_ps(floatBuffer);
+    for (; number < quarterPoints; number++) {
+        floatBuffer[0] = (float)(*complexVectorPtr);
+        complexVectorPtr += 2;
+        floatBuffer[1] = (float)(*complexVectorPtr);
+        complexVectorPtr += 2;
+        floatBuffer[2] = (float)(*complexVectorPtr);
+        complexVectorPtr += 2;
+        floatBuffer[3] = (float)(*complexVectorPtr);
+        complexVectorPtr += 2;
  
-    iValue = _mm_mul_ps(iValue, invScalar);
+        iValue = _mm_load_ps(floatBuffer);
  
-    _mm_store_ps(iBufferPtr, iValue);
+        iValue = _mm_mul_ps(iValue, invScalar);
  
-    iBufferPtr += 4;
-  }
+        _mm_store_ps(iBufferPtr, iValue);
  
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
-    complexVectorPtr++;
-  }
+        iBufferPtr += 4;
+    }
  
+    number = quarterPoints * 4;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_SSE */
  
@@ -211,83 +248,117 @@ volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, const lv_8sc_t* comple
  #ifdef LV_HAVE_GENERIC
  
  static inline void
-volk_8ic_s32f_deinterleave_real_32f_generic(float* iBuffer, const lv_8sc_t* complexVector,
-                                            const float scalar, unsigned int num_points)
+volk_8ic_s32f_deinterleave_real_32f_generic(float* iBuffer,
+                                            const lv_8sc_t* complexVector,
+                                            const float scalar,
+                                            unsigned int num_points)
  {
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
-  float* iBufferPtr = iBuffer;
-  const float invScalar = 1.0 / scalar;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
-    complexVectorPtr++;
-  }
+    unsigned int number = 0;
+    const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+    float* iBufferPtr = iBuffer;
+    const float invScalar = 1.0 / scalar;
+    for (number = 0; number < num_points; number++) {
+        *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  
-
  #endif /* INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H */
  
  #ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_u_H
  #define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_u_H
  
-#include <volk/volk_common.h>
  #include <inttypes.h>
  #include <stdio.h>
+#include <volk/volk_common.h>
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
  static inline void
-volk_8ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer, const lv_8sc_t* complexVector,
-                                             const float scalar, unsigned int num_points)
+volk_8ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer,
+                                           const lv_8sc_t* complexVector,
+                                           const float scalar,
+                                           unsigned int num_points)
  {
-  float* iBufferPtr = iBuffer;
-
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-  __m256 iFloatValue;
-
-  const float iScalar= 1.0 / scalar;
-  __m256 invScalar = _mm256_set1_ps(iScalar);
-  __m256i complexVal, iIntVal;
-  __m128i hcomplexVal;
-  int8_t* complexVectorPtr = (int8_t*)complexVector;
-
-  __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
-
-  for(;number < sixteenthPoints; number++){
-    complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
-    complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
-
-    hcomplexVal = _mm256_extracti128_si256(complexVal,0);
-    iIntVal = _mm256_cvtepi8_epi32(hcomplexVal);
-    iFloatValue = _mm256_cvtepi32_ps(iIntVal);
-
-    iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
-
-    _mm256_storeu_ps(iBufferPtr, iFloatValue);
-
-    iBufferPtr += 8;
-
-    hcomplexVal = _mm256_extracti128_si256(complexVal,1);
-    iIntVal = _mm256_cvtepi8_epi32(hcomplexVal);
-    iFloatValue = _mm256_cvtepi32_ps(iIntVal);
-
-    iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
-
-    _mm256_storeu_ps(iBufferPtr, iFloatValue);
-
-    iBufferPtr += 8;
-  }
-
-  number = sixteenthPoints * 16;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
-    complexVectorPtr++;
-  }
-
+    float* iBufferPtr = iBuffer;
+
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+    __m256 iFloatValue;
+
+    const float iScalar = 1.0 / scalar;
+    __m256 invScalar = _mm256_set1_ps(iScalar);
+    __m256i complexVal, iIntVal;
+    __m128i hcomplexVal;
+    int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+    __m256i moveMask = _mm256_set_epi8(0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       14,
+                                       12,
+                                       10,
+                                       8,
+                                       6,
+                                       4,
+                                       2,
+                                       0,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       0x80,
+                                       14,
+                                       12,
+                                       10,
+                                       8,
+                                       6,
+                                       4,
+                                       2,
+                                       0);
+
+    for (; number < sixteenthPoints; number++) {
+        complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+        complexVectorPtr += 32;
+        complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
+
+        hcomplexVal = _mm256_extracti128_si256(complexVal, 0);
+        iIntVal = _mm256_cvtepi8_epi32(hcomplexVal);
+        iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+
+        iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+
+        _mm256_storeu_ps(iBufferPtr, iFloatValue);
+
+        iBufferPtr += 8;
+
+        hcomplexVal = _mm256_extracti128_si256(complexVal, 1);
+        iIntVal = _mm256_cvtepi8_epi32(hcomplexVal);
+        iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+
+        iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+
+        _mm256_storeu_ps(iBufferPtr, iFloatValue);
+
+        iBufferPtr += 8;
+    }
+
+    number = sixteenthPoints * 16;
+    for (; number < num_points; number++) {
+        *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+        complexVectorPtr++;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
diff --git a/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h b/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h

index 6762658276060b7f3c4793e35d3ee24f3a08fcc7..7f9fd96d02045d3b7e595ab34e69ffcaa40c104d 100644 (file)
--- a/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h
+++ b/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h
@@ -30,64 +30,73 @@
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  /*!
-  \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
-  \param cVector The complex vector where the results will be stored
-  \param aVector One of the complex vectors to be multiplied
-  \param bVector The complex vector which will be converted to complex conjugate and multiplied
-  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  \brief Multiplys the one complex vector with the complex conjugate of the second complex
+  vector and stores their results in the third vector \param cVector The complex vector
+  where the results will be stored \param aVector One of the complex vectors to be
+  multiplied \param bVector The complex vector which will be converted to complex
+  conjugate and multiplied \param num_points The number of complex values in aVector and
+  bVector to be multiplied together and stored into cVector
  */
-static inline void volk_8ic_x2_multiply_conjugate_16ic_a_avx2(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 8;
-
-  __m256i x, y, realz, imagz;
-  lv_16sc_t* c = cVector;
-  const lv_8sc_t* a = aVector;
-  const lv_8sc_t* b = bVector;
-  __m256i conjugateSign = _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
-
-  for(;number < quarterPoints; number++){
-    // Convert 8 bit values into 16 bit values
-    x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
-    y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));
-
-    // Calculate the ar*cr - ai*(-ci) portions
-    realz = _mm256_madd_epi16(x,y);
-
-    // Calculate the complex conjugate of the cr + ci j values
-    y = _mm256_sign_epi16(y, conjugateSign);
-
-    // Shift the order of the cr and ci values
-    y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1));
-
-    // Calculate the ar*(-ci) + cr*(ai)
-    imagz = _mm256_madd_epi16(x,y);
-
-    // Perform the addition of products
-
-    _mm256_store_si256((__m256i*)c, _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz), _mm256_unpackhi_epi32(realz, imagz)));
-
-    a += 8;
-    b += 8;
-    c += 8;
-  }
-
-  number = quarterPoints * 8;
-  int16_t* c16Ptr = (int16_t*)&cVector[number];
-  int8_t* a8Ptr = (int8_t*)&aVector[number];
-  int8_t* b8Ptr = (int8_t*)&bVector[number];
-  for(; number < num_points; number++){
-    float aReal =  (float)*a8Ptr++;
-    float aImag =  (float)*a8Ptr++;
-    lv_32fc_t aVal = lv_cmake(aReal, aImag );
-    float bReal = (float)*b8Ptr++;
-    float bImag = (float)*b8Ptr++;
-    lv_32fc_t bVal = lv_cmake( bReal, -bImag );
-    lv_32fc_t temp = aVal * bVal;
-
-    *c16Ptr++ = (int16_t)lv_creal(temp);
-    *c16Ptr++ = (int16_t)lv_cimag(temp);
-  }
+static inline void volk_8ic_x2_multiply_conjugate_16ic_a_avx2(lv_16sc_t* cVector,
+                                                              const lv_8sc_t* aVector,
+                                                              const lv_8sc_t* bVector,
+                                                              unsigned int num_points)
+{
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 8;
+
+    __m256i x, y, realz, imagz;
+    lv_16sc_t* c = cVector;
+    const lv_8sc_t* a = aVector;
+    const lv_8sc_t* b = bVector;
+    __m256i conjugateSign =
+        _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
+
+    for (; number < quarterPoints; number++) {
+        // Convert 8 bit values into 16 bit values
+        x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
+        y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));
+
+        // Calculate the ar*cr - ai*(-ci) portions
+        realz = _mm256_madd_epi16(x, y);
+
+        // Calculate the complex conjugate of the cr + ci j values
+        y = _mm256_sign_epi16(y, conjugateSign);
+
+        // Shift the order of the cr and ci values
+        y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
+                                   _MM_SHUFFLE(2, 3, 0, 1));
+
+        // Calculate the ar*(-ci) + cr*(ai)
+        imagz = _mm256_madd_epi16(x, y);
+
+        // Perform the addition of products
+
+        _mm256_store_si256((__m256i*)c,
+                           _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz),
+                                              _mm256_unpackhi_epi32(realz, imagz)));
+
+        a += 8;
+        b += 8;
+        c += 8;
+    }
+
+    number = quarterPoints * 8;
+    int16_t* c16Ptr = (int16_t*)&cVector[number];
+    int8_t* a8Ptr = (int8_t*)&aVector[number];
+    int8_t* b8Ptr = (int8_t*)&bVector[number];
+    for (; number < num_points; number++) {
+        float aReal = (float)*a8Ptr++;
+        float aImag = (float)*a8Ptr++;
+        lv_32fc_t aVal = lv_cmake(aReal, aImag);
+        float bReal = (float)*b8Ptr++;
+        float bImag = (float)*b8Ptr++;
+        lv_32fc_t bVal = lv_cmake(bReal, -bImag);
+        lv_32fc_t temp = aVal * bVal;
+
+        *c16Ptr++ = (int16_t)lv_creal(temp);
+        *c16Ptr++ = (int16_t)lv_cimag(temp);
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
@@ -95,90 +104,103 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_a_avx2(lv_16sc_t* cVector
  #ifdef LV_HAVE_SSE4_1
  #include <smmintrin.h>
  /*!
-  \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
-  \param cVector The complex vector where the results will be stored
-  \param aVector One of the complex vectors to be multiplied
-  \param bVector The complex vector which will be converted to complex conjugate and multiplied
-  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  \brief Multiplys the one complex vector with the complex conjugate of the second complex
+  vector and stores their results in the third vector \param cVector The complex vector
+  where the results will be stored \param aVector One of the complex vectors to be
+  multiplied \param bVector The complex vector which will be converted to complex
+  conjugate and multiplied \param num_points The number of complex values in aVector and
+  bVector to be multiplied together and stored into cVector
  */
-static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  __m128i x, y, realz, imagz;
-  lv_16sc_t* c = cVector;
-  const lv_8sc_t* a = aVector;
-  const lv_8sc_t* b = bVector;
-  __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
-
-  for(;number < quarterPoints; number++){
-    // Convert into 8 bit values into 16 bit values
-    x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
-    y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
-
-    // Calculate the ar*cr - ai*(-ci) portions
-    realz = _mm_madd_epi16(x,y);
-
-    // Calculate the complex conjugate of the cr + ci j values
-    y = _mm_sign_epi16(y, conjugateSign);
-
-    // Shift the order of the cr and ci values
-    y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1));
-
-    // Calculate the ar*(-ci) + cr*(ai)
-    imagz = _mm_madd_epi16(x,y);
-
-    _mm_store_si128((__m128i*)c, _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), _mm_unpackhi_epi32(realz, imagz)));
-
-    a += 4;
-    b += 4;
-    c += 4;
-  }
-
-  number = quarterPoints * 4;
-  int16_t* c16Ptr = (int16_t*)&cVector[number];
-  int8_t* a8Ptr = (int8_t*)&aVector[number];
-  int8_t* b8Ptr = (int8_t*)&bVector[number];
-  for(; number < num_points; number++){
-    float aReal =  (float)*a8Ptr++;
-    float aImag =  (float)*a8Ptr++;
-    lv_32fc_t aVal = lv_cmake(aReal, aImag );
-    float bReal = (float)*b8Ptr++;
-    float bImag = (float)*b8Ptr++;
-    lv_32fc_t bVal = lv_cmake( bReal, -bImag );
-    lv_32fc_t temp = aVal * bVal;
-
-    *c16Ptr++ = (int16_t)lv_creal(temp);
-    *c16Ptr++ = (int16_t)lv_cimag(temp);
-  }
+static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVector,
+                                                                const lv_8sc_t* aVector,
+                                                                const lv_8sc_t* bVector,
+                                                                unsigned int num_points)
+{
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    __m128i x, y, realz, imagz;
+    lv_16sc_t* c = cVector;
+    const lv_8sc_t* a = aVector;
+    const lv_8sc_t* b = bVector;
+    __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
+
+    for (; number < quarterPoints; number++) {
+        // Convert into 8 bit values into 16 bit values
+        x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
+        y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
+
+        // Calculate the ar*cr - ai*(-ci) portions
+        realz = _mm_madd_epi16(x, y);
+
+        // Calculate the complex conjugate of the cr + ci j values
+        y = _mm_sign_epi16(y, conjugateSign);
+
+        // Shift the order of the cr and ci values
+        y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
+                                _MM_SHUFFLE(2, 3, 0, 1));
+
+        // Calculate the ar*(-ci) + cr*(ai)
+        imagz = _mm_madd_epi16(x, y);
+
+        _mm_store_si128((__m128i*)c,
+                        _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz),
+                                        _mm_unpackhi_epi32(realz, imagz)));
+
+        a += 4;
+        b += 4;
+        c += 4;
+    }
+
+    number = quarterPoints * 4;
+    int16_t* c16Ptr = (int16_t*)&cVector[number];
+    int8_t* a8Ptr = (int8_t*)&aVector[number];
+    int8_t* b8Ptr = (int8_t*)&bVector[number];
+    for (; number < num_points; number++) {
+        float aReal = (float)*a8Ptr++;
+        float aImag = (float)*a8Ptr++;
+        lv_32fc_t aVal = lv_cmake(aReal, aImag);
+        float bReal = (float)*b8Ptr++;
+        float bImag = (float)*b8Ptr++;
+        lv_32fc_t bVal = lv_cmake(bReal, -bImag);
+        lv_32fc_t temp = aVal * bVal;
+
+        *c16Ptr++ = (int16_t)lv_creal(temp);
+        *c16Ptr++ = (int16_t)lv_cimag(temp);
+    }
  }
  #endif /* LV_HAVE_SSE4_1 */
  
  #ifdef LV_HAVE_GENERIC
  /*!
-  \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
-  \param cVector The complex vector where the results will be stored
-  \param aVector One of the complex vectors to be multiplied
-  \param bVector The complex vector which will be converted to complex conjugate and multiplied
-  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  \brief Multiplys the one complex vector with the complex conjugate of the second complex
+  vector and stores their results in the third vector \param cVector The complex vector
+  where the results will be stored \param aVector One of the complex vectors to be
+  multiplied \param bVector The complex vector which will be converted to complex
+  conjugate and multiplied \param num_points The number of complex values in aVector and
+  bVector to be multiplied together and stored into cVector
  */
-static inline void volk_8ic_x2_multiply_conjugate_16ic_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
-  unsigned int number = 0;
-  int16_t* c16Ptr = (int16_t*)cVector;
-  int8_t* a8Ptr = (int8_t*)aVector;
-  int8_t* b8Ptr = (int8_t*)bVector;
-  for(number =0; number < num_points; number++){
-    float aReal =  (float)*a8Ptr++;
-    float aImag =  (float)*a8Ptr++;
-    lv_32fc_t aVal = lv_cmake(aReal, aImag );
-    float bReal = (float)*b8Ptr++;
-    float bImag = (float)*b8Ptr++;
-    lv_32fc_t bVal = lv_cmake( bReal, -bImag );
-    lv_32fc_t temp = aVal * bVal;
-
-    *c16Ptr++ = (int16_t)lv_creal(temp);
-    *c16Ptr++ = (int16_t)lv_cimag(temp);
-  }
+static inline void volk_8ic_x2_multiply_conjugate_16ic_generic(lv_16sc_t* cVector,
+                                                               const lv_8sc_t* aVector,
+                                                               const lv_8sc_t* bVector,
+                                                               unsigned int num_points)
+{
+    unsigned int number = 0;
+    int16_t* c16Ptr = (int16_t*)cVector;
+    int8_t* a8Ptr = (int8_t*)aVector;
+    int8_t* b8Ptr = (int8_t*)bVector;
+    for (number = 0; number < num_points; number++) {
+        float aReal = (float)*a8Ptr++;
+        float aImag = (float)*a8Ptr++;
+        lv_32fc_t aVal = lv_cmake(aReal, aImag);
+        float bReal = (float)*b8Ptr++;
+        float bImag = (float)*b8Ptr++;
+        lv_32fc_t bVal = lv_cmake(bReal, -bImag);
+        lv_32fc_t temp = aVal * bVal;
+
+        *c16Ptr++ = (int16_t)lv_creal(temp);
+        *c16Ptr++ = (int16_t)lv_cimag(temp);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -194,64 +216,73 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_generic(lv_16sc_t* cVecto
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  /*!
-  \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
-  \param cVector The complex vector where the results will be stored
-  \param aVector One of the complex vectors to be multiplied
-  \param bVector The complex vector which will be converted to complex conjugate and multiplied
-  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  \brief Multiplys the one complex vector with the complex conjugate of the second complex
+  vector and stores their results in the third vector \param cVector The complex vector
+  where the results will be stored \param aVector One of the complex vectors to be
+  multiplied \param bVector The complex vector which will be converted to complex
+  conjugate and multiplied \param num_points The number of complex values in aVector and
+  bVector to be multiplied together and stored into cVector
  */
-static inline void volk_8ic_x2_multiply_conjugate_16ic_u_avx2(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int oneEigthPoints = num_points / 8;
-
-  __m256i x, y, realz, imagz;
-  lv_16sc_t* c = cVector;
-  const lv_8sc_t* a = aVector;
-  const lv_8sc_t* b = bVector;
-  __m256i conjugateSign = _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
-
-  for(;number < oneEigthPoints; number++){
-    // Convert 8 bit values into 16 bit values
-    x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
-    y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));
-
-    // Calculate the ar*cr - ai*(-ci) portions
-    realz = _mm256_madd_epi16(x,y);
-
-    // Calculate the complex conjugate of the cr + ci j values
-    y = _mm256_sign_epi16(y, conjugateSign);
-
-    // Shift the order of the cr and ci values
-    y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1));
-
-    // Calculate the ar*(-ci) + cr*(ai)
-    imagz = _mm256_madd_epi16(x,y);
-
-    // Perform the addition of products
-
-    _mm256_storeu_si256((__m256i*)c, _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz), _mm256_unpackhi_epi32(realz, imagz)));
-
-    a += 8;
-    b += 8;
-    c += 8;
-  }
-
-  number = oneEigthPoints * 8;
-  int16_t* c16Ptr = (int16_t*)&cVector[number];
-  int8_t* a8Ptr = (int8_t*)&aVector[number];
-  int8_t* b8Ptr = (int8_t*)&bVector[number];
-  for(; number < num_points; number++){
-    float aReal =  (float)*a8Ptr++;
-    float aImag =  (float)*a8Ptr++;
-    lv_32fc_t aVal = lv_cmake(aReal, aImag );
-    float bReal = (float)*b8Ptr++;
-    float bImag = (float)*b8Ptr++;
-    lv_32fc_t bVal = lv_cmake( bReal, -bImag );
-    lv_32fc_t temp = aVal * bVal;
-
-    *c16Ptr++ = (int16_t)lv_creal(temp);
-    *c16Ptr++ = (int16_t)lv_cimag(temp);
-  }
+static inline void volk_8ic_x2_multiply_conjugate_16ic_u_avx2(lv_16sc_t* cVector,
+                                                              const lv_8sc_t* aVector,
+                                                              const lv_8sc_t* bVector,
+                                                              unsigned int num_points)
+{
+    unsigned int number = 0;
+    const unsigned int oneEigthPoints = num_points / 8;
+
+    __m256i x, y, realz, imagz;
+    lv_16sc_t* c = cVector;
+    const lv_8sc_t* a = aVector;
+    const lv_8sc_t* b = bVector;
+    __m256i conjugateSign =
+        _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
+
+    for (; number < oneEigthPoints; number++) {
+        // Convert 8 bit values into 16 bit values
+        x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
+        y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));
+
+        // Calculate the ar*cr - ai*(-ci) portions
+        realz = _mm256_madd_epi16(x, y);
+
+        // Calculate the complex conjugate of the cr + ci j values
+        y = _mm256_sign_epi16(y, conjugateSign);
+
+        // Shift the order of the cr and ci values
+        y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
+                                   _MM_SHUFFLE(2, 3, 0, 1));
+
+        // Calculate the ar*(-ci) + cr*(ai)
+        imagz = _mm256_madd_epi16(x, y);
+
+        // Perform the addition of products
+
+        _mm256_storeu_si256((__m256i*)c,
+                            _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz),
+                                               _mm256_unpackhi_epi32(realz, imagz)));
+
+        a += 8;
+        b += 8;
+        c += 8;
+    }
+
+    number = oneEigthPoints * 8;
+    int16_t* c16Ptr = (int16_t*)&cVector[number];
+    int8_t* a8Ptr = (int8_t*)&aVector[number];
+    int8_t* b8Ptr = (int8_t*)&bVector[number];
+    for (; number < num_points; number++) {
+        float aReal = (float)*a8Ptr++;
+        float aImag = (float)*a8Ptr++;
+        lv_32fc_t aVal = lv_cmake(aReal, aImag);
+        float bReal = (float)*b8Ptr++;
+        float bImag = (float)*b8Ptr++;
+        lv_32fc_t bVal = lv_cmake(bReal, -bImag);
+        lv_32fc_t temp = aVal * bVal;
+
+        *c16Ptr++ = (int16_t)lv_creal(temp);
+        *c16Ptr++ = (int16_t)lv_cimag(temp);
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
diff --git a/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h b/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h

index 82e40c8576d9bd83c64f20127fad5f32557edede..db6bd7a1ccd5bc7e513bfc6f6f7e5b17da992c52 100644 (file)
--- a/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h
+++ b/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h
@@ -30,14 +30,15 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_8ic_x2_s32f_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points)
- * \endcode
+ * void volk_8ic_x2_s32f_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_8sc_t*
+ * aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points) \endcode
   *
   * \b Inputs
   * \li aVector: One of the complex vectors to be multiplied.
- * \li bVector: The complex vector which will be converted to complex conjugate and multiplied.
- * \li scalar: each output value is scaled by 1/scalar.
- * \li num_points: The number of complex values in aVector and bVector to be multiplied together and stored into cVector.
+ * \li bVector: The complex vector which will be converted to complex conjugate and
+ * multiplied. \li scalar: each output value is scaled by 1/scalar. \li num_points: The
+ * number of complex values in aVector and bVector to be multiplied together and stored
+ * into cVector.
   *
   * \b Outputs
   * \li cVector: The complex vector where the results will be stored.
@@ -64,160 +65,167 @@
  #include <immintrin.h>
  
  static inline void
-volk_8ic_x2_s32f_multiply_conjugate_32fc_a_avx2(lv_32fc_t* cVector, const lv_8sc_t* aVector,
-                                                  const lv_8sc_t* bVector, const float scalar,
-                                                  unsigned int num_points)
+volk_8ic_x2_s32f_multiply_conjugate_32fc_a_avx2(lv_32fc_t* cVector,
+                                                const lv_8sc_t* aVector,
+                                                const lv_8sc_t* bVector,
+                                                const float scalar,
+                                                unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int oneEigthPoints = num_points / 8;
-
-  __m256i x, y, realz, imagz;
-  __m256 ret, retlo, rethi;
-  lv_32fc_t* c = cVector;
-  const lv_8sc_t* a = aVector;
-  const lv_8sc_t* b = bVector;
-  __m256i conjugateSign = _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
-
-  __m256 invScalar = _mm256_set1_ps(1.0/scalar);
-
-  for(;number < oneEigthPoints; number++){
-    // Convert  8 bit values into 16 bit values
-    x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
-    y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));
-
-    // Calculate the ar*cr - ai*(-ci) portions
-    realz = _mm256_madd_epi16(x,y);
-
-    // Calculate the complex conjugate of the cr + ci j values
-    y = _mm256_sign_epi16(y, conjugateSign);
-
-    // Shift the order of the cr and ci values
-    y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1));
-
-    // Calculate the ar*(-ci) + cr*(ai)
-    imagz = _mm256_madd_epi16(x,y);
-
-    // Interleave real and imaginary and then convert to float values
-    retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
-
-    // Normalize the floating point values
-    retlo = _mm256_mul_ps(retlo, invScalar);
-
-    // Interleave real and imaginary and then convert to float values
-    rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
-
-    // Normalize the floating point values
-    rethi = _mm256_mul_ps(rethi, invScalar);
-
-    ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
-    _mm256_store_ps((float*)c, ret);
-    c += 4;
-
-    ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
-    _mm256_store_ps((float*)c, ret);
-    c += 4;
-
-    a += 8;
-    b += 8;
-  }
-
-  number = oneEigthPoints * 8;
-  float* cFloatPtr = (float*)&cVector[number];
-  int8_t* a8Ptr = (int8_t*)&aVector[number];
-  int8_t* b8Ptr = (int8_t*)&bVector[number];
-  for(; number < num_points; number++){
-    float aReal =  (float)*a8Ptr++;
-    float aImag =  (float)*a8Ptr++;
-    lv_32fc_t aVal = lv_cmake(aReal, aImag );
-    float bReal = (float)*b8Ptr++;
-    float bImag = (float)*b8Ptr++;
-    lv_32fc_t bVal = lv_cmake( bReal, -bImag );
-    lv_32fc_t temp = aVal * bVal;
-
-    *cFloatPtr++ = lv_creal(temp) / scalar;
-    *cFloatPtr++ = lv_cimag(temp) / scalar;
-  }
+    unsigned int number = 0;
+    const unsigned int oneEigthPoints = num_points / 8;
+
+    __m256i x, y, realz, imagz;
+    __m256 ret, retlo, rethi;
+    lv_32fc_t* c = cVector;
+    const lv_8sc_t* a = aVector;
+    const lv_8sc_t* b = bVector;
+    __m256i conjugateSign =
+        _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
+
+    __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
+
+    for (; number < oneEigthPoints; number++) {
+        // Convert  8 bit values into 16 bit values
+        x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
+        y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));
+
+        // Calculate the ar*cr - ai*(-ci) portions
+        realz = _mm256_madd_epi16(x, y);
+
+        // Calculate the complex conjugate of the cr + ci j values
+        y = _mm256_sign_epi16(y, conjugateSign);
+
+        // Shift the order of the cr and ci values
+        y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
+                                   _MM_SHUFFLE(2, 3, 0, 1));
+
+        // Calculate the ar*(-ci) + cr*(ai)
+        imagz = _mm256_madd_epi16(x, y);
+
+        // Interleave real and imaginary and then convert to float values
+        retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
+
+        // Normalize the floating point values
+        retlo = _mm256_mul_ps(retlo, invScalar);
+
+        // Interleave real and imaginary and then convert to float values
+        rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
+
+        // Normalize the floating point values
+        rethi = _mm256_mul_ps(rethi, invScalar);
+
+        ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
+        _mm256_store_ps((float*)c, ret);
+        c += 4;
+
+        ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
+        _mm256_store_ps((float*)c, ret);
+        c += 4;
+
+        a += 8;
+        b += 8;
+    }
+
+    number = oneEigthPoints * 8;
+    float* cFloatPtr = (float*)&cVector[number];
+    int8_t* a8Ptr = (int8_t*)&aVector[number];
+    int8_t* b8Ptr = (int8_t*)&bVector[number];
+    for (; number < num_points; number++) {
+        float aReal = (float)*a8Ptr++;
+        float aImag = (float)*a8Ptr++;
+        lv_32fc_t aVal = lv_cmake(aReal, aImag);
+        float bReal = (float)*b8Ptr++;
+        float bImag = (float)*b8Ptr++;
+        lv_32fc_t bVal = lv_cmake(bReal, -bImag);
+        lv_32fc_t temp = aVal * bVal;
+
+        *cFloatPtr++ = lv_creal(temp) / scalar;
+        *cFloatPtr++ = lv_cimag(temp) / scalar;
+    }
  }
-#endif  /* LV_HAVE_AVX2*/
+#endif /* LV_HAVE_AVX2*/
  
  
  #ifdef LV_HAVE_SSE4_1
  #include <smmintrin.h>
  
  static inline void
-volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector,
-                                                  const lv_8sc_t* bVector, const float scalar,
+volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector,
+                                                  const lv_8sc_t* aVector,
+                                                  const lv_8sc_t* bVector,
+                                                  const float scalar,
                                                    unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  __m128i x, y, realz, imagz;
-  __m128 ret;
-  lv_32fc_t* c = cVector;
-  const lv_8sc_t* a = aVector;
-  const lv_8sc_t* b = bVector;
-  __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
-
-  __m128 invScalar = _mm_set_ps1(1.0/scalar);
-
-  for(;number < quarterPoints; number++){
-    // Convert into 8 bit values into 16 bit values
-    x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
-    y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
-
-    // Calculate the ar*cr - ai*(-ci) portions
-    realz = _mm_madd_epi16(x,y);
-
-    // Calculate the complex conjugate of the cr + ci j values
-    y = _mm_sign_epi16(y, conjugateSign);
-
-    // Shift the order of the cr and ci values
-    y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1));
-
-    // Calculate the ar*(-ci) + cr*(ai)
-    imagz = _mm_madd_epi16(x,y);
-
-    // Interleave real and imaginary and then convert to float values
-    ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz));
-
-    // Normalize the floating point values
-    ret = _mm_mul_ps(ret, invScalar);
-
-    // Store the floating point values
-    _mm_store_ps((float*)c, ret);
-    c += 2;
-
-    // Interleave real and imaginary and then convert to float values
-    ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz));
-
-    // Normalize the floating point values
-    ret = _mm_mul_ps(ret, invScalar);
-
-    // Store the floating point values
-    _mm_store_ps((float*)c, ret);
-    c += 2;
-
-    a += 4;
-    b += 4;
-  }
-
-  number = quarterPoints * 4;
-  float* cFloatPtr = (float*)&cVector[number];
-  int8_t* a8Ptr = (int8_t*)&aVector[number];
-  int8_t* b8Ptr = (int8_t*)&bVector[number];
-  for(; number < num_points; number++){
-    float aReal =  (float)*a8Ptr++;
-    float aImag =  (float)*a8Ptr++;
-    lv_32fc_t aVal = lv_cmake(aReal, aImag );
-    float bReal = (float)*b8Ptr++;
-    float bImag = (float)*b8Ptr++;
-    lv_32fc_t bVal = lv_cmake( bReal, -bImag );
-    lv_32fc_t temp = aVal * bVal;
-
-    *cFloatPtr++ = lv_creal(temp) / scalar;
-    *cFloatPtr++ = lv_cimag(temp) / scalar;
-  }
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    __m128i x, y, realz, imagz;
+    __m128 ret;
+    lv_32fc_t* c = cVector;
+    const lv_8sc_t* a = aVector;
+    const lv_8sc_t* b = bVector;
+    __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
+
+    __m128 invScalar = _mm_set_ps1(1.0 / scalar);
+
+    for (; number < quarterPoints; number++) {
+        // Convert into 8 bit values into 16 bit values
+        x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
+        y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
+
+        // Calculate the ar*cr - ai*(-ci) portions
+        realz = _mm_madd_epi16(x, y);
+
+        // Calculate the complex conjugate of the cr + ci j values
+        y = _mm_sign_epi16(y, conjugateSign);
+
+        // Shift the order of the cr and ci values
+        y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
+                                _MM_SHUFFLE(2, 3, 0, 1));
+
+        // Calculate the ar*(-ci) + cr*(ai)
+        imagz = _mm_madd_epi16(x, y);
+
+        // Interleave real and imaginary and then convert to float values
+        ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz));
+
+        // Normalize the floating point values
+        ret = _mm_mul_ps(ret, invScalar);
+
+        // Store the floating point values
+        _mm_store_ps((float*)c, ret);
+        c += 2;
+
+        // Interleave real and imaginary and then convert to float values
+        ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz));
+
+        // Normalize the floating point values
+        ret = _mm_mul_ps(ret, invScalar);
+
+        // Store the floating point values
+        _mm_store_ps((float*)c, ret);
+        c += 2;
+
+        a += 4;
+        b += 4;
+    }
+
+    number = quarterPoints * 4;
+    float* cFloatPtr = (float*)&cVector[number];
+    int8_t* a8Ptr = (int8_t*)&aVector[number];
+    int8_t* b8Ptr = (int8_t*)&bVector[number];
+    for (; number < num_points; number++) {
+        float aReal = (float)*a8Ptr++;
+        float aImag = (float)*a8Ptr++;
+        lv_32fc_t aVal = lv_cmake(aReal, aImag);
+        float bReal = (float)*b8Ptr++;
+        float bImag = (float)*b8Ptr++;
+        lv_32fc_t bVal = lv_cmake(bReal, -bImag);
+        lv_32fc_t temp = aVal * bVal;
+
+        *cFloatPtr++ = lv_creal(temp) / scalar;
+        *cFloatPtr++ = lv_cimag(temp) / scalar;
+    }
  }
  #endif /* LV_HAVE_SSE4_1 */
  
@@ -225,27 +233,29 @@ volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector, const lv_8
  #ifdef LV_HAVE_GENERIC
  
  static inline void
-volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector,
-                                                 const lv_8sc_t* bVector, const float scalar,
+volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t* cVector,
+                                                 const lv_8sc_t* aVector,
+                                                 const lv_8sc_t* bVector,
+                                                 const float scalar,
                                                   unsigned int num_points)
  {
-  unsigned int number = 0;
-  float* cPtr = (float*)cVector;
-  const float invScalar = 1.0 / scalar;
-  int8_t* a8Ptr = (int8_t*)aVector;
-  int8_t* b8Ptr = (int8_t*)bVector;
-  for(number = 0; number < num_points; number++){
-    float aReal =  (float)*a8Ptr++;
-    float aImag =  (float)*a8Ptr++;
-    lv_32fc_t aVal = lv_cmake(aReal, aImag );
-    float bReal = (float)*b8Ptr++;
-    float bImag = (float)*b8Ptr++;
-    lv_32fc_t bVal = lv_cmake( bReal, -bImag );
-    lv_32fc_t temp = aVal * bVal;
-
-    *cPtr++ = (lv_creal(temp) * invScalar);
-    *cPtr++ = (lv_cimag(temp) * invScalar);
-  }
+    unsigned int number = 0;
+    float* cPtr = (float*)cVector;
+    const float invScalar = 1.0 / scalar;
+    int8_t* a8Ptr = (int8_t*)aVector;
+    int8_t* b8Ptr = (int8_t*)bVector;
+    for (number = 0; number < num_points; number++) {
+        float aReal = (float)*a8Ptr++;
+        float aImag = (float)*a8Ptr++;
+        lv_32fc_t aVal = lv_cmake(aReal, aImag);
+        float bReal = (float)*b8Ptr++;
+        float bImag = (float)*b8Ptr++;
+        lv_32fc_t bVal = lv_cmake(bReal, -bImag);
+        lv_32fc_t temp = aVal * bVal;
+
+        *cPtr++ = (lv_creal(temp) * invScalar);
+        *cPtr++ = (lv_cimag(temp) * invScalar);
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -263,81 +273,85 @@ volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_8s
  #include <immintrin.h>
  
  static inline void
-volk_8ic_x2_s32f_multiply_conjugate_32fc_u_avx2(lv_32fc_t* cVector, const lv_8sc_t* aVector,
-                                                  const lv_8sc_t* bVector, const float scalar,
-                                                  unsigned int num_points)
+volk_8ic_x2_s32f_multiply_conjugate_32fc_u_avx2(lv_32fc_t* cVector,
+                                                const lv_8sc_t* aVector,
+                                                const lv_8sc_t* bVector,
+                                                const float scalar,
+                                                unsigned int num_points)
  {
-  unsigned int number = 0;
-  const unsigned int oneEigthPoints = num_points / 8;
-
-  __m256i x, y, realz, imagz;
-  __m256 ret, retlo, rethi;
-  lv_32fc_t* c = cVector;
-  const lv_8sc_t* a = aVector;
-  const lv_8sc_t* b = bVector;
-  __m256i conjugateSign = _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
-
-  __m256 invScalar = _mm256_set1_ps(1.0/scalar);
-
-  for(;number < oneEigthPoints; number++){
-    // Convert  8 bit values into 16 bit values
-    x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
-    y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));
-
-    // Calculate the ar*cr - ai*(-ci) portions
-    realz = _mm256_madd_epi16(x,y);
-
-    // Calculate the complex conjugate of the cr + ci j values
-    y = _mm256_sign_epi16(y, conjugateSign);
-
-    // Shift the order of the cr and ci values
-    y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1));
-
-    // Calculate the ar*(-ci) + cr*(ai)
-    imagz = _mm256_madd_epi16(x,y);
-
-    // Interleave real and imaginary and then convert to float values
-    retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
-
-    // Normalize the floating point values
-    retlo = _mm256_mul_ps(retlo, invScalar);
-
-    // Interleave real and imaginary and then convert to float values
-    rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
-
-    // Normalize the floating point values
-    rethi = _mm256_mul_ps(rethi, invScalar);
-
-    ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
-    _mm256_storeu_ps((float*)c, ret);
-    c += 4;
-
-    ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
-    _mm256_storeu_ps((float*)c, ret);
-    c += 4;
-
-    a += 8;
-    b += 8;
-  }
-
-  number = oneEigthPoints * 8;
-  float* cFloatPtr = (float*)&cVector[number];
-  int8_t* a8Ptr = (int8_t*)&aVector[number];
-  int8_t* b8Ptr = (int8_t*)&bVector[number];
-  for(; number < num_points; number++){
-    float aReal =  (float)*a8Ptr++;
-    float aImag =  (float)*a8Ptr++;
-    lv_32fc_t aVal = lv_cmake(aReal, aImag );
-    float bReal = (float)*b8Ptr++;
-    float bImag = (float)*b8Ptr++;
-    lv_32fc_t bVal = lv_cmake( bReal, -bImag );
-    lv_32fc_t temp = aVal * bVal;
-
-    *cFloatPtr++ = lv_creal(temp) / scalar;
-    *cFloatPtr++ = lv_cimag(temp) / scalar;
-  }
+    unsigned int number = 0;
+    const unsigned int oneEigthPoints = num_points / 8;
+
+    __m256i x, y, realz, imagz;
+    __m256 ret, retlo, rethi;
+    lv_32fc_t* c = cVector;
+    const lv_8sc_t* a = aVector;
+    const lv_8sc_t* b = bVector;
+    __m256i conjugateSign =
+        _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
+
+    __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
+
+    for (; number < oneEigthPoints; number++) {
+        // Convert  8 bit values into 16 bit values
+        x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
+        y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));
+
+        // Calculate the ar*cr - ai*(-ci) portions
+        realz = _mm256_madd_epi16(x, y);
+
+        // Calculate the complex conjugate of the cr + ci j values
+        y = _mm256_sign_epi16(y, conjugateSign);
+
+        // Shift the order of the cr and ci values
+        y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
+                                   _MM_SHUFFLE(2, 3, 0, 1));
+
+        // Calculate the ar*(-ci) + cr*(ai)
+        imagz = _mm256_madd_epi16(x, y);
+
+        // Interleave real and imaginary and then convert to float values
+        retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
+
+        // Normalize the floating point values
+        retlo = _mm256_mul_ps(retlo, invScalar);
+
+        // Interleave real and imaginary and then convert to float values
+        rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
+
+        // Normalize the floating point values
+        rethi = _mm256_mul_ps(rethi, invScalar);
+
+        ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
+        _mm256_storeu_ps((float*)c, ret);
+        c += 4;
+
+        ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
+        _mm256_storeu_ps((float*)c, ret);
+        c += 4;
+
+        a += 8;
+        b += 8;
+    }
+
+    number = oneEigthPoints * 8;
+    float* cFloatPtr = (float*)&cVector[number];
+    int8_t* a8Ptr = (int8_t*)&aVector[number];
+    int8_t* b8Ptr = (int8_t*)&bVector[number];
+    for (; number < num_points; number++) {
+        float aReal = (float)*a8Ptr++;
+        float aImag = (float)*a8Ptr++;
+        lv_32fc_t aVal = lv_cmake(aReal, aImag);
+        float bReal = (float)*b8Ptr++;
+        float bImag = (float)*b8Ptr++;
+        lv_32fc_t bVal = lv_cmake(bReal, -bImag);
+        lv_32fc_t temp = aVal * bVal;
+
+        *cFloatPtr++ = lv_creal(temp) / scalar;
+        *cFloatPtr++ = lv_cimag(temp) / scalar;
+    }
  }
-#endif  /* LV_HAVE_AVX2*/
+#endif /* LV_HAVE_AVX2*/
  
  
  #endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H */
diff --git a/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h b/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h

index 00f83de5293b062d9d5a296ec799b6becefbb6db..69287cdc4319df3c86340c2c8bcbe04aadb5a0b3 100644 (file)
--- a/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h
+++ b/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h
@@ -23,21 +23,21 @@
  #ifndef INCLUDED_volk_8u_conv_k7_r2puppet_8u_H
  #define INCLUDED_volk_8u_conv_k7_r2puppet_8u_H
  
+#include <string.h>
  #include <volk/volk.h>
  #include <volk/volk_8u_x4_conv_k7_r2_8u.h>
-#include <string.h>
  
  typedef union {
-  //decision_t is a BIT vector
-  unsigned char* t;
-  unsigned int* w;
+    // decision_t is a BIT vector
+    unsigned char* t;
+    unsigned int* w;
  } p_decision_t;
  
  static inline int parity(int x, unsigned char* Partab)
  {
-  x ^= (x >> 16);
-  x ^= (x >> 8);
-  return Partab[x];
+    x ^= (x >> 16);
+    x ^= (x >> 8);
+    return Partab[x];
  }
  
  static inline int chainback_viterbi(unsigned char* data,
@@ -46,135 +46,143 @@ static inline int chainback_viterbi(unsigned char* data,
                                      unsigned int tailsize,
                                      unsigned char* decisions)
  {
-  unsigned char* d;
-  int d_ADDSHIFT = 0;
-  int d_numstates = (1 << 6);
-  int d_decision_t_size = d_numstates/8;
-  unsigned int d_k = 7;
-  int d_framebits = nbits;
-  /* ADDSHIFT and SUBSHIFT make sure that the thing returned is a byte. */
-  d = decisions;
-  /* Make room beyond the end of the encoder register so we can
-   * accumulate a full byte of decoded data
-   */
-
-  endstate = (endstate%d_numstates) << d_ADDSHIFT;
-
-  /* The store into data[] only needs to be done every 8 bits.
-   * But this avoids a conditional branch, and the writes will
-   * combine in the cache anyway
-   */
-
-  d += tailsize * d_decision_t_size ; /* Look past tail */
-  int retval;
-  int dif = tailsize - (d_k - 1);
-  //printf("break, %d, %d\n", dif, (nbits+dif)%d_framebits);
-  p_decision_t dec;
-  while(nbits-- > d_framebits - (d_k - 1)) {
-    int k;
-    dec.t =  &d[nbits * d_decision_t_size];
-    k = (dec.w[(endstate>>d_ADDSHIFT)/32] >> ((endstate>>d_ADDSHIFT)%32)) & 1;
-
-    endstate = (endstate >> 1) | (k << (d_k-2+d_ADDSHIFT));
-    //data[((nbits+dif)%nbits)>>3] = endstate>>d_SUBSHIFT;
-    //printf("%d, %d\n", k, (nbits+dif)%d_framebits);
-    data[((nbits+dif)%d_framebits)] = k;
-
-    retval = endstate;
-  }
-  nbits += 1;
-
-  while(nbits-- != 0) {
-    int k;
-
-    dec.t = &d[nbits * d_decision_t_size];
-
-    k = (dec.w[(endstate>>d_ADDSHIFT)/32] >> ((endstate>>d_ADDSHIFT)%32)) & 1;
-
-    endstate = (endstate >> 1) | (k << (d_k-2+d_ADDSHIFT));
-    data[((nbits+dif)%d_framebits)] = k;
-  }
-  //printf("%d, %d, %d, %d, %d, %d, %d, %d\n", data[4095],data[4094],data[4093],data[4092],data[4091],data[4090],data[4089],data[4088]);
-
-
-  return retval >> d_ADDSHIFT;
+    unsigned char* d;
+    int d_ADDSHIFT = 0;
+    int d_numstates = (1 << 6);
+    int d_decision_t_size = d_numstates / 8;
+    unsigned int d_k = 7;
+    int d_framebits = nbits;
+    /* ADDSHIFT and SUBSHIFT make sure that the thing returned is a byte. */
+    d = decisions;
+    /* Make room beyond the end of the encoder register so we can
+     * accumulate a full byte of decoded data
+     */
+
+    endstate = (endstate % d_numstates) << d_ADDSHIFT;
+
+    /* The store into data[] only needs to be done every 8 bits.
+     * But this avoids a conditional branch, and the writes will
+     * combine in the cache anyway
+     */
+
+    d += tailsize * d_decision_t_size; /* Look past tail */
+    int retval;
+    int dif = tailsize - (d_k - 1);
+    // printf("break, %d, %d\n", dif, (nbits+dif)%d_framebits);
+    p_decision_t dec;
+    while (nbits-- > d_framebits - (d_k - 1)) {
+        int k;
+        dec.t = &d[nbits * d_decision_t_size];
+        k = (dec.w[(endstate >> d_ADDSHIFT) / 32] >> ((endstate >> d_ADDSHIFT) % 32)) & 1;
+
+        endstate = (endstate >> 1) | (k << (d_k - 2 + d_ADDSHIFT));
+        // data[((nbits+dif)%nbits)>>3] = endstate>>d_SUBSHIFT;
+        // printf("%d, %d\n", k, (nbits+dif)%d_framebits);
+        data[((nbits + dif) % d_framebits)] = k;
+
+        retval = endstate;
+    }
+    nbits += 1;
+
+    while (nbits-- != 0) {
+        int k;
+
+        dec.t = &d[nbits * d_decision_t_size];
+
+        k = (dec.w[(endstate >> d_ADDSHIFT) / 32] >> ((endstate >> d_ADDSHIFT) % 32)) & 1;
+
+        endstate = (endstate >> 1) | (k << (d_k - 2 + d_ADDSHIFT));
+        data[((nbits + dif) % d_framebits)] = k;
+    }
+    // printf("%d, %d, %d, %d, %d, %d, %d, %d\n",
+    // data[4095],data[4094],data[4093],data[4092],data[4091],data[4090],data[4089],data[4088]);
+
+
+    return retval >> d_ADDSHIFT;
  }
  
  
  #if LV_HAVE_SSE3
  
-#include <pmmintrin.h>
  #include <emmintrin.h>
-#include <xmmintrin.h>
  #include <mmintrin.h>
+#include <pmmintrin.h>
  #include <stdio.h>
+#include <xmmintrin.h>
  
-static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* syms, unsigned char* dec, unsigned int framebits) {
-
-
-  static int once = 1;
-  int d_numstates = (1 << 6);
-  int rate = 2;
-  static unsigned char* D;
-  static unsigned char* Y;
-  static unsigned char* X;
-  static unsigned int excess = 6;
-  static unsigned char* Branchtab;
-  static unsigned char Partab[256];
-
-  int d_polys[2] = {79, 109};
-
-
-  if(once) {
-
-    X = (unsigned char*)volk_malloc(2*d_numstates, volk_get_alignment());
-    Y = X + d_numstates;
-    Branchtab = (unsigned char*)volk_malloc(d_numstates/2*rate, volk_get_alignment());
-    D = (unsigned char*)volk_malloc((d_numstates/8) * (framebits + 6), volk_get_alignment());
-    int state, i;
-    int cnt,ti;
-
-    /* Initialize parity lookup table */
-    for(i=0;i<256;i++){
-      cnt = 0;
-      ti = i;
-      while(ti){
-        if(ti & 1)
-          cnt++;
-        ti >>= 1;
-      }
-      Partab[i] = cnt & 1;
-    }
-    /*  Initialize the branch table */
-    for(state=0;state < d_numstates/2;state++){
-      for(i=0; i<rate; i++){
-        Branchtab[i*d_numstates/2+state] = parity((2*state) & d_polys[i], Partab) ? 255 : 0;
-      }
-    }
+static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* syms,
+                                                      unsigned char* dec,
+                                                      unsigned int framebits)
+{
  
-    once = 0;
-  }
  
-  //unbias the old_metrics
-  memset(X, 31, d_numstates);
+    static int once = 1;
+    int d_numstates = (1 << 6);
+    int rate = 2;
+    static unsigned char* D;
+    static unsigned char* Y;
+    static unsigned char* X;
+    static unsigned int excess = 6;
+    static unsigned char* Branchtab;
+    static unsigned char Partab[256];
+
+    int d_polys[2] = { 79, 109 };
+
+
+    if (once) {
+
+        X = (unsigned char*)volk_malloc(2 * d_numstates, volk_get_alignment());
+        Y = X + d_numstates;
+        Branchtab =
+            (unsigned char*)volk_malloc(d_numstates / 2 * rate, volk_get_alignment());
+        D = (unsigned char*)volk_malloc((d_numstates / 8) * (framebits + 6),
+                                        volk_get_alignment());
+        int state, i;
+        int cnt, ti;
+
+        /* Initialize parity lookup table */
+        for (i = 0; i < 256; i++) {
+            cnt = 0;
+            ti = i;
+            while (ti) {
+                if (ti & 1)
+                    cnt++;
+                ti >>= 1;
+            }
+            Partab[i] = cnt & 1;
+        }
+        /*  Initialize the branch table */
+        for (state = 0; state < d_numstates / 2; state++) {
+            for (i = 0; i < rate; i++) {
+                Branchtab[i * d_numstates / 2 + state] =
+                    parity((2 * state) & d_polys[i], Partab) ? 255 : 0;
+            }
+        }
+
+        once = 0;
+    }
+
+    // unbias the old_metrics
+    memset(X, 31, d_numstates);
  
-  // initialize decisions
-  memset(D, 0, (d_numstates/8) * (framebits + 6));
+    // initialize decisions
+    memset(D, 0, (d_numstates / 8) * (framebits + 6));
  
-  volk_8u_x4_conv_k7_r2_8u_spiral(Y, X, syms, D, framebits/2 - excess, excess, Branchtab);
+    volk_8u_x4_conv_k7_r2_8u_spiral(
+        Y, X, syms, D, framebits / 2 - excess, excess, Branchtab);
  
-  unsigned int min = X[0];
-  int i = 0, state = 0;
-  for(i = 0; i < (d_numstates); ++i) {
-    if(X[i] < min) {
-      min = X[i];
-      state = i;
+    unsigned int min = X[0];
+    int i = 0, state = 0;
+    for (i = 0; i < (d_numstates); ++i) {
+        if (X[i] < min) {
+            min = X[i];
+            state = i;
+        }
      }
-  }
  
-  chainback_viterbi(dec, framebits/2 -excess, state, excess, D);
+    chainback_viterbi(dec, framebits / 2 - excess, state, excess, D);
  
-  return;
+    return;
  }
  
  #endif /*LV_HAVE_SSE3*/
@@ -185,151 +193,161 @@ static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* syms, unsig
  #include <immintrin.h>
  #include <stdio.h>
  
-static inline void volk_8u_conv_k7_r2puppet_8u_avx2(unsigned char* syms, unsigned char* dec, unsigned int framebits) {
-
-
-  static int once = 1;
-  int d_numstates = (1 << 6);
-  int rate = 2;
-  static unsigned char* D;
-  static unsigned char* Y;
-  static unsigned char* X;
-  static unsigned int excess = 6;
-  static unsigned char* Branchtab;
-  static unsigned char Partab[256];
-
-  int d_polys[2] = {79, 109};
-
-
-  if(once) {
-
-    X = (unsigned char*)volk_malloc(2*d_numstates, volk_get_alignment());
-    Y = X + d_numstates;
-    Branchtab = (unsigned char*)volk_malloc(d_numstates/2*rate, volk_get_alignment());
-    D = (unsigned char*)volk_malloc((d_numstates/8) * (framebits + 6), volk_get_alignment());
-    int state, i;
-    int cnt,ti;
-
-    /* Initialize parity lookup table */
-    for(i=0;i<256;i++){
-      cnt = 0;
-      ti = i;
-      while(ti){
-        if(ti & 1)
-          cnt++;
-        ti >>= 1;
-      }
-      Partab[i] = cnt & 1;
-    }
-    /*  Initialize the branch table */
-    for(state=0;state < d_numstates/2;state++){
-      for(i=0; i<rate; i++){
-        Branchtab[i*d_numstates/2+state] = parity((2*state) & d_polys[i], Partab) ? 255 : 0;
-      }
-    }
+static inline void volk_8u_conv_k7_r2puppet_8u_avx2(unsigned char* syms,
+                                                    unsigned char* dec,
+                                                    unsigned int framebits)
+{
  
-    once = 0;
-  }
  
-  //unbias the old_metrics
-  memset(X, 31, d_numstates);
+    static int once = 1;
+    int d_numstates = (1 << 6);
+    int rate = 2;
+    static unsigned char* D;
+    static unsigned char* Y;
+    static unsigned char* X;
+    static unsigned int excess = 6;
+    static unsigned char* Branchtab;
+    static unsigned char Partab[256];
+
+    int d_polys[2] = { 79, 109 };
+
+
+    if (once) {
+
+        X = (unsigned char*)volk_malloc(2 * d_numstates, volk_get_alignment());
+        Y = X + d_numstates;
+        Branchtab =
+            (unsigned char*)volk_malloc(d_numstates / 2 * rate, volk_get_alignment());
+        D = (unsigned char*)volk_malloc((d_numstates / 8) * (framebits + 6),
+                                        volk_get_alignment());
+        int state, i;
+        int cnt, ti;
+
+        /* Initialize parity lookup table */
+        for (i = 0; i < 256; i++) {
+            cnt = 0;
+            ti = i;
+            while (ti) {
+                if (ti & 1)
+                    cnt++;
+                ti >>= 1;
+            }
+            Partab[i] = cnt & 1;
+        }
+        /*  Initialize the branch table */
+        for (state = 0; state < d_numstates / 2; state++) {
+            for (i = 0; i < rate; i++) {
+                Branchtab[i * d_numstates / 2 + state] =
+                    parity((2 * state) & d_polys[i], Partab) ? 255 : 0;
+            }
+        }
+
+        once = 0;
+    }
+
+    // unbias the old_metrics
+    memset(X, 31, d_numstates);
  
-  // initialize decisions
-  memset(D, 0, (d_numstates/8) * (framebits + 6));
+    // initialize decisions
+    memset(D, 0, (d_numstates / 8) * (framebits + 6));
  
-  volk_8u_x4_conv_k7_r2_8u_avx2(Y, X, syms, D, framebits/2 - excess, excess, Branchtab);
+    volk_8u_x4_conv_k7_r2_8u_avx2(
+        Y, X, syms, D, framebits / 2 - excess, excess, Branchtab);
  
-  unsigned int min = X[0];
-  int i = 0, state = 0;
-  for(i = 0; i < (d_numstates); ++i) {
-    if(X[i] < min) {
-      min = X[i];
-      state = i;
+    unsigned int min = X[0];
+    int i = 0, state = 0;
+    for (i = 0; i < (d_numstates); ++i) {
+        if (X[i] < min) {
+            min = X[i];
+            state = i;
+        }
      }
-  }
  
-  chainback_viterbi(dec, framebits/2 -excess, state, excess, D);
+    chainback_viterbi(dec, framebits / 2 - excess, state, excess, D);
  
-  return;
+    return;
  }
  
  #endif /*LV_HAVE_AVX2*/
  
  
-
  #if LV_HAVE_GENERIC
  
  
-static inline void volk_8u_conv_k7_r2puppet_8u_generic(unsigned char* syms, unsigned char* dec, unsigned int framebits) {
-
-
-
-  static int once = 1;
-  int d_numstates = (1 << 6);
-  int rate = 2;
-  static unsigned char* Y;
-  static unsigned char* X;
-  static unsigned char* D;
-  static unsigned int excess = 6;
-  static unsigned char* Branchtab;
-  static unsigned char Partab[256];
-
-  int d_polys[2] = {79, 109};
-
-
-  if(once) {
-
-    X = (unsigned char*)volk_malloc(2*d_numstates, volk_get_alignment());
-    Y = X + d_numstates;
-    Branchtab = (unsigned char*)volk_malloc(d_numstates/2*rate, volk_get_alignment());
-    D = (unsigned char*)volk_malloc((d_numstates/8) * (framebits + 6), volk_get_alignment());
+static inline void volk_8u_conv_k7_r2puppet_8u_generic(unsigned char* syms,
+                                                       unsigned char* dec,
+                                                       unsigned int framebits)
+{
  
-    int state, i;
-    int cnt,ti;
  
-    /* Initialize parity lookup table */
-    for(i=0;i<256;i++){
-      cnt = 0;
-      ti = i;
-      while(ti){
-        if(ti & 1)
-          cnt++;
-        ti >>= 1;
-      }
-      Partab[i] = cnt & 1;
+    static int once = 1;
+    int d_numstates = (1 << 6);
+    int rate = 2;
+    static unsigned char* Y;
+    static unsigned char* X;
+    static unsigned char* D;
+    static unsigned int excess = 6;
+    static unsigned char* Branchtab;
+    static unsigned char Partab[256];
+
+    int d_polys[2] = { 79, 109 };
+
+
+    if (once) {
+
+        X = (unsigned char*)volk_malloc(2 * d_numstates, volk_get_alignment());
+        Y = X + d_numstates;
+        Branchtab =
+            (unsigned char*)volk_malloc(d_numstates / 2 * rate, volk_get_alignment());
+        D = (unsigned char*)volk_malloc((d_numstates / 8) * (framebits + 6),
+                                        volk_get_alignment());
+
+        int state, i;
+        int cnt, ti;
+
+        /* Initialize parity lookup table */
+        for (i = 0; i < 256; i++) {
+            cnt = 0;
+            ti = i;
+            while (ti) {
+                if (ti & 1)
+                    cnt++;
+                ti >>= 1;
+            }
+            Partab[i] = cnt & 1;
+        }
+        /*  Initialize the branch table */
+        for (state = 0; state < d_numstates / 2; state++) {
+            for (i = 0; i < rate; i++) {
+                Branchtab[i * d_numstates / 2 + state] =
+                    parity((2 * state) & d_polys[i], Partab) ? 255 : 0;
+            }
+        }
+
+        once = 0;
      }
-    /*  Initialize the branch table */
-    for(state=0;state < d_numstates/2;state++){
-      for(i=0; i<rate; i++){
-        Branchtab[i*d_numstates/2+state] = parity((2*state) & d_polys[i], Partab) ? 255 : 0;
-      }
-    }
-
-    once = 0;
-  }
  
-  //unbias the old_metrics
-  memset(X, 31, d_numstates);
+    // unbias the old_metrics
+    memset(X, 31, d_numstates);
  
-  // initialize decisions
-  memset(D, 0, (d_numstates/8) * (framebits + 6));
+    // initialize decisions
+    memset(D, 0, (d_numstates / 8) * (framebits + 6));
  
-  volk_8u_x4_conv_k7_r2_8u_generic(Y, X, syms, D, framebits/2 - excess, excess, Branchtab);
+    volk_8u_x4_conv_k7_r2_8u_generic(
+        Y, X, syms, D, framebits / 2 - excess, excess, Branchtab);
  
-  unsigned int min = X[0];
-  int i = 0, state = 0;
-  for(i = 0; i < (d_numstates); ++i) {
-    if(X[i] < min) {
-      min = X[i];
-      state = i;
+    unsigned int min = X[0];
+    int i = 0, state = 0;
+    for (i = 0; i < (d_numstates); ++i) {
+        if (X[i] < min) {
+            min = X[i];
+            state = i;
+        }
      }
-  }
-
-  chainback_viterbi(dec, framebits/2 -excess, state, excess, D);
-
-  return;
  
+    chainback_viterbi(dec, framebits / 2 - excess, state, excess, D);
  
+    return;
  }
  
  #endif /* LV_HAVE_GENERIC */
diff --git a/kernels/volk/volk_8u_x2_encodeframepolar_8u.h b/kernels/volk/volk_8u_x2_encodeframepolar_8u.h

index bc176ec800fbbd597802b787faef921cdcef5e50..e8d980d651f6ce986aaffcd08d8aa21b674a39be 100644 (file)
--- a/kernels/volk/volk_8u_x2_encodeframepolar_8u.h
+++ b/kernels/volk/volk_8u_x2_encodeframepolar_8u.h
@@ -28,172 +28,236 @@
  #define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
  #include <string.h>
  
-static inline unsigned int
-log2_of_power_of_2(unsigned int val){
-  // algorithm from: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog
-  static const unsigned int b[] = {0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0,
-                                   0xFF00FF00, 0xFFFF0000};
-
-  unsigned int res = (val & b[0]) != 0;
-  res |= ((val & b[4]) != 0) << 4;
-  res |= ((val & b[3]) != 0) << 3;
-  res |= ((val & b[2]) != 0) << 2;
-  res |= ((val & b[1]) != 0) << 1;
-  return res;
+static inline unsigned int log2_of_power_of_2(unsigned int val)
+{
+    // algorithm from: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog
+    static const unsigned int b[] = {
+        0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00, 0xFFFF0000
+    };
+
+    unsigned int res = (val & b[0]) != 0;
+    res |= ((val & b[4]) != 0) << 4;
+    res |= ((val & b[3]) != 0) << 3;
+    res |= ((val & b[2]) != 0) << 2;
+    res |= ((val & b[1]) != 0) << 1;
+    return res;
  }
  
-static inline void
-encodepolar_single_stage(unsigned char* frame_ptr, const unsigned char* temp_ptr,
-                         const unsigned int num_branches, const unsigned int frame_half)
+static inline void encodepolar_single_stage(unsigned char* frame_ptr,
+                                            const unsigned char* temp_ptr,
+                                            const unsigned int num_branches,
+                                            const unsigned int frame_half)
  {
-  unsigned int branch, bit;
-  for(branch = 0; branch < num_branches; ++branch){
-    for(bit = 0; bit < frame_half; ++bit){
-      *frame_ptr = *temp_ptr ^ *(temp_ptr + 1);
-      *(frame_ptr + frame_half) = *(temp_ptr + 1);
-      ++frame_ptr;
-      temp_ptr += 2;
+    unsigned int branch, bit;
+    for (branch = 0; branch < num_branches; ++branch) {
+        for (bit = 0; bit < frame_half; ++bit) {
+            *frame_ptr = *temp_ptr ^ *(temp_ptr + 1);
+            *(frame_ptr + frame_half) = *(temp_ptr + 1);
+            ++frame_ptr;
+            temp_ptr += 2;
+        }
+        frame_ptr += frame_half;
      }
-    frame_ptr += frame_half;
-  }
  }
  
  #ifdef LV_HAVE_GENERIC
  
-static inline void
-volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame, unsigned char* temp,
-                                       unsigned int frame_size)
+static inline void volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame,
+                                                          unsigned char* temp,
+                                                          unsigned int frame_size)
  {
-  unsigned int stage = log2_of_power_of_2(frame_size);
-  unsigned int frame_half = frame_size >> 1;
-  unsigned int num_branches = 1;
-
-  while(stage){
-    // encode stage
-    encodepolar_single_stage(frame, temp, num_branches, frame_half);
-    memcpy(temp, frame, sizeof(unsigned char) * frame_size);
-
-    // update all the parameters.
-    num_branches = num_branches << 1;
-    frame_half = frame_half >> 1;
-    --stage;
-  }
+    unsigned int stage = log2_of_power_of_2(frame_size);
+    unsigned int frame_half = frame_size >> 1;
+    unsigned int num_branches = 1;
+
+    while (stage) {
+        // encode stage
+        encodepolar_single_stage(frame, temp, num_branches, frame_half);
+        memcpy(temp, frame, sizeof(unsigned char) * frame_size);
+
+        // update all the parameters.
+        num_branches = num_branches << 1;
+        frame_half = frame_half >> 1;
+        --stage;
+    }
  }
  #endif /* LV_HAVE_GENERIC */
  
  #ifdef LV_HAVE_SSSE3
  #include <tmmintrin.h>
  
-static inline void
-volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame, unsigned char* temp,
-                                       unsigned int frame_size)
+static inline void volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame,
+                                                          unsigned char* temp,
+                                                          unsigned int frame_size)
  {
-  const unsigned int po2 = log2_of_power_of_2(frame_size);
-
-  unsigned int stage = po2;
-  unsigned char* frame_ptr = frame;
-  unsigned char* temp_ptr = temp;
-
-  unsigned int frame_half = frame_size >> 1;
-  unsigned int num_branches = 1;
-  unsigned int branch;
-  unsigned int bit;
-
-  // prepare constants
-  const __m128i mask_stage1 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
-
-  // get some SIMD registers to play with.
-  __m128i r_frame0, r_temp0, shifted;
-
-  {
-    __m128i r_frame1, r_temp1;
-    const __m128i shuffle_separate = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
-
-    while(stage > 4){
-      frame_ptr = frame;
-      temp_ptr = temp;
-
-      // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
-      for(branch = 0; branch < num_branches; ++branch){
-        for(bit = 0; bit < frame_half; bit += 16){
-          r_temp0 = _mm_loadu_si128((__m128i *) temp_ptr);
-          temp_ptr += 16;
-          r_temp1 = _mm_loadu_si128((__m128i *) temp_ptr);
-          temp_ptr += 16;
-
-          shifted = _mm_srli_si128(r_temp0, 1);
-          shifted = _mm_and_si128(shifted, mask_stage1);
-          r_temp0 = _mm_xor_si128(shifted, r_temp0);
-          r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
-
-          shifted = _mm_srli_si128(r_temp1, 1);
-          shifted = _mm_and_si128(shifted, mask_stage1);
-          r_temp1 = _mm_xor_si128(shifted, r_temp1);
-          r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
-
-          r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
-          _mm_storeu_si128((__m128i*) frame_ptr, r_frame0);
-
-          r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
-          _mm_storeu_si128((__m128i*) (frame_ptr + frame_half), r_frame1);
-          frame_ptr += 16;
+    const unsigned int po2 = log2_of_power_of_2(frame_size);
+
+    unsigned int stage = po2;
+    unsigned char* frame_ptr = frame;
+    unsigned char* temp_ptr = temp;
+
+    unsigned int frame_half = frame_size >> 1;
+    unsigned int num_branches = 1;
+    unsigned int branch;
+    unsigned int bit;
+
+    // prepare constants
+    const __m128i mask_stage1 = _mm_set_epi8(0x0,
+                                             0xFF,
+                                             0x0,
+                                             0xFF,
+                                             0x0,
+                                             0xFF,
+                                             0x0,
+                                             0xFF,
+                                             0x0,
+                                             0xFF,
+                                             0x0,
+                                             0xFF,
+                                             0x0,
+                                             0xFF,
+                                             0x0,
+                                             0xFF);
+
+    // get some SIMD registers to play with.
+    __m128i r_frame0, r_temp0, shifted;
+
+    {
+        __m128i r_frame1, r_temp1;
+        const __m128i shuffle_separate =
+            _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+
+        while (stage > 4) {
+            frame_ptr = frame;
+            temp_ptr = temp;
+
+            // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
+            for (branch = 0; branch < num_branches; ++branch) {
+                for (bit = 0; bit < frame_half; bit += 16) {
+                    r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
+                    temp_ptr += 16;
+                    r_temp1 = _mm_loadu_si128((__m128i*)temp_ptr);
+                    temp_ptr += 16;
+
+                    shifted = _mm_srli_si128(r_temp0, 1);
+                    shifted = _mm_and_si128(shifted, mask_stage1);
+                    r_temp0 = _mm_xor_si128(shifted, r_temp0);
+                    r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
+
+                    shifted = _mm_srli_si128(r_temp1, 1);
+                    shifted = _mm_and_si128(shifted, mask_stage1);
+                    r_temp1 = _mm_xor_si128(shifted, r_temp1);
+                    r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
+
+                    r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
+                    _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
+
+                    r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
+                    _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
+                    frame_ptr += 16;
+                }
+
+                frame_ptr += frame_half;
+            }
+            memcpy(temp, frame, sizeof(unsigned char) * frame_size);
+
+            num_branches = num_branches << 1;
+            frame_half = frame_half >> 1;
+            stage--;
          }
-
-        frame_ptr += frame_half;
-      }
-      memcpy(temp, frame, sizeof(unsigned char) * frame_size);
-
-      num_branches = num_branches << 1;
-      frame_half = frame_half >> 1;
-      stage--;
      }
-  }
  
-  // This last part requires at least 16-bit frames.
-  // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC!
+    // This last part requires at least 16-bit frames.
+    // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC!
  
-  // reset pointers to correct positions.
-  frame_ptr = frame;
-  temp_ptr = temp;
+    // reset pointers to correct positions.
+    frame_ptr = frame;
+    temp_ptr = temp;
  
-  // prefetch first chunk
-  __VOLK_PREFETCH(temp_ptr);
-
-  const __m128i shuffle_stage4 = _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
-  const __m128i mask_stage4 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-  const __m128i mask_stage3 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
-  const __m128i mask_stage2 = _mm_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);
-
-  for(branch = 0; branch < num_branches; ++branch){
-    r_temp0 = _mm_loadu_si128((__m128i*) temp_ptr);
-
-    // prefetch next chunk
-    temp_ptr += 16;
+    // prefetch first chunk
      __VOLK_PREFETCH(temp_ptr);
  
-    // shuffle once for bit-reversal.
-    r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
-
-    shifted = _mm_srli_si128(r_temp0, 8);
-    shifted = _mm_and_si128(shifted, mask_stage4);
-    r_frame0 = _mm_xor_si128(shifted, r_temp0);
-
-    shifted = _mm_srli_si128(r_frame0, 4);
-    shifted = _mm_and_si128(shifted, mask_stage3);
-    r_frame0 = _mm_xor_si128(shifted, r_frame0);
-
-    shifted = _mm_srli_si128(r_frame0, 2);
-    shifted = _mm_and_si128(shifted, mask_stage2);
-    r_frame0 = _mm_xor_si128(shifted, r_frame0);
-
-    shifted = _mm_srli_si128(r_frame0, 1);
-    shifted = _mm_and_si128(shifted, mask_stage1);
-    r_frame0 = _mm_xor_si128(shifted, r_frame0);
-
-    // store result of chunk.
-    _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
-    frame_ptr += 16;
-  }
+    const __m128i shuffle_stage4 =
+        _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
+    const __m128i mask_stage4 = _mm_set_epi8(0x0,
+                                             0x0,
+                                             0x0,
+                                             0x0,
+                                             0x0,
+                                             0x0,
+                                             0x0,
+                                             0x0,
+                                             0xFF,
+                                             0xFF,
+                                             0xFF,
+                                             0xFF,
+                                             0xFF,
+                                             0xFF,
+                                             0xFF,
+                                             0xFF);
+    const __m128i mask_stage3 = _mm_set_epi8(0x0,
+                                             0x0,
+                                             0x0,
+                                             0x0,
+                                             0xFF,
+                                             0xFF,
+                                             0xFF,
+                                             0xFF,
+                                             0x0,
+                                             0x0,
+                                             0x0,
+                                             0x0,
+                                             0xFF,
+                                             0xFF,
+                                             0xFF,
+                                             0xFF);
+    const __m128i mask_stage2 = _mm_set_epi8(0x0,
+                                             0x0,
+                                             0xFF,
+                                             0xFF,
+                                             0x0,
+                                             0x0,
+                                             0xFF,
+                                             0xFF,
+                                             0x0,
+                                             0x0,
+                                             0xFF,
+                                             0xFF,
+                                             0x0,
+                                             0x0,
+                                             0xFF,
+                                             0xFF);
+
+    for (branch = 0; branch < num_branches; ++branch) {
+        r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
+
+        // prefetch next chunk
+        temp_ptr += 16;
+        __VOLK_PREFETCH(temp_ptr);
+
+        // shuffle once for bit-reversal.
+        r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
+
+        shifted = _mm_srli_si128(r_temp0, 8);
+        shifted = _mm_and_si128(shifted, mask_stage4);
+        r_frame0 = _mm_xor_si128(shifted, r_temp0);
+
+        shifted = _mm_srli_si128(r_frame0, 4);
+        shifted = _mm_and_si128(shifted, mask_stage3);
+        r_frame0 = _mm_xor_si128(shifted, r_frame0);
+
+        shifted = _mm_srli_si128(r_frame0, 2);
+        shifted = _mm_and_si128(shifted, mask_stage2);
+        r_frame0 = _mm_xor_si128(shifted, r_frame0);
+
+        shifted = _mm_srli_si128(r_frame0, 1);
+        shifted = _mm_and_si128(shifted, mask_stage1);
+        r_frame0 = _mm_xor_si128(shifted, r_frame0);
+
+        // store result of chunk.
+        _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
+        frame_ptr += 16;
+    }
  }
  
  #endif /* LV_HAVE_SSSE3 */
@@ -201,154 +265,351 @@ volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame, unsigned char* temp
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame, unsigned char* temp,
-                                       unsigned int frame_size)
+static inline void volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame,
+                                                         unsigned char* temp,
+                                                         unsigned int frame_size)
  {
-  const unsigned int po2 = log2_of_power_of_2(frame_size);
-
-  unsigned int stage = po2;
-  unsigned char* frame_ptr = frame;
-  unsigned char* temp_ptr = temp;
-
-  unsigned int frame_half = frame_size >> 1;
-  unsigned int num_branches = 1;
-  unsigned int branch;
-  unsigned int bit;
-
-  // prepare constants
-  const __m256i mask_stage1 = _mm256_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
-                                              0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
-
-  const __m128i mask_stage0 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
-  // get some SIMD registers to play with.
-  __m256i r_frame0, r_temp0, shifted;
-  __m128i r_temp2, r_frame2, shifted2;
-  {
-    __m256i r_frame1, r_temp1;
-    __m128i r_frame3, r_temp3;
-    const __m256i shuffle_separate = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
-                                                      0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
-    const __m128i shuffle_separate128 = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
-
-    while(stage > 4){
-      frame_ptr = frame;
-      temp_ptr = temp;
-
-      // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
-      for(branch = 0; branch < num_branches; ++branch){
-        for(bit = 0; bit < frame_half; bit += 32){
-          if ((frame_half-bit)<32) //if only 16 bits remaining in frame, not 32
-          {
-              r_temp2 = _mm_loadu_si128((__m128i *) temp_ptr);
-              temp_ptr += 16;
-              r_temp3 = _mm_loadu_si128((__m128i *) temp_ptr);
-              temp_ptr += 16;
-
-              shifted2 = _mm_srli_si128(r_temp2, 1);
-              shifted2 = _mm_and_si128(shifted2, mask_stage0);
-              r_temp2 = _mm_xor_si128(shifted2, r_temp2);
-              r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);
-
-              shifted2 = _mm_srli_si128(r_temp3, 1);
-              shifted2 = _mm_and_si128(shifted2, mask_stage0);
-              r_temp3 = _mm_xor_si128(shifted2, r_temp3);
-              r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
-
-              r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
-              _mm_storeu_si128((__m128i*) frame_ptr, r_frame2);
-
-              r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
-              _mm_storeu_si128((__m128i*) (frame_ptr + frame_half), r_frame3);
-              frame_ptr += 16;
-              break;
-          }
-          r_temp0 = _mm256_loadu_si256((__m256i *) temp_ptr);
-          temp_ptr += 32;
-          r_temp1 = _mm256_loadu_si256((__m256i *) temp_ptr);
-          temp_ptr += 32;
-
-          shifted = _mm256_srli_si256(r_temp0, 1);//operate on 128 bit lanes
-          shifted = _mm256_and_si256(shifted, mask_stage1);
-          r_temp0 = _mm256_xor_si256(shifted, r_temp0);
-          r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);
-
-          shifted = _mm256_srli_si256(r_temp1, 1);
-          shifted = _mm256_and_si256(shifted, mask_stage1);
-          r_temp1 = _mm256_xor_si256(shifted, r_temp1);
-          r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
-
-          r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
-          r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
-          r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
-          r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
-
-          _mm256_storeu_si256((__m256i*) frame_ptr, r_frame0);
-
-          _mm256_storeu_si256((__m256i*) (frame_ptr + frame_half), r_frame1);
-          frame_ptr += 32;
+    const unsigned int po2 = log2_of_power_of_2(frame_size);
+
+    unsigned int stage = po2;
+    unsigned char* frame_ptr = frame;
+    unsigned char* temp_ptr = temp;
+
+    unsigned int frame_half = frame_size >> 1;
+    unsigned int num_branches = 1;
+    unsigned int branch;
+    unsigned int bit;
+
+    // prepare constants
+    const __m256i mask_stage1 = _mm256_set_epi8(0x0,
+                                                0xFF,
+                                                0x0,
+                                                0xFF,
+                                                0x0,
+                                                0xFF,
+                                                0x0,
+                                                0xFF,
+                                                0x0,
+                                                0xFF,
+                                                0x0,
+                                                0xFF,
+                                                0x0,
+                                                0xFF,
+                                                0x0,
+                                                0xFF,
+                                                0x0,
+                                                0xFF,
+                                                0x0,
+                                                0xFF,
+                                                0x0,
+                                                0xFF,
+                                                0x0,
+                                                0xFF,
+                                                0x0,
+                                                0xFF,
+                                                0x0,
+                                                0xFF,
+                                                0x0,
+                                                0xFF,
+                                                0x0,
+                                                0xFF);
+
+    const __m128i mask_stage0 = _mm_set_epi8(0x0,
+                                             0xFF,
+                                             0x0,
+                                             0xFF,
+                                             0x0,
+                                             0xFF,
+                                             0x0,
+                                             0xFF,
+                                             0x0,
+                                             0xFF,
+                                             0x0,
+                                             0xFF,
+                                             0x0,
+                                             0xFF,
+                                             0x0,
+                                             0xFF);
+    // get some SIMD registers to play with.
+    __m256i r_frame0, r_temp0, shifted;
+    __m128i r_temp2, r_frame2, shifted2;
+    {
+        __m256i r_frame1, r_temp1;
+        __m128i r_frame3, r_temp3;
+        const __m256i shuffle_separate = _mm256_setr_epi8(0,
+                                                          2,
+                                                          4,
+                                                          6,
+                                                          8,
+                                                          10,
+                                                          12,
+                                                          14,
+                                                          1,
+                                                          3,
+                                                          5,
+                                                          7,
+                                                          9,
+                                                          11,
+                                                          13,
+                                                          15,
+                                                          0,
+                                                          2,
+                                                          4,
+                                                          6,
+                                                          8,
+                                                          10,
+                                                          12,
+                                                          14,
+                                                          1,
+                                                          3,
+                                                          5,
+                                                          7,
+                                                          9,
+                                                          11,
+                                                          13,
+                                                          15);
+        const __m128i shuffle_separate128 =
+            _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+
+        while (stage > 4) {
+            frame_ptr = frame;
+            temp_ptr = temp;
+
+            // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
+            for (branch = 0; branch < num_branches; ++branch) {
+                for (bit = 0; bit < frame_half; bit += 32) {
+                    if ((frame_half - bit) <
+                        32) // if only 16 bits remaining in frame, not 32
+                    {
+                        r_temp2 = _mm_loadu_si128((__m128i*)temp_ptr);
+                        temp_ptr += 16;
+                        r_temp3 = _mm_loadu_si128((__m128i*)temp_ptr);
+                        temp_ptr += 16;
+
+                        shifted2 = _mm_srli_si128(r_temp2, 1);
+                        shifted2 = _mm_and_si128(shifted2, mask_stage0);
+                        r_temp2 = _mm_xor_si128(shifted2, r_temp2);
+                        r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);
+
+                        shifted2 = _mm_srli_si128(r_temp3, 1);
+                        shifted2 = _mm_and_si128(shifted2, mask_stage0);
+                        r_temp3 = _mm_xor_si128(shifted2, r_temp3);
+                        r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
+
+                        r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
+                        _mm_storeu_si128((__m128i*)frame_ptr, r_frame2);
+
+                        r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
+                        _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
+                        frame_ptr += 16;
+                        break;
+                    }
+                    r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
+                    temp_ptr += 32;
+                    r_temp1 = _mm256_loadu_si256((__m256i*)temp_ptr);
+                    temp_ptr += 32;
+
+                    shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes
+                    shifted = _mm256_and_si256(shifted, mask_stage1);
+                    r_temp0 = _mm256_xor_si256(shifted, r_temp0);
+                    r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);
+
+                    shifted = _mm256_srli_si256(r_temp1, 1);
+                    shifted = _mm256_and_si256(shifted, mask_stage1);
+                    r_temp1 = _mm256_xor_si256(shifted, r_temp1);
+                    r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
+
+                    r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
+                    r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
+                    r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
+                    r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
+
+                    _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
+
+                    _mm256_storeu_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
+                    frame_ptr += 32;
+                }
+
+                frame_ptr += frame_half;
+            }
+            memcpy(temp, frame, sizeof(unsigned char) * frame_size);
+
+            num_branches = num_branches << 1;
+            frame_half = frame_half >> 1;
+            stage--;
          }
-
-        frame_ptr += frame_half;
-      }
-      memcpy(temp, frame, sizeof(unsigned char) * frame_size);
-
-      num_branches = num_branches << 1;
-      frame_half = frame_half >> 1;
-      stage--;
      }
-  }
-
-  // This last part requires at least 32-bit frames.
-  // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC!
-
-  // reset pointers to correct positions.
-  frame_ptr = frame;
-  temp_ptr = temp;
  
-  // prefetch first chunk
-  __VOLK_PREFETCH(temp_ptr);
+    // This last part requires at least 32-bit frames.
+    // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC!
  
-  const __m256i shuffle_stage4 = _mm256_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
-                                                  0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
-  const __m256i mask_stage4 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
-                                              0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-  const __m256i mask_stage3 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
-                                              0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
-  const __m256i mask_stage2 = _mm256_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
-                                              0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);
+    // reset pointers to correct positions.
+    frame_ptr = frame;
+    temp_ptr = temp;
  
-  for(branch = 0; branch < num_branches/2; ++branch){
-    r_temp0 = _mm256_loadu_si256((__m256i*) temp_ptr);
-
-    // prefetch next chunk
-    temp_ptr += 32;
+    // prefetch first chunk
      __VOLK_PREFETCH(temp_ptr);
  
-    // shuffle once for bit-reversal.
-    r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
-
-    shifted = _mm256_srli_si256(r_temp0, 8); //128 bit lanes
-    shifted = _mm256_and_si256(shifted, mask_stage4);
-    r_frame0 = _mm256_xor_si256(shifted, r_temp0);
-
-
-    shifted = _mm256_srli_si256(r_frame0, 4);
-    shifted = _mm256_and_si256(shifted, mask_stage3);
-    r_frame0 = _mm256_xor_si256(shifted, r_frame0);
-
-    shifted = _mm256_srli_si256(r_frame0, 2);
-    shifted = _mm256_and_si256(shifted, mask_stage2);
-    r_frame0 = _mm256_xor_si256(shifted, r_frame0);
-
-    shifted = _mm256_srli_si256(r_frame0, 1);
-    shifted = _mm256_and_si256(shifted, mask_stage1);
-    r_frame0 = _mm256_xor_si256(shifted, r_frame0);
-
-    // store result of chunk.
-    _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
-    frame_ptr += 32;
-  }
+    const __m256i shuffle_stage4 = _mm256_setr_epi8(0,
+                                                    8,
+                                                    4,
+                                                    12,
+                                                    2,
+                                                    10,
+                                                    6,
+                                                    14,
+                                                    1,
+                                                    9,
+                                                    5,
+                                                    13,
+                                                    3,
+                                                    11,
+                                                    7,
+                                                    15,
+                                                    0,
+                                                    8,
+                                                    4,
+                                                    12,
+                                                    2,
+                                                    10,
+                                                    6,
+                                                    14,
+                                                    1,
+                                                    9,
+                                                    5,
+                                                    13,
+                                                    3,
+                                                    11,
+                                                    7,
+                                                    15);
+    const __m256i mask_stage4 = _mm256_set_epi8(0x0,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF);
+    const __m256i mask_stage3 = _mm256_set_epi8(0x0,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF);
+    const __m256i mask_stage2 = _mm256_set_epi8(0x0,
+                                                0x0,
+                                                0xFF,
+                                                0xFF,
+                                                0x0,
+                                                0x0,
+                                                0xFF,
+                                                0xFF,
+                                                0x0,
+                                                0x0,
+                                                0xFF,
+                                                0xFF,
+                                                0x0,
+                                                0x0,
+                                                0xFF,
+                                                0xFF,
+                                                0x0,
+                                                0x0,
+                                                0xFF,
+                                                0xFF,
+                                                0x0,
+                                                0x0,
+                                                0xFF,
+                                                0xFF,
+                                                0x0,
+                                                0x0,
+                                                0xFF,
+                                                0xFF,
+                                                0x0,
+                                                0x0,
+                                                0xFF,
+                                                0xFF);
+
+    for (branch = 0; branch < num_branches / 2; ++branch) {
+        r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
+
+        // prefetch next chunk
+        temp_ptr += 32;
+        __VOLK_PREFETCH(temp_ptr);
+
+        // shuffle once for bit-reversal.
+        r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
+
+        shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes
+        shifted = _mm256_and_si256(shifted, mask_stage4);
+        r_frame0 = _mm256_xor_si256(shifted, r_temp0);
+
+
+        shifted = _mm256_srli_si256(r_frame0, 4);
+        shifted = _mm256_and_si256(shifted, mask_stage3);
+        r_frame0 = _mm256_xor_si256(shifted, r_frame0);
+
+        shifted = _mm256_srli_si256(r_frame0, 2);
+        shifted = _mm256_and_si256(shifted, mask_stage2);
+        r_frame0 = _mm256_xor_si256(shifted, r_frame0);
+
+        shifted = _mm256_srli_si256(r_frame0, 1);
+        shifted = _mm256_and_si256(shifted, mask_stage1);
+        r_frame0 = _mm256_xor_si256(shifted, r_frame0);
+
+        // store result of chunk.
+        _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
+        frame_ptr += 32;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
@@ -360,272 +621,530 @@ volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame, unsigned char* temp,
  #ifdef LV_HAVE_SSSE3
  #include <tmmintrin.h>
  
-static inline void
-volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame, unsigned char* temp,
-                                       unsigned int frame_size)
+static inline void volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame,
+                                                          unsigned char* temp,
+                                                          unsigned int frame_size)
  {
-  const unsigned int po2 = log2_of_power_of_2(frame_size);
-
-  unsigned int stage = po2;
-  unsigned char* frame_ptr = frame;
-  unsigned char* temp_ptr = temp;
-
-  unsigned int frame_half = frame_size >> 1;
-  unsigned int num_branches = 1;
-  unsigned int branch;
-  unsigned int bit;
-
-  // prepare constants
-  const __m128i mask_stage1 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
-
-  // get some SIMD registers to play with.
-  __m128i r_frame0, r_temp0, shifted;
-
-  {
-    __m128i r_frame1, r_temp1;
-    const __m128i shuffle_separate = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
-
-    while(stage > 4){
-      frame_ptr = frame;
-      temp_ptr = temp;
-
-      // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
-      for(branch = 0; branch < num_branches; ++branch){
-        for(bit = 0; bit < frame_half; bit += 16){
-          r_temp0 = _mm_load_si128((__m128i *) temp_ptr);
-          temp_ptr += 16;
-          r_temp1 = _mm_load_si128((__m128i *) temp_ptr);
-          temp_ptr += 16;
-
-          shifted = _mm_srli_si128(r_temp0, 1);
-          shifted = _mm_and_si128(shifted, mask_stage1);
-          r_temp0 = _mm_xor_si128(shifted, r_temp0);
-          r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
-
-          shifted = _mm_srli_si128(r_temp1, 1);
-          shifted = _mm_and_si128(shifted, mask_stage1);
-          r_temp1 = _mm_xor_si128(shifted, r_temp1);
-          r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
-
-          r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
-          _mm_store_si128((__m128i*) frame_ptr, r_frame0);
-
-          r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
-          _mm_store_si128((__m128i*) (frame_ptr + frame_half), r_frame1);
-          frame_ptr += 16;
+    const unsigned int po2 = log2_of_power_of_2(frame_size);
+
+    unsigned int stage = po2;
+    unsigned char* frame_ptr = frame;
+    unsigned char* temp_ptr = temp;
+
+    unsigned int frame_half = frame_size >> 1;
+    unsigned int num_branches = 1;
+    unsigned int branch;
+    unsigned int bit;
+
+    // prepare constants
+    const __m128i mask_stage1 = _mm_set_epi8(0x0,
+                                             0xFF,
+                                             0x0,
+                                             0xFF,
+                                             0x0,
+                                             0xFF,
+                                             0x0,
+                                             0xFF,
+                                             0x0,
+                                             0xFF,
+                                             0x0,
+                                             0xFF,
+                                             0x0,
+                                             0xFF,
+                                             0x0,
+                                             0xFF);
+
+    // get some SIMD registers to play with.
+    __m128i r_frame0, r_temp0, shifted;
+
+    {
+        __m128i r_frame1, r_temp1;
+        const __m128i shuffle_separate =
+            _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+
+        while (stage > 4) {
+            frame_ptr = frame;
+            temp_ptr = temp;
+
+            // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
+            for (branch = 0; branch < num_branches; ++branch) {
+                for (bit = 0; bit < frame_half; bit += 16) {
+                    r_temp0 = _mm_load_si128((__m128i*)temp_ptr);
+                    temp_ptr += 16;
+                    r_temp1 = _mm_load_si128((__m128i*)temp_ptr);
+                    temp_ptr += 16;
+
+                    shifted = _mm_srli_si128(r_temp0, 1);
+                    shifted = _mm_and_si128(shifted, mask_stage1);
+                    r_temp0 = _mm_xor_si128(shifted, r_temp0);
+                    r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
+
+                    shifted = _mm_srli_si128(r_temp1, 1);
+                    shifted = _mm_and_si128(shifted, mask_stage1);
+                    r_temp1 = _mm_xor_si128(shifted, r_temp1);
+                    r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
+
+                    r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
+                    _mm_store_si128((__m128i*)frame_ptr, r_frame0);
+
+                    r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
+                    _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
+                    frame_ptr += 16;
+                }
+
+                frame_ptr += frame_half;
+            }
+            memcpy(temp, frame, sizeof(unsigned char) * frame_size);
+
+            num_branches = num_branches << 1;
+            frame_half = frame_half >> 1;
+            stage--;
          }
-
-        frame_ptr += frame_half;
-      }
-      memcpy(temp, frame, sizeof(unsigned char) * frame_size);
-
-      num_branches = num_branches << 1;
-      frame_half = frame_half >> 1;
-      stage--;
      }
-  }
-
-  // This last part requires at least 16-bit frames.
-  // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC!
-
-  // reset pointers to correct positions.
-  frame_ptr = frame;
-  temp_ptr = temp;
  
-  // prefetch first chunk
-  __VOLK_PREFETCH(temp_ptr);
+    // This last part requires at least 16-bit frames.
+    // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC!
  
-  const __m128i shuffle_stage4 = _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
-  const __m128i mask_stage4 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-  const __m128i mask_stage3 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
-  const __m128i mask_stage2 = _mm_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);
+    // reset pointers to correct positions.
+    frame_ptr = frame;
+    temp_ptr = temp;
  
-  for(branch = 0; branch < num_branches; ++branch){
-    r_temp0 = _mm_load_si128((__m128i*) temp_ptr);
-
-    // prefetch next chunk
-    temp_ptr += 16;
+    // prefetch first chunk
      __VOLK_PREFETCH(temp_ptr);
  
-    // shuffle once for bit-reversal.
-    r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
-
-    shifted = _mm_srli_si128(r_temp0, 8);
-    shifted = _mm_and_si128(shifted, mask_stage4);
-    r_frame0 = _mm_xor_si128(shifted, r_temp0);
-
-    shifted = _mm_srli_si128(r_frame0, 4);
-    shifted = _mm_and_si128(shifted, mask_stage3);
-    r_frame0 = _mm_xor_si128(shifted, r_frame0);
-
-    shifted = _mm_srli_si128(r_frame0, 2);
-    shifted = _mm_and_si128(shifted, mask_stage2);
-    r_frame0 = _mm_xor_si128(shifted, r_frame0);
-
-    shifted = _mm_srli_si128(r_frame0, 1);
-    shifted = _mm_and_si128(shifted, mask_stage1);
-    r_frame0 = _mm_xor_si128(shifted, r_frame0);
-
-    // store result of chunk.
-    _mm_store_si128((__m128i*)frame_ptr, r_frame0);
-    frame_ptr += 16;
-  }
+    const __m128i shuffle_stage4 =
+        _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
+    const __m128i mask_stage4 = _mm_set_epi8(0x0,
+                                             0x0,
+                                             0x0,
+                                             0x0,
+                                             0x0,
+                                             0x0,
+                                             0x0,
+                                             0x0,
+                                             0xFF,
+                                             0xFF,
+                                             0xFF,
+                                             0xFF,
+                                             0xFF,
+                                             0xFF,
+                                             0xFF,
+                                             0xFF);
+    const __m128i mask_stage3 = _mm_set_epi8(0x0,
+                                             0x0,
+                                             0x0,
+                                             0x0,
+                                             0xFF,
+                                             0xFF,
+                                             0xFF,
+                                             0xFF,
+                                             0x0,
+                                             0x0,
+                                             0x0,
+                                             0x0,
+                                             0xFF,
+                                             0xFF,
+                                             0xFF,
+                                             0xFF);
+    const __m128i mask_stage2 = _mm_set_epi8(0x0,
+                                             0x0,
+                                             0xFF,
+                                             0xFF,
+                                             0x0,
+                                             0x0,
+                                             0xFF,
+                                             0xFF,
+                                             0x0,
+                                             0x0,
+                                             0xFF,
+                                             0xFF,
+                                             0x0,
+                                             0x0,
+                                             0xFF,
+                                             0xFF);
+
+    for (branch = 0; branch < num_branches; ++branch) {
+        r_temp0 = _mm_load_si128((__m128i*)temp_ptr);
+
+        // prefetch next chunk
+        temp_ptr += 16;
+        __VOLK_PREFETCH(temp_ptr);
+
+        // shuffle once for bit-reversal.
+        r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
+
+        shifted = _mm_srli_si128(r_temp0, 8);
+        shifted = _mm_and_si128(shifted, mask_stage4);
+        r_frame0 = _mm_xor_si128(shifted, r_temp0);
+
+        shifted = _mm_srli_si128(r_frame0, 4);
+        shifted = _mm_and_si128(shifted, mask_stage3);
+        r_frame0 = _mm_xor_si128(shifted, r_frame0);
+
+        shifted = _mm_srli_si128(r_frame0, 2);
+        shifted = _mm_and_si128(shifted, mask_stage2);
+        r_frame0 = _mm_xor_si128(shifted, r_frame0);
+
+        shifted = _mm_srli_si128(r_frame0, 1);
+        shifted = _mm_and_si128(shifted, mask_stage1);
+        r_frame0 = _mm_xor_si128(shifted, r_frame0);
+
+        // store result of chunk.
+        _mm_store_si128((__m128i*)frame_ptr, r_frame0);
+        frame_ptr += 16;
+    }
  }
  #endif /* LV_HAVE_SSSE3 */
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  
-static inline void
-volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame, unsigned char* temp,
-                                       unsigned int frame_size)
+static inline void volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame,
+                                                         unsigned char* temp,
+                                                         unsigned int frame_size)
  {
-  const unsigned int po2 = log2_of_power_of_2(frame_size);
-
-  unsigned int stage = po2;
-  unsigned char* frame_ptr = frame;
-  unsigned char* temp_ptr = temp;
-
-  unsigned int frame_half = frame_size >> 1;
-  unsigned int num_branches = 1;
-  unsigned int branch;
-  unsigned int bit;
-
-  // prepare constants
-  const __m256i mask_stage1 = _mm256_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
-                                              0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
-
-  const __m128i mask_stage0 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
-  // get some SIMD registers to play with.
-  __m256i r_frame0, r_temp0, shifted;
-  __m128i r_temp2, r_frame2, shifted2;
-  {
-    __m256i r_frame1, r_temp1;
-    __m128i r_frame3, r_temp3;
-    const __m256i shuffle_separate = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
-                                                      0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
-    const __m128i shuffle_separate128 = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
-
-    while(stage > 4){
-      frame_ptr = frame;
-      temp_ptr = temp;
-
-      // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
-      for(branch = 0; branch < num_branches; ++branch){
-        for(bit = 0; bit < frame_half; bit += 32){
-          if ((frame_half-bit)<32) //if only 16 bits remaining in frame, not 32
-          {
-              r_temp2 = _mm_load_si128((__m128i *) temp_ptr);
-              temp_ptr += 16;
-              r_temp3 = _mm_load_si128((__m128i *) temp_ptr);
-              temp_ptr += 16;
-
-              shifted2 = _mm_srli_si128(r_temp2, 1);
-              shifted2 = _mm_and_si128(shifted2, mask_stage0);
-              r_temp2 = _mm_xor_si128(shifted2, r_temp2);
-              r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);
-
-              shifted2 = _mm_srli_si128(r_temp3, 1);
-              shifted2 = _mm_and_si128(shifted2, mask_stage0);
-              r_temp3 = _mm_xor_si128(shifted2, r_temp3);
-              r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
-
-              r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
-              _mm_store_si128((__m128i*) frame_ptr, r_frame2);
-
-              r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
-              _mm_store_si128((__m128i*) (frame_ptr + frame_half), r_frame3);
-              frame_ptr += 16;
-              break;
-          }
-          r_temp0 = _mm256_load_si256((__m256i *) temp_ptr);
-          temp_ptr += 32;
-          r_temp1 = _mm256_load_si256((__m256i *) temp_ptr);
-          temp_ptr += 32;
-
-          shifted = _mm256_srli_si256(r_temp0, 1);//operate on 128 bit lanes
-          shifted = _mm256_and_si256(shifted, mask_stage1);
-          r_temp0 = _mm256_xor_si256(shifted, r_temp0);
-          r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);
-
-          shifted = _mm256_srli_si256(r_temp1, 1);
-          shifted = _mm256_and_si256(shifted, mask_stage1);
-          r_temp1 = _mm256_xor_si256(shifted, r_temp1);
-          r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
-
-          r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
-          r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
-          r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
-          r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
-
-          _mm256_store_si256((__m256i*) frame_ptr, r_frame0);
-
-          _mm256_store_si256((__m256i*) (frame_ptr + frame_half), r_frame1);
-          frame_ptr += 32;
+    const unsigned int po2 = log2_of_power_of_2(frame_size);
+
+    unsigned int stage = po2;
+    unsigned char* frame_ptr = frame;
+    unsigned char* temp_ptr = temp;
+
+    unsigned int frame_half = frame_size >> 1;
+    unsigned int num_branches = 1;
+    unsigned int branch;
+    unsigned int bit;
+
+    // prepare constants
+    const __m256i mask_stage1 = _mm256_set_epi8(0x0,
+                                                0xFF,
+                                                0x0,
+                                                0xFF,
+                                                0x0,
+                                                0xFF,
+                                                0x0,
+                                                0xFF,
+                                                0x0,
+                                                0xFF,
+                                                0x0,
+                                                0xFF,
+                                                0x0,
+                                                0xFF,
+                                                0x0,
+                                                0xFF,
+                                                0x0,
+                                                0xFF,
+                                                0x0,
+                                                0xFF,
+                                                0x0,
+                                                0xFF,
+                                                0x0,
+                                                0xFF,
+                                                0x0,
+                                                0xFF,
+                                                0x0,
+                                                0xFF,
+                                                0x0,
+                                                0xFF,
+                                                0x0,
+                                                0xFF);
+
+    const __m128i mask_stage0 = _mm_set_epi8(0x0,
+                                             0xFF,
+                                             0x0,
+                                             0xFF,
+                                             0x0,
+                                             0xFF,
+                                             0x0,
+                                             0xFF,
+                                             0x0,
+                                             0xFF,
+                                             0x0,
+                                             0xFF,
+                                             0x0,
+                                             0xFF,
+                                             0x0,
+                                             0xFF);
+    // get some SIMD registers to play with.
+    __m256i r_frame0, r_temp0, shifted;
+    __m128i r_temp2, r_frame2, shifted2;
+    {
+        __m256i r_frame1, r_temp1;
+        __m128i r_frame3, r_temp3;
+        const __m256i shuffle_separate = _mm256_setr_epi8(0,
+                                                          2,
+                                                          4,
+                                                          6,
+                                                          8,
+                                                          10,
+                                                          12,
+                                                          14,
+                                                          1,
+                                                          3,
+                                                          5,
+                                                          7,
+                                                          9,
+                                                          11,
+                                                          13,
+                                                          15,
+                                                          0,
+                                                          2,
+                                                          4,
+                                                          6,
+                                                          8,
+                                                          10,
+                                                          12,
+                                                          14,
+                                                          1,
+                                                          3,
+                                                          5,
+                                                          7,
+                                                          9,
+                                                          11,
+                                                          13,
+                                                          15);
+        const __m128i shuffle_separate128 =
+            _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+
+        while (stage > 4) {
+            frame_ptr = frame;
+            temp_ptr = temp;
+
+            // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
+            for (branch = 0; branch < num_branches; ++branch) {
+                for (bit = 0; bit < frame_half; bit += 32) {
+                    if ((frame_half - bit) <
+                        32) // if only 16 bits remaining in frame, not 32
+                    {
+                        r_temp2 = _mm_load_si128((__m128i*)temp_ptr);
+                        temp_ptr += 16;
+                        r_temp3 = _mm_load_si128((__m128i*)temp_ptr);
+                        temp_ptr += 16;
+
+                        shifted2 = _mm_srli_si128(r_temp2, 1);
+                        shifted2 = _mm_and_si128(shifted2, mask_stage0);
+                        r_temp2 = _mm_xor_si128(shifted2, r_temp2);
+                        r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);
+
+                        shifted2 = _mm_srli_si128(r_temp3, 1);
+                        shifted2 = _mm_and_si128(shifted2, mask_stage0);
+                        r_temp3 = _mm_xor_si128(shifted2, r_temp3);
+                        r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
+
+                        r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
+                        _mm_store_si128((__m128i*)frame_ptr, r_frame2);
+
+                        r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
+                        _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
+                        frame_ptr += 16;
+                        break;
+                    }
+                    r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
+                    temp_ptr += 32;
+                    r_temp1 = _mm256_load_si256((__m256i*)temp_ptr);
+                    temp_ptr += 32;
+
+                    shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes
+                    shifted = _mm256_and_si256(shifted, mask_stage1);
+                    r_temp0 = _mm256_xor_si256(shifted, r_temp0);
+                    r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);
+
+                    shifted = _mm256_srli_si256(r_temp1, 1);
+                    shifted = _mm256_and_si256(shifted, mask_stage1);
+                    r_temp1 = _mm256_xor_si256(shifted, r_temp1);
+                    r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
+
+                    r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
+                    r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
+                    r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
+                    r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
+
+                    _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
+
+                    _mm256_store_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
+                    frame_ptr += 32;
+                }
+
+                frame_ptr += frame_half;
+            }
+            memcpy(temp, frame, sizeof(unsigned char) * frame_size);
+
+            num_branches = num_branches << 1;
+            frame_half = frame_half >> 1;
+            stage--;
          }
-
-        frame_ptr += frame_half;
-      }
-      memcpy(temp, frame, sizeof(unsigned char) * frame_size);
-
-      num_branches = num_branches << 1;
-      frame_half = frame_half >> 1;
-      stage--;
      }
-  }
-
-  // This last part requires at least 32-bit frames.
-  // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC!
  
-  // reset pointers to correct positions.
-  frame_ptr = frame;
-  temp_ptr = temp;
+    // This last part requires at least 32-bit frames.
+    // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC!
  
-  // prefetch first chunk.
-  __VOLK_PREFETCH(temp_ptr);
+    // reset pointers to correct positions.
+    frame_ptr = frame;
+    temp_ptr = temp;
  
-  const __m256i shuffle_stage4 = _mm256_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
-                                                  0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
-  const __m256i mask_stage4 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
-                                              0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-  const __m256i mask_stage3 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
-                                              0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
-  const __m256i mask_stage2 = _mm256_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
-                                              0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);
-
-  for(branch = 0; branch < num_branches/2; ++branch){
-    r_temp0 = _mm256_load_si256((__m256i*) temp_ptr);
-
-    // prefetch next chunk
-    temp_ptr += 32;
+    // prefetch first chunk.
      __VOLK_PREFETCH(temp_ptr);
  
-    // shuffle once for bit-reversal.
-    r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
-
-    shifted = _mm256_srli_si256(r_temp0, 8); //128 bit lanes
-    shifted = _mm256_and_si256(shifted, mask_stage4);
-    r_frame0 = _mm256_xor_si256(shifted, r_temp0);
-
-    shifted = _mm256_srli_si256(r_frame0, 4);
-    shifted = _mm256_and_si256(shifted, mask_stage3);
-    r_frame0 = _mm256_xor_si256(shifted, r_frame0);
-
-    shifted = _mm256_srli_si256(r_frame0, 2);
-    shifted = _mm256_and_si256(shifted, mask_stage2);
-    r_frame0 = _mm256_xor_si256(shifted, r_frame0);
-
-    shifted = _mm256_srli_si256(r_frame0, 1);
-    shifted = _mm256_and_si256(shifted, mask_stage1);
-    r_frame0 = _mm256_xor_si256(shifted, r_frame0);
-
-    // store result of chunk.
-    _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
-    frame_ptr += 32;
-  }
+    const __m256i shuffle_stage4 = _mm256_setr_epi8(0,
+                                                    8,
+                                                    4,
+                                                    12,
+                                                    2,
+                                                    10,
+                                                    6,
+                                                    14,
+                                                    1,
+                                                    9,
+                                                    5,
+                                                    13,
+                                                    3,
+                                                    11,
+                                                    7,
+                                                    15,
+                                                    0,
+                                                    8,
+                                                    4,
+                                                    12,
+                                                    2,
+                                                    10,
+                                                    6,
+                                                    14,
+                                                    1,
+                                                    9,
+                                                    5,
+                                                    13,
+                                                    3,
+                                                    11,
+                                                    7,
+                                                    15);
+    const __m256i mask_stage4 = _mm256_set_epi8(0x0,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF);
+    const __m256i mask_stage3 = _mm256_set_epi8(0x0,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0x0,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF,
+                                                0xFF);
+    const __m256i mask_stage2 = _mm256_set_epi8(0x0,
+                                                0x0,
+                                                0xFF,
+                                                0xFF,
+                                                0x0,
+                                                0x0,
+                                                0xFF,
+                                                0xFF,
+                                                0x0,
+                                                0x0,
+                                                0xFF,
+                                                0xFF,
+                                                0x0,
+                                                0x0,
+                                                0xFF,
+                                                0xFF,
+                                                0x0,
+                                                0x0,
+                                                0xFF,
+                                                0xFF,
+                                                0x0,
+                                                0x0,
+                                                0xFF,
+                                                0xFF,
+                                                0x0,
+                                                0x0,
+                                                0xFF,
+                                                0xFF,
+                                                0x0,
+                                                0x0,
+                                                0xFF,
+                                                0xFF);
+
+    for (branch = 0; branch < num_branches / 2; ++branch) {
+        r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
+
+        // prefetch next chunk
+        temp_ptr += 32;
+        __VOLK_PREFETCH(temp_ptr);
+
+        // shuffle once for bit-reversal.
+        r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
+
+        shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes
+        shifted = _mm256_and_si256(shifted, mask_stage4);
+        r_frame0 = _mm256_xor_si256(shifted, r_temp0);
+
+        shifted = _mm256_srli_si256(r_frame0, 4);
+        shifted = _mm256_and_si256(shifted, mask_stage3);
+        r_frame0 = _mm256_xor_si256(shifted, r_frame0);
+
+        shifted = _mm256_srli_si256(r_frame0, 2);
+        shifted = _mm256_and_si256(shifted, mask_stage2);
+        r_frame0 = _mm256_xor_si256(shifted, r_frame0);
+
+        shifted = _mm256_srli_si256(r_frame0, 1);
+        shifted = _mm256_and_si256(shifted, mask_stage1);
+        r_frame0 = _mm256_xor_si256(shifted, r_frame0);
+
+        // store result of chunk.
+        _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
+        frame_ptr += 32;
+    }
  }
  #endif /* LV_HAVE_AVX2 */
  
  
-
  #endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */
diff --git a/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h b/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h

index 5bccd95060d170b1f7fd0a548de783e1f5d3887c..413836ef246f1d1cb37a661c1accaac1119a1845 100644 (file)
--- a/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h
+++ b/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h
@@ -29,9 +29,9 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_8u_x3_encodepolar_8u(unsigned char* frame, const unsigned char* frozen_bit_mask, const unsigned char* frozen_bits,
- *                                  const unsigned char* info_bits, unsigned int frame_size, unsigned int info_bit_size)
- * \endcode
+ * void volk_8u_x3_encodepolar_8u(unsigned char* frame, const unsigned char*
+ * frozen_bit_mask, const unsigned char* frozen_bits, const unsigned char* info_bits,
+ * unsigned int frame_size, unsigned int info_bit_size) \endcode
   *
   * \b Inputs
   * \li frame: buffer for encoded frame
@@ -55,14 +55,17 @@
   * unsigned char* frozen_bit_mask = get_frozen_bit_mask(frame_size, num_frozen_bits);
   *
   * // set elements to desired values. Typically all zero.
- * unsigned char* frozen_bits = (unsigned char) volk_malloc(sizeof(unsigned char) * num_frozen_bits, volk_get_alignment());
+ * unsigned char* frozen_bits = (unsigned char) volk_malloc(sizeof(unsigned char) *
+ * num_frozen_bits, volk_get_alignment());
   *
- * unsigned char* frame = (unsigned char) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment());
- * unsigned char* temp = (unsigned char) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment());
+ * unsigned char* frame = (unsigned char) volk_malloc(sizeof(unsigned char) * frame_size,
+ * volk_get_alignment()); unsigned char* temp = (unsigned char)
+ * volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment());
   *
   * unsigned char* info_bits = get_info_bits_to_encode(num_info_bits);
   *
- * volk_8u_x3_encodepolar_8u_x2_generic(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ * volk_8u_x3_encodepolar_8u_x2_generic(frame, temp, frozen_bit_mask, frozen_bits,
+ * info_bits, frame_size);
   *
   * volk_free(frozen_bit_mask);
   * volk_free(frozen_bits);
@@ -77,27 +80,32 @@
  #include <stdio.h>
  #include <volk/volk_8u_x2_encodeframepolar_8u.h>
  
-static inline void
-interleave_frozen_and_info_bits(unsigned char* target, const unsigned char* frozen_bit_mask,
-                                const unsigned char* frozen_bits, const unsigned char* info_bits,
-                                const unsigned int frame_size)
+static inline void interleave_frozen_and_info_bits(unsigned char* target,
+                                                   const unsigned char* frozen_bit_mask,
+                                                   const unsigned char* frozen_bits,
+                                                   const unsigned char* info_bits,
+                                                   const unsigned int frame_size)
  {
-  unsigned int bit;
-  for(bit = 0; bit < frame_size; ++bit){
-    *target++ = *frozen_bit_mask++ ? *frozen_bits++ : *info_bits++;
-  }
+    unsigned int bit;
+    for (bit = 0; bit < frame_size; ++bit) {
+        *target++ = *frozen_bit_mask++ ? *frozen_bits++ : *info_bits++;
+    }
  }
  
  #ifdef LV_HAVE_GENERIC
  
  static inline void
-volk_8u_x3_encodepolar_8u_x2_generic(unsigned char* frame, unsigned char* temp, const unsigned char* frozen_bit_mask,
-                                     const unsigned char* frozen_bits, const unsigned char* info_bits,
+volk_8u_x3_encodepolar_8u_x2_generic(unsigned char* frame,
+                                     unsigned char* temp,
+                                     const unsigned char* frozen_bit_mask,
+                                     const unsigned char* frozen_bits,
+                                     const unsigned char* info_bits,
                                       unsigned int frame_size)
  {
-  // interleave
-  interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
-  volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
+    // interleave
+    interleave_frozen_and_info_bits(
+        temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+    volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
  }
  #endif /* LV_HAVE_GENERIC */
  
@@ -106,14 +114,17 @@ volk_8u_x3_encodepolar_8u_x2_generic(unsigned char* frame, unsigned char* temp,
  #include <tmmintrin.h>
  
  static inline void
-volk_8u_x3_encodepolar_8u_x2_u_ssse3(unsigned char* frame, unsigned char* temp,
-                                   const unsigned char* frozen_bit_mask,
-                                   const unsigned char* frozen_bits, const unsigned char* info_bits,
-                                   unsigned int frame_size)
+volk_8u_x3_encodepolar_8u_x2_u_ssse3(unsigned char* frame,
+                                     unsigned char* temp,
+                                     const unsigned char* frozen_bit_mask,
+                                     const unsigned char* frozen_bits,
+                                     const unsigned char* info_bits,
+                                     unsigned int frame_size)
  {
-  // interleave
-  interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
-  volk_8u_x2_encodeframepolar_8u_u_ssse3(frame, temp, frame_size);
+    // interleave
+    interleave_frozen_and_info_bits(
+        temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+    volk_8u_x2_encodeframepolar_8u_u_ssse3(frame, temp, frame_size);
  }
  
  #endif /* LV_HAVE_SSSE3 */
@@ -121,13 +132,16 @@ volk_8u_x3_encodepolar_8u_x2_u_ssse3(unsigned char* frame, unsigned char* temp,
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  static inline void
-volk_8u_x3_encodepolar_8u_x2_u_avx2(unsigned char* frame, unsigned char* temp,
-                                   const unsigned char* frozen_bit_mask,
-                                   const unsigned char* frozen_bits, const unsigned char* info_bits,
-                                   unsigned int frame_size)
+volk_8u_x3_encodepolar_8u_x2_u_avx2(unsigned char* frame,
+                                    unsigned char* temp,
+                                    const unsigned char* frozen_bit_mask,
+                                    const unsigned char* frozen_bits,
+                                    const unsigned char* info_bits,
+                                    unsigned int frame_size)
  {
-  interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
-  volk_8u_x2_encodeframepolar_8u_u_avx2(frame, temp, frame_size);
+    interleave_frozen_and_info_bits(
+        temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+    volk_8u_x2_encodeframepolar_8u_u_avx2(frame, temp, frame_size);
  }
  #endif /* LV_HAVE_AVX2 */
  
@@ -139,26 +153,32 @@ volk_8u_x3_encodepolar_8u_x2_u_avx2(unsigned char* frame, unsigned char* temp,
  #ifdef LV_HAVE_SSSE3
  #include <tmmintrin.h>
  static inline void
-volk_8u_x3_encodepolar_8u_x2_a_ssse3(unsigned char* frame, unsigned char* temp,
-                                   const unsigned char* frozen_bit_mask,
-                                   const unsigned char* frozen_bits, const unsigned char* info_bits,
-                                   unsigned int frame_size)
+volk_8u_x3_encodepolar_8u_x2_a_ssse3(unsigned char* frame,
+                                     unsigned char* temp,
+                                     const unsigned char* frozen_bit_mask,
+                                     const unsigned char* frozen_bits,
+                                     const unsigned char* info_bits,
+                                     unsigned int frame_size)
  {
-  interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
-  volk_8u_x2_encodeframepolar_8u_a_ssse3(frame, temp, frame_size);
+    interleave_frozen_and_info_bits(
+        temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+    volk_8u_x2_encodeframepolar_8u_a_ssse3(frame, temp, frame_size);
  }
  #endif /* LV_HAVE_SSSE3 */
  
  #ifdef LV_HAVE_AVX2
  #include <immintrin.h>
  static inline void
-volk_8u_x3_encodepolar_8u_x2_a_avx2(unsigned char* frame, unsigned char* temp,
-                                   const unsigned char* frozen_bit_mask,
-                                   const unsigned char* frozen_bits, const unsigned char* info_bits,
-                                   unsigned int frame_size)
+volk_8u_x3_encodepolar_8u_x2_a_avx2(unsigned char* frame,
+                                    unsigned char* temp,
+                                    const unsigned char* frozen_bit_mask,
+                                    const unsigned char* frozen_bits,
+                                    const unsigned char* info_bits,
+                                    unsigned int frame_size)
  {
-  interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
-  volk_8u_x2_encodeframepolar_8u_a_avx2(frame, temp, frame_size);
+    interleave_frozen_and_info_bits(
+        temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+    volk_8u_x2_encodeframepolar_8u_a_avx2(frame, temp, frame_size);
  }
  #endif /* LV_HAVE_AVX2 */
  
diff --git a/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h b/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h

index 1f6be2c76c13bb8e81be7519001c19e0ccfb51b2..1badbf14ea88f667e6012d155b8a5642530ff59e 100644 (file)
--- a/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h
+++ b/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h
@@ -29,71 +29,82 @@
  #include <volk/volk.h>
  #include <volk/volk_8u_x3_encodepolar_8u_x2.h>
  
-static inline unsigned int
-next_lower_power_of_two(const unsigned int val)
+static inline unsigned int next_lower_power_of_two(const unsigned int val)
  {
-  // algorithm found and adopted from: http://acius2.blogspot.de/2007/11/calculating-next-power-of-2.html
-  unsigned int res = val;
-  res = (res >> 1) | res;
-  res = (res >> 2) | res;
-  res = (res >> 4) | res;
-  res = (res >> 8) | res;
-  res = (res >> 16) | res;
-  res += 1;
-  return res >> 1;
+    // algorithm found and adopted from:
+    // http://acius2.blogspot.de/2007/11/calculating-next-power-of-2.html
+    unsigned int res = val;
+    res = (res >> 1) | res;
+    res = (res >> 2) | res;
+    res = (res >> 4) | res;
+    res = (res >> 8) | res;
+    res = (res >> 16) | res;
+    res += 1;
+    return res >> 1;
  }
  
-static inline void
-adjust_frozen_mask(unsigned char* mask, const unsigned int frame_size)
+static inline void adjust_frozen_mask(unsigned char* mask, const unsigned int frame_size)
  {
-  // just like the rest of the puppet this function exists for test purposes only.
-  unsigned int i;
-  for(i = 0; i < frame_size; ++i){
-    *mask = (*mask & 0x80) ? 0xFF : 0x00;
-    mask++;
-  }
+    // just like the rest of the puppet this function exists for test purposes only.
+    unsigned int i;
+    for (i = 0; i < frame_size; ++i) {
+        *mask = (*mask & 0x80) ? 0xFF : 0x00;
+        mask++;
+    }
  }
  
  #ifdef LV_HAVE_GENERIC
  static inline void
-volk_8u_x3_encodepolarpuppet_8u_generic(unsigned char* frame, unsigned char* frozen_bit_mask,
-    const unsigned char* frozen_bits, const unsigned char* info_bits,
-    unsigned int frame_size)
+volk_8u_x3_encodepolarpuppet_8u_generic(unsigned char* frame,
+                                        unsigned char* frozen_bit_mask,
+                                        const unsigned char* frozen_bits,
+                                        const unsigned char* info_bits,
+                                        unsigned int frame_size)
  {
-  frame_size = next_lower_power_of_two(frame_size);
-  unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment());
-  adjust_frozen_mask(frozen_bit_mask, frame_size);
-  volk_8u_x3_encodepolar_8u_x2_generic(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
-  volk_free(temp);
+    frame_size = next_lower_power_of_two(frame_size);
+    unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size,
+                                                      volk_get_alignment());
+    adjust_frozen_mask(frozen_bit_mask, frame_size);
+    volk_8u_x3_encodepolar_8u_x2_generic(
+        frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+    volk_free(temp);
  }
  #endif /* LV_HAVE_GENERIC */
  
  
  #ifdef LV_HAVE_SSSE3
  static inline void
-volk_8u_x3_encodepolarpuppet_8u_u_ssse3(unsigned char* frame, unsigned char* frozen_bit_mask,
-    const unsigned char* frozen_bits, const unsigned char* info_bits,
-    unsigned int frame_size)
+volk_8u_x3_encodepolarpuppet_8u_u_ssse3(unsigned char* frame,
+                                        unsigned char* frozen_bit_mask,
+                                        const unsigned char* frozen_bits,
+                                        const unsigned char* info_bits,
+                                        unsigned int frame_size)
  {
-  frame_size = next_lower_power_of_two(frame_size);
-  unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment());
-  adjust_frozen_mask(frozen_bit_mask, frame_size);
-  volk_8u_x3_encodepolar_8u_x2_u_ssse3(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
-  volk_free(temp);
+    frame_size = next_lower_power_of_two(frame_size);
+    unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size,
+                                                      volk_get_alignment());
+    adjust_frozen_mask(frozen_bit_mask, frame_size);
+    volk_8u_x3_encodepolar_8u_x2_u_ssse3(
+        frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+    volk_free(temp);
  }
  #endif /* LV_HAVE_SSSE3 */
  
  #ifdef LV_HAVE_AVX2
  static inline void
-volk_8u_x3_encodepolarpuppet_8u_u_avx2(unsigned char* frame, unsigned char* frozen_bit_mask,
-    const unsigned char* frozen_bits, const unsigned char* info_bits,
-    unsigned int frame_size)
+volk_8u_x3_encodepolarpuppet_8u_u_avx2(unsigned char* frame,
+                                       unsigned char* frozen_bit_mask,
+                                       const unsigned char* frozen_bits,
+                                       const unsigned char* info_bits,
+                                       unsigned int frame_size)
  {
-  frame_size = next_lower_power_of_two(frame_size);
-  unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment());
-  adjust_frozen_mask(frozen_bit_mask, frame_size);
-  volk_8u_x3_encodepolar_8u_x2_u_avx2(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
-  volk_free(temp);
+    frame_size = next_lower_power_of_two(frame_size);
+    unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size,
+                                                      volk_get_alignment());
+    adjust_frozen_mask(frozen_bit_mask, frame_size);
+    volk_8u_x3_encodepolar_8u_x2_u_avx2(
+        frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+    volk_free(temp);
  }
  #endif /* LV_HAVE_AVX2 */
  
@@ -104,29 +115,37 @@ volk_8u_x3_encodepolarpuppet_8u_u_avx2(unsigned char* frame, unsigned char* froz
  
  #ifdef LV_HAVE_SSSE3
  static inline void
-volk_8u_x3_encodepolarpuppet_8u_a_ssse3(unsigned char* frame, unsigned char* frozen_bit_mask,
-    const unsigned char* frozen_bits, const unsigned char* info_bits,
-    unsigned int frame_size)
+volk_8u_x3_encodepolarpuppet_8u_a_ssse3(unsigned char* frame,
+                                        unsigned char* frozen_bit_mask,
+                                        const unsigned char* frozen_bits,
+                                        const unsigned char* info_bits,
+                                        unsigned int frame_size)
  {
-  frame_size = next_lower_power_of_two(frame_size);
-  unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment());
-  adjust_frozen_mask(frozen_bit_mask, frame_size);
-  volk_8u_x3_encodepolar_8u_x2_a_ssse3(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
-  volk_free(temp);
+    frame_size = next_lower_power_of_two(frame_size);
+    unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size,
+                                                      volk_get_alignment());
+    adjust_frozen_mask(frozen_bit_mask, frame_size);
+    volk_8u_x3_encodepolar_8u_x2_a_ssse3(
+        frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+    volk_free(temp);
  }
  #endif /* LV_HAVE_SSSE3 */
  
  #ifdef LV_HAVE_AVX2
  static inline void
-volk_8u_x3_encodepolarpuppet_8u_a_avx2(unsigned char* frame, unsigned char* frozen_bit_mask,
-    const unsigned char* frozen_bits, const unsigned char* info_bits,
-    unsigned int frame_size)
+volk_8u_x3_encodepolarpuppet_8u_a_avx2(unsigned char* frame,
+                                       unsigned char* frozen_bit_mask,
+                                       const unsigned char* frozen_bits,
+                                       const unsigned char* info_bits,
+                                       unsigned int frame_size)
  {
-  frame_size = next_lower_power_of_two(frame_size);
-  unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment());
-  adjust_frozen_mask(frozen_bit_mask, frame_size);
-  volk_8u_x3_encodepolar_8u_x2_a_avx2(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
-  volk_free(temp);
+    frame_size = next_lower_power_of_two(frame_size);
+    unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size,
+                                                      volk_get_alignment());
+    adjust_frozen_mask(frozen_bit_mask, frame_size);
+    volk_8u_x3_encodepolar_8u_x2_a_avx2(
+        frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+    volk_free(temp);
  }
  #endif /* LV_HAVE_AVX2 */
  
diff --git a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h

index 029ba756bf135cec209fcc4fb2a95b048e76167c..89460a64b31449df6467ad8bff851b0622c85b35 100644 (file)
--- a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h
+++ b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h
@@ -30,8 +30,9 @@
   *
   * <b>Dispatcher Prototype</b>
   * \code
- * void volk_8u_x4_conv_k7_r2_8u(unsigned char* Y, unsigned char* X, unsigned char* syms, unsigned char* dec, unsigned int framebits, unsigned int excess, unsigned char* Branchtab)
- * \endcode
+ * void volk_8u_x4_conv_k7_r2_8u(unsigned char* Y, unsigned char* X, unsigned char* syms,
+ * unsigned char* dec, unsigned int framebits, unsigned int excess, unsigned char*
+ * Branchtab) \endcode
   *
   * \b Inputs
   * \li X: <FIXME>
@@ -58,67 +59,71 @@
  #define INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
  
  typedef union {
-  unsigned char/*DECISIONTYPE*/ t[64/*NUMSTATES*//8/*DECISIONTYPE_BITSIZE*/];
-  unsigned int w[64/*NUMSTATES*//32];
-  unsigned short s[64/*NUMSTATES*//16];
-  unsigned char c[64/*NUMSTATES*//8];
+    unsigned char /*DECISIONTYPE*/ t[64 /*NUMSTATES*/ / 8 /*DECISIONTYPE_BITSIZE*/];
+    unsigned int w[64 /*NUMSTATES*/ / 32];
+    unsigned short s[64 /*NUMSTATES*/ / 16];
+    unsigned char c[64 /*NUMSTATES*/ / 8];
  #ifdef _MSC_VER
  } decision_t;
  #else
-} decision_t __attribute__ ((aligned (16)));
+} decision_t __attribute__((aligned(16)));
  #endif
  
  
-static inline void
-renormalize(unsigned char* X, unsigned char threshold)
+static inline void renormalize(unsigned char* X, unsigned char threshold)
  {
-  int NUMSTATES = 64;
-  int i;
-
-  unsigned char min=X[0];
-  //if(min > threshold) {
-  for(i=0;i<NUMSTATES;i++)
-    if (min>X[i])
-      min=X[i];
-  for(i=0;i<NUMSTATES;i++)
-    X[i]-=min;
-  //}
+    int NUMSTATES = 64;
+    int i;
+
+    unsigned char min = X[0];
+    // if(min > threshold) {
+    for (i = 0; i < NUMSTATES; i++)
+        if (min > X[i])
+            min = X[i];
+    for (i = 0; i < NUMSTATES; i++)
+        X[i] -= min;
+    //}
  }
  
  
-//helper BFLY for GENERIC version
-static inline void
-BFLY(int i, int s, unsigned char * syms, unsigned char *Y,
-     unsigned char *X, decision_t * d, unsigned char* Branchtab)
+// helper BFLY for GENERIC version
+static inline void BFLY(int i,
+                        int s,
+                        unsigned char* syms,
+                        unsigned char* Y,
+                        unsigned char* X,
+                        decision_t* d,
+                        unsigned char* Branchtab)
  {
-  int j, decision0, decision1;
-  unsigned char metric,m0,m1,m2,m3;
+    int j, decision0, decision1;
+    unsigned char metric, m0, m1, m2, m3;
  
-  int NUMSTATES = 64;
-  int RATE = 2;
-  int METRICSHIFT = 1;
-  int PRECISIONSHIFT = 2;
+    int NUMSTATES = 64;
+    int RATE = 2;
+    int METRICSHIFT = 1;
+    int PRECISIONSHIFT = 2;
  
-  metric =0;
-  for(j=0;j<RATE;j++)
-    metric += (Branchtab[i+j*NUMSTATES/2] ^ syms[s*RATE+j])>>METRICSHIFT;
-  metric=metric>>PRECISIONSHIFT;
+    metric = 0;
+    for (j = 0; j < RATE; j++)
+        metric += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]) >> METRICSHIFT;
+    metric = metric >> PRECISIONSHIFT;
  
-  unsigned char max = ((RATE*((256 -1)>>METRICSHIFT))>>PRECISIONSHIFT);
+    unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT);
  
-  m0 = X[i] + metric;
-  m1 = X[i+NUMSTATES/2] + (max - metric);
-  m2 = X[i] + (max - metric);
-  m3 = X[i+NUMSTATES/2] + metric;
+    m0 = X[i] + metric;
+    m1 = X[i + NUMSTATES / 2] + (max - metric);
+    m2 = X[i] + (max - metric);
+    m3 = X[i + NUMSTATES / 2] + metric;
  
-  decision0 = (signed int)(m0-m1) > 0;
-  decision1 = (signed int)(m2-m3) > 0;
+    decision0 = (signed int)(m0 - m1) > 0;
+    decision1 = (signed int)(m2 - m3) > 0;
  
-  Y[2*i] = decision0 ? m1 : m0;
-  Y[2*i+1] =  decision1 ? m3 : m2;
+    Y[2 * i] = decision0 ? m1 : m0;
+    Y[2 * i + 1] = decision1 ? m3 : m2;
  
-  d->w[i/(sizeof(unsigned int)*8/2)+s*(sizeof(decision_t)/sizeof(unsigned int))] |=
-    (decision0|decision1<<1) << ((2*i)&(sizeof(unsigned int)*8-1));
+    d->w[i / (sizeof(unsigned int) * 8 / 2) +
+         s * (sizeof(decision_t) / sizeof(unsigned int))] |=
+        (decision0 | decision1 << 1) << ((2 * i) & (sizeof(unsigned int) * 8 - 1));
  }
  
  
@@ -127,188 +132,199 @@ BFLY(int i, int s, unsigned char * syms, unsigned char *Y,
  #include <immintrin.h>
  #include <stdio.h>
  
-static inline void
-volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y, unsigned char* X,
-                                unsigned char* syms, unsigned char* dec,
-                                unsigned int framebits, unsigned int excess,
-                                unsigned char* Branchtab)
+static inline void volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y,
+                                                 unsigned char* X,
+                                                 unsigned char* syms,
+                                                 unsigned char* dec,
+                                                 unsigned int framebits,
+                                                 unsigned int excess,
+                                                 unsigned char* Branchtab)
  {
-  unsigned int i9;
-  for(i9 = 0; i9 < ((framebits + excess)>>1); i9++) {
-    unsigned char a75, a81;
-    int a73, a92;
-    int s20, s21;
-    unsigned char  *a80, *b6;
-    int  *a110, *a91, *a93;
-    __m256i  *a112, *a71, *a72, *a77, *a83, *a95;
-    __m256i a86, a87;
-    __m256i a76, a78, a79, a82, a84, a85, a88, a89
-      , a90, d10, d9, m23, m24, m25
-      , m26, s18, s19, s22
-      , s23, s24, s25, t13, t14, t15;
-    a71 = ((__m256i  *) X);
-    s18 = *(a71);
-    a72 = (a71 + 1);
-    s19 = *(a72);
-    s22 = _mm256_permute2x128_si256(s18,s19,0x20);
-    s19 = _mm256_permute2x128_si256(s18,s19,0x31);
-    s18 = s22;
-    a73 = (4 * i9);
-    b6 = (syms + a73);
-    a75 = *(b6);
-    a76 = _mm256_set1_epi8(a75);
-    a77 = ((__m256i  *) Branchtab);
-    a78 = *(a77);
-    a79 = _mm256_xor_si256(a76, a78);
-    a80 = (b6 + 1);
-    a81 = *(a80);
-    a82 = _mm256_set1_epi8(a81);
-    a83 = (a77 + 1);
-    a84 = *(a83);
-    a85 = _mm256_xor_si256(a82, a84);
-    t13 = _mm256_avg_epu8(a79,a85);
-    a86 = ((__m256i ) t13);
-    a87 = _mm256_srli_epi16(a86, 2);
-    a88 = ((__m256i ) a87);
-    t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63));
-    t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14);
-    m23 = _mm256_adds_epu8(s18, t14);
-    m24 = _mm256_adds_epu8(s19, t15);
-    m25 = _mm256_adds_epu8(s18, t15);
-    m26 = _mm256_adds_epu8(s19, t14);
-    a89 = _mm256_min_epu8(m24, m23);
-    d9 = _mm256_cmpeq_epi8(a89, m24);
-    a90 = _mm256_min_epu8(m26, m25);
-    d10 = _mm256_cmpeq_epi8(a90, m26);
-    s22 = _mm256_unpacklo_epi8(d9,d10);
-    s23 = _mm256_unpackhi_epi8(d9,d10);
-    s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20));
-    a91 = ((int  *) dec);
-    a92 = (4 * i9);
-    a93 = (a91 + a92);
-    *(a93) = s20;
-    s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31));
-    a110 = (a93 + 1);
-    *(a110) = s21;
-    s22 = _mm256_unpacklo_epi8(a89, a90);
-    s23 = _mm256_unpackhi_epi8(a89, a90);
-    a95 = ((__m256i  *) Y);
-    s24 = _mm256_permute2x128_si256(s22, s23, 0x20);
-    *(a95) = s24;
-    s23 = _mm256_permute2x128_si256(s22, s23, 0x31);
-    a112 = (a95 + 1);
-    *(a112) = s23;
-    if ((((unsigned char  *) Y)[0]>210)) {
-      __m256i m5, m6;
-      m5 = ((__m256i  *) Y)[0];
-      m5 = _mm256_min_epu8(m5, ((__m256i  *) Y)[1]);
-      __m256i m7;
-      m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5);
-      m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 32)), ((__m256i ) m7)));
-      m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 16)), ((__m256i ) m7)));
-      m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 8)), ((__m256i ) m7)));
-      m7 = _mm256_unpacklo_epi8(m7, m7);
-      m7 = _mm256_shufflelo_epi16(m7, 0);
-      m6 = _mm256_unpacklo_epi64(m7, m7);
-      m6 = _mm256_permute2x128_si256(m6, m6, 0); //copy lower half of m6 to upper half, since above ops operate on 128 bit lanes
-      ((__m256i  *) Y)[0] = _mm256_subs_epu8(((__m256i  *) Y)[0], m6);
-      ((__m256i  *) Y)[1] = _mm256_subs_epu8(((__m256i  *) Y)[1], m6);
+    unsigned int i9;
+    for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
+        unsigned char a75, a81;
+        int a73, a92;
+        int s20, s21;
+        unsigned char *a80, *b6;
+        int *a110, *a91, *a93;
+        __m256i *a112, *a71, *a72, *a77, *a83, *a95;
+        __m256i a86, a87;
+        __m256i a76, a78, a79, a82, a84, a85, a88, a89, a90, d10, d9, m23, m24, m25, m26,
+            s18, s19, s22, s23, s24, s25, t13, t14, t15;
+        a71 = ((__m256i*)X);
+        s18 = *(a71);
+        a72 = (a71 + 1);
+        s19 = *(a72);
+        s22 = _mm256_permute2x128_si256(s18, s19, 0x20);
+        s19 = _mm256_permute2x128_si256(s18, s19, 0x31);
+        s18 = s22;
+        a73 = (4 * i9);
+        b6 = (syms + a73);
+        a75 = *(b6);
+        a76 = _mm256_set1_epi8(a75);
+        a77 = ((__m256i*)Branchtab);
+        a78 = *(a77);
+        a79 = _mm256_xor_si256(a76, a78);
+        a80 = (b6 + 1);
+        a81 = *(a80);
+        a82 = _mm256_set1_epi8(a81);
+        a83 = (a77 + 1);
+        a84 = *(a83);
+        a85 = _mm256_xor_si256(a82, a84);
+        t13 = _mm256_avg_epu8(a79, a85);
+        a86 = ((__m256i)t13);
+        a87 = _mm256_srli_epi16(a86, 2);
+        a88 = ((__m256i)a87);
+        t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63));
+        t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14);
+        m23 = _mm256_adds_epu8(s18, t14);
+        m24 = _mm256_adds_epu8(s19, t15);
+        m25 = _mm256_adds_epu8(s18, t15);
+        m26 = _mm256_adds_epu8(s19, t14);
+        a89 = _mm256_min_epu8(m24, m23);
+        d9 = _mm256_cmpeq_epi8(a89, m24);
+        a90 = _mm256_min_epu8(m26, m25);
+        d10 = _mm256_cmpeq_epi8(a90, m26);
+        s22 = _mm256_unpacklo_epi8(d9, d10);
+        s23 = _mm256_unpackhi_epi8(d9, d10);
+        s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20));
+        a91 = ((int*)dec);
+        a92 = (4 * i9);
+        a93 = (a91 + a92);
+        *(a93) = s20;
+        s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31));
+        a110 = (a93 + 1);
+        *(a110) = s21;
+        s22 = _mm256_unpacklo_epi8(a89, a90);
+        s23 = _mm256_unpackhi_epi8(a89, a90);
+        a95 = ((__m256i*)Y);
+        s24 = _mm256_permute2x128_si256(s22, s23, 0x20);
+        *(a95) = s24;
+        s23 = _mm256_permute2x128_si256(s22, s23, 0x31);
+        a112 = (a95 + 1);
+        *(a112) = s23;
+        if ((((unsigned char*)Y)[0] > 210)) {
+            __m256i m5, m6;
+            m5 = ((__m256i*)Y)[0];
+            m5 = _mm256_min_epu8(m5, ((__m256i*)Y)[1]);
+            __m256i m7;
+            m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5);
+            m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 32)),
+                                           ((__m256i)m7)));
+            m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 16)),
+                                           ((__m256i)m7)));
+            m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 8)),
+                                           ((__m256i)m7)));
+            m7 = _mm256_unpacklo_epi8(m7, m7);
+            m7 = _mm256_shufflelo_epi16(m7, 0);
+            m6 = _mm256_unpacklo_epi64(m7, m7);
+            m6 = _mm256_permute2x128_si256(
+                m6, m6, 0); // copy lower half of m6 to upper half, since above ops
+                            // operate on 128 bit lanes
+            ((__m256i*)Y)[0] = _mm256_subs_epu8(((__m256i*)Y)[0], m6);
+            ((__m256i*)Y)[1] = _mm256_subs_epu8(((__m256i*)Y)[1], m6);
+        }
+        unsigned char a188, a194;
+        int a205;
+        int s48, s54;
+        unsigned char *a187, *a193;
+        int *a204, *a206, *a223, *b16;
+        __m256i *a184, *a185, *a190, *a196, *a208, *a225;
+        __m256i a199, a200;
+        __m256i a189, a191, a192, a195, a197, a198, a201, a202, a203, d17, d18, m39, m40,
+            m41, m42, s46, s47, s50, s51, t25, t26, t27;
+        a184 = ((__m256i*)Y);
+        s46 = *(a184);
+        a185 = (a184 + 1);
+        s47 = *(a185);
+        s50 = _mm256_permute2x128_si256(s46, s47, 0x20);
+        s47 = _mm256_permute2x128_si256(s46, s47, 0x31);
+        s46 = s50;
+        a187 = (b6 + 2);
+        a188 = *(a187);
+        a189 = _mm256_set1_epi8(a188);
+        a190 = ((__m256i*)Branchtab);
+        a191 = *(a190);
+        a192 = _mm256_xor_si256(a189, a191);
+        a193 = (b6 + 3);
+        a194 = *(a193);
+        a195 = _mm256_set1_epi8(a194);
+        a196 = (a190 + 1);
+        a197 = *(a196);
+        a198 = _mm256_xor_si256(a195, a197);
+        t25 = _mm256_avg_epu8(a192, a198);
+        a199 = ((__m256i)t25);
+        a200 = _mm256_srli_epi16(a199, 2);
+        a201 = ((__m256i)a200);
+        t26 = _mm256_and_si256(a201, _mm256_set1_epi8(63));
+        t27 = _mm256_subs_epu8(_mm256_set1_epi8(63), t26);
+        m39 = _mm256_adds_epu8(s46, t26);
+        m40 = _mm256_adds_epu8(s47, t27);
+        m41 = _mm256_adds_epu8(s46, t27);
+        m42 = _mm256_adds_epu8(s47, t26);
+        a202 = _mm256_min_epu8(m40, m39);
+        d17 = _mm256_cmpeq_epi8(a202, m40);
+        a203 = _mm256_min_epu8(m42, m41);
+        d18 = _mm256_cmpeq_epi8(a203, m42);
+        s24 = _mm256_unpacklo_epi8(d17, d18);
+        s25 = _mm256_unpackhi_epi8(d17, d18);
+        s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
+        a204 = ((int*)dec);
+        a205 = (4 * i9);
+        b16 = (a204 + a205);
+        a206 = (b16 + 2);
+        *(a206) = s48;
+        s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
+        a223 = (b16 + 3);
+        *(a223) = s54;
+        s50 = _mm256_unpacklo_epi8(a202, a203);
+        s51 = _mm256_unpackhi_epi8(a202, a203);
+        s25 = _mm256_permute2x128_si256(s50, s51, 0x20);
+        s51 = _mm256_permute2x128_si256(s50, s51, 0x31);
+        a208 = ((__m256i*)X);
+        *(a208) = s25;
+        a225 = (a208 + 1);
+        *(a225) = s51;
+
+        if ((((unsigned char*)X)[0] > 210)) {
+            __m256i m12, m13;
+            m12 = ((__m256i*)X)[0];
+            m12 = _mm256_min_epu8(m12, ((__m256i*)X)[1]);
+            __m256i m14;
+            m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12);
+            m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 32)),
+                                            ((__m256i)m14)));
+            m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 16)),
+                                            ((__m256i)m14)));
+            m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 8)),
+                                            ((__m256i)m14)));
+            m14 = _mm256_unpacklo_epi8(m14, m14);
+            m14 = _mm256_shufflelo_epi16(m14, 0);
+            m13 = _mm256_unpacklo_epi64(m14, m14);
+            m13 = _mm256_permute2x128_si256(m13, m13, 0);
+            ((__m256i*)X)[0] = _mm256_subs_epu8(((__m256i*)X)[0], m13);
+            ((__m256i*)X)[1] = _mm256_subs_epu8(((__m256i*)X)[1], m13);
+        }
      }
-    unsigned char a188, a194;
-    int a205;
-    int s48, s54;
-    unsigned char  *a187, *a193;
-    int  *a204, *a206, *a223, *b16;
-    __m256i  *a184, *a185, *a190, *a196, *a208, *a225;
-    __m256i a199, a200;
-    __m256i a189, a191, a192, a195, a197, a198, a201
-      , a202, a203, d17, d18, m39, m40, m41
-      , m42, s46, s47, s50
-      , s51, t25, t26, t27;
-    a184 = ((__m256i  *) Y);
-    s46 = *(a184);
-    a185 = (a184 + 1);
-    s47 = *(a185);
-    s50 = _mm256_permute2x128_si256(s46,s47,0x20);
-    s47 = _mm256_permute2x128_si256(s46,s47,0x31);
-    s46 = s50;
-    a187 = (b6 + 2);
-    a188 = *(a187);
-    a189 = _mm256_set1_epi8(a188);
-    a190 = ((__m256i  *) Branchtab);
-    a191 = *(a190);
-    a192 = _mm256_xor_si256(a189, a191);
-    a193 = (b6 + 3);
-    a194 = *(a193);
-    a195 = _mm256_set1_epi8(a194);
-    a196 = (a190 + 1);
-    a197 = *(a196);
-    a198 = _mm256_xor_si256(a195, a197);
-    t25 = _mm256_avg_epu8(a192,a198);
-    a199 = ((__m256i ) t25);
-    a200 = _mm256_srli_epi16(a199, 2);
-    a201 = ((__m256i ) a200);
-    t26 = _mm256_and_si256(a201, _mm256_set1_epi8(63));
-    t27 = _mm256_subs_epu8(_mm256_set1_epi8(63), t26);
-    m39 = _mm256_adds_epu8(s46, t26);
-    m40 = _mm256_adds_epu8(s47, t27);
-    m41 = _mm256_adds_epu8(s46, t27);
-    m42 = _mm256_adds_epu8(s47, t26);
-    a202 = _mm256_min_epu8(m40, m39);
-    d17 = _mm256_cmpeq_epi8(a202, m40);
-    a203 = _mm256_min_epu8(m42, m41);
-    d18 = _mm256_cmpeq_epi8(a203, m42);
-    s24 = _mm256_unpacklo_epi8(d17,d18);
-    s25 = _mm256_unpackhi_epi8(d17,d18);
-    s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
-    a204 = ((int  *) dec);
-    a205 = (4 * i9);
-    b16 = (a204 + a205);
-    a206 = (b16 + 2);
-    *(a206) = s48;
-    s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
-    a223 = (b16 + 3);
-    *(a223) = s54;
-    s50 = _mm256_unpacklo_epi8(a202, a203);
-    s51 = _mm256_unpackhi_epi8(a202, a203);
-    s25 = _mm256_permute2x128_si256(s50, s51, 0x20);
-    s51 = _mm256_permute2x128_si256(s50, s51, 0x31);
-    a208 = ((__m256i  *) X);
-    *(a208) = s25;
-    a225 = (a208 + 1);
-    *(a225) = s51;
-
-    if ((((unsigned char  *) X)[0]>210)) {
-      __m256i m12, m13;
-      m12 = ((__m256i  *) X)[0];
-      m12 = _mm256_min_epu8(m12, ((__m256i  *) X)[1]);
-      __m256i m14;
-      m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12);
-      m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 32)), ((__m256i ) m14)));
-      m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 16)), ((__m256i ) m14)));
-      m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 8)), ((__m256i ) m14)));
-      m14 = _mm256_unpacklo_epi8(m14, m14);
-      m14 = _mm256_shufflelo_epi16(m14, 0);
-      m13 = _mm256_unpacklo_epi64(m14, m14);
-      m13 = _mm256_permute2x128_si256(m13, m13, 0);
-      ((__m256i  *) X)[0] = _mm256_subs_epu8(((__m256i  *) X)[0], m13);
-      ((__m256i  *) X)[1] = _mm256_subs_epu8(((__m256i  *) X)[1], m13);
-    }
-  }
-
-  renormalize(X, 210);
  
-  unsigned int j;
-  for(j=0; j < (framebits + excess) % 2; ++j) {
-    int i;
-    for(i=0;i<64/2;i++){
-      BFLY(i, (((framebits+excess) >> 1) << 1) + j , syms, Y, X, (decision_t *)dec, Branchtab);
+    renormalize(X, 210);
+
+    unsigned int j;
+    for (j = 0; j < (framebits + excess) % 2; ++j) {
+        int i;
+        for (i = 0; i < 64 / 2; i++) {
+            BFLY(i,
+                 (((framebits + excess) >> 1) << 1) + j,
+                 syms,
+                 Y,
+                 X,
+                 (decision_t*)dec,
+                 Branchtab);
+        }
+
+        renormalize(Y, 210);
      }
-
-    renormalize(Y, 210);
-
-  }
-  /*skip*/
+    /*skip*/
  }
  
  #endif /*LV_HAVE_AVX2*/
@@ -316,295 +332,300 @@ volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y, unsigned char* X,
  
  #if LV_HAVE_SSE3
  
-#include <pmmintrin.h>
  #include <emmintrin.h>
-#include <xmmintrin.h>
  #include <mmintrin.h>
+#include <pmmintrin.h>
  #include <stdio.h>
+#include <xmmintrin.h>
  
-static inline void
-volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y, unsigned char* X,
-                                unsigned char* syms, unsigned char* dec,
-                                unsigned int framebits, unsigned int excess,
-                                unsigned char* Branchtab)
+static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y,
+                                                   unsigned char* X,
+                                                   unsigned char* syms,
+                                                   unsigned char* dec,
+                                                   unsigned int framebits,
+                                                   unsigned int excess,
+                                                   unsigned char* Branchtab)
  {
-  unsigned int i9;
-  for(i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
-    unsigned char a75, a81;
-    int a73, a92;
-    short int s20, s21, s26, s27;
-    unsigned char  *a74, *a80, *b6;
-    short int  *a110, *a111, *a91, *a93, *a94;
-    __m128i  *a102, *a112, *a113, *a71, *a72, *a77, *a83
-      , *a95, *a96, *a97, *a98, *a99;
-    __m128i a105, a106, a86, a87;
-    __m128i a100, a101, a103, a104, a107, a108, a109
-      , a76, a78, a79, a82, a84, a85, a88, a89
-      , a90, d10, d11, d12, d9, m23, m24, m25
-      , m26, m27, m28, m29, m30, s18, s19, s22
-      , s23, s24, s25, s28, s29, t13, t14, t15
-      , t16, t17, t18;
-    a71 = ((__m128i  *) X);
-    s18 = *(a71);
-    a72 = (a71 + 2);
-    s19 = *(a72);
-    a73 = (4 * i9);
-    a74 = (syms + a73);
-    a75 = *(a74);
-    a76 = _mm_set1_epi8(a75);
-    a77 = ((__m128i  *) Branchtab);
-    a78 = *(a77);
-    a79 = _mm_xor_si128(a76, a78);
-    b6 = (a73 + syms);
-    a80 = (b6 + 1);
-    a81 = *(a80);
-    a82 = _mm_set1_epi8(a81);
-    a83 = (a77 + 2);
-    a84 = *(a83);
-    a85 = _mm_xor_si128(a82, a84);
-    t13 = _mm_avg_epu8(a79,a85);
-    a86 = ((__m128i ) t13);
-    a87 = _mm_srli_epi16(a86, 2);
-    a88 = ((__m128i ) a87);
-    t14 = _mm_and_si128(a88, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63
-                                         , 63, 63, 63, 63, 63, 63, 63, 63
-                                         , 63));
-    t15 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63
-                                    , 63, 63, 63, 63, 63, 63, 63, 63
-                                    , 63), t14);
-    m23 = _mm_adds_epu8(s18, t14);
-    m24 = _mm_adds_epu8(s19, t15);
-    m25 = _mm_adds_epu8(s18, t15);
-    m26 = _mm_adds_epu8(s19, t14);
-    a89 = _mm_min_epu8(m24, m23);
-    d9 = _mm_cmpeq_epi8(a89, m24);
-    a90 = _mm_min_epu8(m26, m25);
-    d10 = _mm_cmpeq_epi8(a90, m26);
-    s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9,d10));
-    a91 = ((short int  *) dec);
-    a92 = (8 * i9);
-    a93 = (a91 + a92);
-    *(a93) = s20;
-    s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9,d10));
-    a94 = (a93 + 1);
-    *(a94) = s21;
-    s22 = _mm_unpacklo_epi8(a89, a90);
-    s23 = _mm_unpackhi_epi8(a89, a90);
-    a95 = ((__m128i  *) Y);
-    *(a95) = s22;
-    a96 = (a95 + 1);
-    *(a96) = s23;
-    a97 = (a71 + 1);
-    s24 = *(a97);
-    a98 = (a71 + 3);
-    s25 = *(a98);
-    a99 = (a77 + 1);
-    a100 = *(a99);
-    a101 = _mm_xor_si128(a76, a100);
-    a102 = (a77 + 3);
-    a103 = *(a102);
-    a104 = _mm_xor_si128(a82, a103);
-    t16 = _mm_avg_epu8(a101,a104);
-    a105 = ((__m128i ) t16);
-    a106 = _mm_srli_epi16(a105, 2);
-    a107 = ((__m128i ) a106);
-    t17 = _mm_and_si128(a107, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63
-                                          , 63, 63, 63, 63, 63, 63, 63, 63
-                                          , 63));
-    t18 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63
-                                    , 63, 63, 63, 63, 63, 63, 63, 63
-                                    , 63), t17);
-    m27 = _mm_adds_epu8(s24, t17);
-    m28 = _mm_adds_epu8(s25, t18);
-    m29 = _mm_adds_epu8(s24, t18);
-    m30 = _mm_adds_epu8(s25, t17);
-    a108 = _mm_min_epu8(m28, m27);
-    d11 = _mm_cmpeq_epi8(a108, m28);
-    a109 = _mm_min_epu8(m30, m29);
-    d12 = _mm_cmpeq_epi8(a109, m30);
-    s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11,d12));
-    a110 = (a93 + 2);
-    *(a110) = s26;
-    s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11,d12));
-    a111 = (a93 + 3);
-    *(a111) = s27;
-    s28 = _mm_unpacklo_epi8(a108, a109);
-    s29 = _mm_unpackhi_epi8(a108, a109);
-    a112 = (a95 + 2);
-    *(a112) = s28;
-    a113 = (a95 + 3);
-    *(a113) = s29;
-    if ((((unsigned char  *) Y)[0]>210)) {
-      __m128i m5, m6;
-      m5 = ((__m128i  *) Y)[0];
-      m5 = _mm_min_epu8(m5, ((__m128i  *) Y)[1]);
-      m5 = _mm_min_epu8(m5, ((__m128i  *) Y)[2]);
-      m5 = _mm_min_epu8(m5, ((__m128i  *) Y)[3]);
-      __m128i m7;
-      m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
-      m7 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 32)), ((__m128i ) m7)));
-      m7 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 16)), ((__m128i ) m7)));
-      m7 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 8)), ((__m128i ) m7)));
-      m7 = _mm_unpacklo_epi8(m7, m7);
-      m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
-      m6 = _mm_unpacklo_epi64(m7, m7);
-      ((__m128i  *) Y)[0] = _mm_subs_epu8(((__m128i  *) Y)[0], m6);
-      ((__m128i  *) Y)[1] = _mm_subs_epu8(((__m128i  *) Y)[1], m6);
-      ((__m128i  *) Y)[2] = _mm_subs_epu8(((__m128i  *) Y)[2], m6);
-      ((__m128i  *) Y)[3] = _mm_subs_epu8(((__m128i  *) Y)[3], m6);
-    }
-    unsigned char a188, a194;
-    int a186, a205;
-    short int s48, s49, s54, s55;
-    unsigned char  *a187, *a193, *b15;
-    short int  *a204, *a206, *a207, *a223, *a224, *b16;
-    __m128i  *a184, *a185, *a190, *a196, *a208, *a209, *a210
-      , *a211, *a212, *a215, *a225, *a226;
-    __m128i a199, a200, a218, a219;
-    __m128i a189, a191, a192, a195, a197, a198, a201
-      , a202, a203, a213, a214, a216, a217, a220, a221
-      , a222, d17, d18, d19, d20, m39, m40, m41
-      , m42, m43, m44, m45, m46, s46, s47, s50
-      , s51, s52, s53, s56, s57, t25, t26, t27
-      , t28, t29, t30;
-    a184 = ((__m128i  *) Y);
-    s46 = *(a184);
-    a185 = (a184 + 2);
-    s47 = *(a185);
-    a186 = (4 * i9);
-    b15 = (a186 + syms);
-    a187 = (b15 + 2);
-    a188 = *(a187);
-    a189 = _mm_set1_epi8(a188);
-    a190 = ((__m128i  *) Branchtab);
-    a191 = *(a190);
-    a192 = _mm_xor_si128(a189, a191);
-    a193 = (b15 + 3);
-    a194 = *(a193);
-    a195 = _mm_set1_epi8(a194);
-    a196 = (a190 + 2);
-    a197 = *(a196);
-    a198 = _mm_xor_si128(a195, a197);
-    t25 = _mm_avg_epu8(a192,a198);
-    a199 = ((__m128i ) t25);
-    a200 = _mm_srli_epi16(a199, 2);
-    a201 = ((__m128i ) a200);
-    t26 = _mm_and_si128(a201, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63
-                                          , 63, 63, 63, 63, 63, 63, 63, 63
-                                          , 63));
-    t27 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63
-                                    , 63, 63, 63, 63, 63, 63, 63, 63
-                                    , 63), t26);
-    m39 = _mm_adds_epu8(s46, t26);
-    m40 = _mm_adds_epu8(s47, t27);
-    m41 = _mm_adds_epu8(s46, t27);
-    m42 = _mm_adds_epu8(s47, t26);
-    a202 = _mm_min_epu8(m40, m39);
-    d17 = _mm_cmpeq_epi8(a202, m40);
-    a203 = _mm_min_epu8(m42, m41);
-    d18 = _mm_cmpeq_epi8(a203, m42);
-    s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17,d18));
-    a204 = ((short int  *) dec);
-    a205 = (8 * i9);
-    b16 = (a204 + a205);
-    a206 = (b16 + 4);
-    *(a206) = s48;
-    s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17,d18));
-    a207 = (b16 + 5);
-    *(a207) = s49;
-    s50 = _mm_unpacklo_epi8(a202, a203);
-    s51 = _mm_unpackhi_epi8(a202, a203);
-    a208 = ((__m128i  *) X);
-    *(a208) = s50;
-    a209 = (a208 + 1);
-    *(a209) = s51;
-    a210 = (a184 + 1);
-    s52 = *(a210);
-    a211 = (a184 + 3);
-    s53 = *(a211);
-    a212 = (a190 + 1);
-    a213 = *(a212);
-    a214 = _mm_xor_si128(a189, a213);
-    a215 = (a190 + 3);
-    a216 = *(a215);
-    a217 = _mm_xor_si128(a195, a216);
-    t28 = _mm_avg_epu8(a214,a217);
-    a218 = ((__m128i ) t28);
-    a219 = _mm_srli_epi16(a218, 2);
-    a220 = ((__m128i ) a219);
-    t29 = _mm_and_si128(a220, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63
-                                          , 63, 63, 63, 63, 63, 63, 63, 63
-                                          , 63));
-    t30 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63
-                                    , 63, 63, 63, 63, 63, 63, 63, 63
-                                    , 63), t29);
-    m43 = _mm_adds_epu8(s52, t29);
-    m44 = _mm_adds_epu8(s53, t30);
-    m45 = _mm_adds_epu8(s52, t30);
-    m46 = _mm_adds_epu8(s53, t29);
-    a221 = _mm_min_epu8(m44, m43);
-    d19 = _mm_cmpeq_epi8(a221, m44);
-    a222 = _mm_min_epu8(m46, m45);
-    d20 = _mm_cmpeq_epi8(a222, m46);
-    s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19,d20));
-    a223 = (b16 + 6);
-    *(a223) = s54;
-    s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19,d20));
-    a224 = (b16 + 7);
-    *(a224) = s55;
-    s56 = _mm_unpacklo_epi8(a221, a222);
-    s57 = _mm_unpackhi_epi8(a221, a222);
-    a225 = (a208 + 2);
-    *(a225) = s56;
-    a226 = (a208 + 3);
-    *(a226) = s57;
-    if ((((unsigned char  *) X)[0]>210)) {
-      __m128i m12, m13;
-      m12 = ((__m128i  *) X)[0];
-      m12 = _mm_min_epu8(m12, ((__m128i  *) X)[1]);
-      m12 = _mm_min_epu8(m12, ((__m128i  *) X)[2]);
-      m12 = _mm_min_epu8(m12, ((__m128i  *) X)[3]);
-      __m128i m14;
-      m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
-      m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 32)), ((__m128i ) m14)));
-      m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 16)), ((__m128i ) m14)));
-      m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 8)), ((__m128i ) m14)));
-      m14 = _mm_unpacklo_epi8(m14, m14);
-      m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
-      m13 = _mm_unpacklo_epi64(m14, m14);
-      ((__m128i  *) X)[0] = _mm_subs_epu8(((__m128i  *) X)[0], m13);
-      ((__m128i  *) X)[1] = _mm_subs_epu8(((__m128i  *) X)[1], m13);
-      ((__m128i  *) X)[2] = _mm_subs_epu8(((__m128i  *) X)[2], m13);
-      ((__m128i  *) X)[3] = _mm_subs_epu8(((__m128i  *) X)[3], m13);
+    unsigned int i9;
+    for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
+        unsigned char a75, a81;
+        int a73, a92;
+        short int s20, s21, s26, s27;
+        unsigned char *a74, *a80, *b6;
+        short int *a110, *a111, *a91, *a93, *a94;
+        __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98, *a99;
+        __m128i a105, a106, a86, a87;
+        __m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85,
+            a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18,
+            s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18;
+        a71 = ((__m128i*)X);
+        s18 = *(a71);
+        a72 = (a71 + 2);
+        s19 = *(a72);
+        a73 = (4 * i9);
+        a74 = (syms + a73);
+        a75 = *(a74);
+        a76 = _mm_set1_epi8(a75);
+        a77 = ((__m128i*)Branchtab);
+        a78 = *(a77);
+        a79 = _mm_xor_si128(a76, a78);
+        b6 = (a73 + syms);
+        a80 = (b6 + 1);
+        a81 = *(a80);
+        a82 = _mm_set1_epi8(a81);
+        a83 = (a77 + 2);
+        a84 = *(a83);
+        a85 = _mm_xor_si128(a82, a84);
+        t13 = _mm_avg_epu8(a79, a85);
+        a86 = ((__m128i)t13);
+        a87 = _mm_srli_epi16(a86, 2);
+        a88 = ((__m128i)a87);
+        t14 = _mm_and_si128(
+            a88,
+            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
+        t15 = _mm_subs_epu8(
+            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
+            t14);
+        m23 = _mm_adds_epu8(s18, t14);
+        m24 = _mm_adds_epu8(s19, t15);
+        m25 = _mm_adds_epu8(s18, t15);
+        m26 = _mm_adds_epu8(s19, t14);
+        a89 = _mm_min_epu8(m24, m23);
+        d9 = _mm_cmpeq_epi8(a89, m24);
+        a90 = _mm_min_epu8(m26, m25);
+        d10 = _mm_cmpeq_epi8(a90, m26);
+        s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
+        a91 = ((short int*)dec);
+        a92 = (8 * i9);
+        a93 = (a91 + a92);
+        *(a93) = s20;
+        s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
+        a94 = (a93 + 1);
+        *(a94) = s21;
+        s22 = _mm_unpacklo_epi8(a89, a90);
+        s23 = _mm_unpackhi_epi8(a89, a90);
+        a95 = ((__m128i*)Y);
+        *(a95) = s22;
+        a96 = (a95 + 1);
+        *(a96) = s23;
+        a97 = (a71 + 1);
+        s24 = *(a97);
+        a98 = (a71 + 3);
+        s25 = *(a98);
+        a99 = (a77 + 1);
+        a100 = *(a99);
+        a101 = _mm_xor_si128(a76, a100);
+        a102 = (a77 + 3);
+        a103 = *(a102);
+        a104 = _mm_xor_si128(a82, a103);
+        t16 = _mm_avg_epu8(a101, a104);
+        a105 = ((__m128i)t16);
+        a106 = _mm_srli_epi16(a105, 2);
+        a107 = ((__m128i)a106);
+        t17 = _mm_and_si128(
+            a107,
+            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
+        t18 = _mm_subs_epu8(
+            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
+            t17);
+        m27 = _mm_adds_epu8(s24, t17);
+        m28 = _mm_adds_epu8(s25, t18);
+        m29 = _mm_adds_epu8(s24, t18);
+        m30 = _mm_adds_epu8(s25, t17);
+        a108 = _mm_min_epu8(m28, m27);
+        d11 = _mm_cmpeq_epi8(a108, m28);
+        a109 = _mm_min_epu8(m30, m29);
+        d12 = _mm_cmpeq_epi8(a109, m30);
+        s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
+        a110 = (a93 + 2);
+        *(a110) = s26;
+        s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
+        a111 = (a93 + 3);
+        *(a111) = s27;
+        s28 = _mm_unpacklo_epi8(a108, a109);
+        s29 = _mm_unpackhi_epi8(a108, a109);
+        a112 = (a95 + 2);
+        *(a112) = s28;
+        a113 = (a95 + 3);
+        *(a113) = s29;
+        if ((((unsigned char*)Y)[0] > 210)) {
+            __m128i m5, m6;
+            m5 = ((__m128i*)Y)[0];
+            m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
+            m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
+            m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
+            __m128i m7;
+            m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
+            m7 =
+                ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7)));
+            m7 =
+                ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7)));
+            m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7)));
+            m7 = _mm_unpacklo_epi8(m7, m7);
+            m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
+            m6 = _mm_unpacklo_epi64(m7, m7);
+            ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
+            ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
+            ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
+            ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);
+        }
+        unsigned char a188, a194;
+        int a186, a205;
+        short int s48, s49, s54, s55;
+        unsigned char *a187, *a193, *b15;
+        short int *a204, *a206, *a207, *a223, *a224, *b16;
+        __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215,
+            *a225, *a226;
+        __m128i a199, a200, a218, a219;
+        __m128i a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216,
+            a217, a220, a221, a222, d17, d18, d19, d20, m39, m40, m41, m42, m43, m44, m45,
+            m46, s46, s47, s50, s51, s52, s53, s56, s57, t25, t26, t27, t28, t29, t30;
+        a184 = ((__m128i*)Y);
+        s46 = *(a184);
+        a185 = (a184 + 2);
+        s47 = *(a185);
+        a186 = (4 * i9);
+        b15 = (a186 + syms);
+        a187 = (b15 + 2);
+        a188 = *(a187);
+        a189 = _mm_set1_epi8(a188);
+        a190 = ((__m128i*)Branchtab);
+        a191 = *(a190);
+        a192 = _mm_xor_si128(a189, a191);
+        a193 = (b15 + 3);
+        a194 = *(a193);
+        a195 = _mm_set1_epi8(a194);
+        a196 = (a190 + 2);
+        a197 = *(a196);
+        a198 = _mm_xor_si128(a195, a197);
+        t25 = _mm_avg_epu8(a192, a198);
+        a199 = ((__m128i)t25);
+        a200 = _mm_srli_epi16(a199, 2);
+        a201 = ((__m128i)a200);
+        t26 = _mm_and_si128(
+            a201,
+            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
+        t27 = _mm_subs_epu8(
+            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
+            t26);
+        m39 = _mm_adds_epu8(s46, t26);
+        m40 = _mm_adds_epu8(s47, t27);
+        m41 = _mm_adds_epu8(s46, t27);
+        m42 = _mm_adds_epu8(s47, t26);
+        a202 = _mm_min_epu8(m40, m39);
+        d17 = _mm_cmpeq_epi8(a202, m40);
+        a203 = _mm_min_epu8(m42, m41);
+        d18 = _mm_cmpeq_epi8(a203, m42);
+        s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18));
+        a204 = ((short int*)dec);
+        a205 = (8 * i9);
+        b16 = (a204 + a205);
+        a206 = (b16 + 4);
+        *(a206) = s48;
+        s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18));
+        a207 = (b16 + 5);
+        *(a207) = s49;
+        s50 = _mm_unpacklo_epi8(a202, a203);
+        s51 = _mm_unpackhi_epi8(a202, a203);
+        a208 = ((__m128i*)X);
+        *(a208) = s50;
+        a209 = (a208 + 1);
+        *(a209) = s51;
+        a210 = (a184 + 1);
+        s52 = *(a210);
+        a211 = (a184 + 3);
+        s53 = *(a211);
+        a212 = (a190 + 1);
+        a213 = *(a212);
+        a214 = _mm_xor_si128(a189, a213);
+        a215 = (a190 + 3);
+        a216 = *(a215);
+        a217 = _mm_xor_si128(a195, a216);
+        t28 = _mm_avg_epu8(a214, a217);
+        a218 = ((__m128i)t28);
+        a219 = _mm_srli_epi16(a218, 2);
+        a220 = ((__m128i)a219);
+        t29 = _mm_and_si128(
+            a220,
+            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
+        t30 = _mm_subs_epu8(
+            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
+            t29);
+        m43 = _mm_adds_epu8(s52, t29);
+        m44 = _mm_adds_epu8(s53, t30);
+        m45 = _mm_adds_epu8(s52, t30);
+        m46 = _mm_adds_epu8(s53, t29);
+        a221 = _mm_min_epu8(m44, m43);
+        d19 = _mm_cmpeq_epi8(a221, m44);
+        a222 = _mm_min_epu8(m46, m45);
+        d20 = _mm_cmpeq_epi8(a222, m46);
+        s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20));
+        a223 = (b16 + 6);
+        *(a223) = s54;
+        s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19, d20));
+        a224 = (b16 + 7);
+        *(a224) = s55;
+        s56 = _mm_unpacklo_epi8(a221, a222);
+        s57 = _mm_unpackhi_epi8(a221, a222);
+        a225 = (a208 + 2);
+        *(a225) = s56;
+        a226 = (a208 + 3);
+        *(a226) = s57;
+        if ((((unsigned char*)X)[0] > 210)) {
+            __m128i m12, m13;
+            m12 = ((__m128i*)X)[0];
+            m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]);
+            m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]);
+            m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]);
+            __m128i m14;
+            m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
+            m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)),
+                                         ((__m128i)m14)));
+            m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)),
+                                         ((__m128i)m14)));
+            m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)),
+                                         ((__m128i)m14)));
+            m14 = _mm_unpacklo_epi8(m14, m14);
+            m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
+            m13 = _mm_unpacklo_epi64(m14, m14);
+            ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13);
+            ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13);
+            ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13);
+            ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13);
+        }
      }
-  }
-
-  renormalize(X, 210);
  
-  /*int ch;
-  for(ch = 0; ch < 64; ch++) {
-    printf("%d,", X[ch]);
-  }
-  printf("\n");*/
-
-  unsigned int j;
-  for(j=0; j < (framebits + excess) % 2; ++j) {
-    int i;
-    for(i=0;i<64/2;i++){
-      BFLY(i, (((framebits+excess) >> 1) << 1) + j , syms, Y, X, (decision_t *)dec, Branchtab);
-    }
+    renormalize(X, 210);
  
-
-    renormalize(Y, 210);
-
-    /*printf("\n");
+    /*int ch;
      for(ch = 0; ch < 64; ch++) {
-      printf("%d,", Y[ch]);
+      printf("%d,", X[ch]);
      }
      printf("\n");*/
  
-  }
-  /*skip*/
+    unsigned int j;
+    for (j = 0; j < (framebits + excess) % 2; ++j) {
+        int i;
+        for (i = 0; i < 64 / 2; i++) {
+            BFLY(i,
+                 (((framebits + excess) >> 1) << 1) + j,
+                 syms,
+                 Y,
+                 X,
+                 (decision_t*)dec,
+                 Branchtab);
+        }
+
+
+        renormalize(Y, 210);
+
+        /*printf("\n");
+        for(ch = 0; ch < 64; ch++) {
+          printf("%d,", Y[ch]);
+        }
+        printf("\n");*/
+    }
+    /*skip*/
  }
  
  #endif /*LV_HAVE_SSE3*/
@@ -612,30 +633,32 @@ volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y, unsigned char* X,
  
  #if LV_HAVE_GENERIC
  
-static inline void
-volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y, unsigned char* X,
-                                 unsigned char* syms, unsigned char* dec,
-                                 unsigned int framebits, unsigned int excess,
-                                 unsigned char* Branchtab)
+static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y,
+                                                    unsigned char* X,
+                                                    unsigned char* syms,
+                                                    unsigned char* dec,
+                                                    unsigned int framebits,
+                                                    unsigned int excess,
+                                                    unsigned char* Branchtab)
  {
-  int nbits = framebits + excess;
-  int NUMSTATES = 64;
-  int RENORMALIZE_THRESHOLD = 210;
-
-  int s,i;
-  for (s=0;s<nbits;s++){
-    void *tmp;
-    for(i=0;i<NUMSTATES/2;i++){
-      BFLY(i, s, syms, Y, X, (decision_t *)dec, Branchtab);
+    int nbits = framebits + excess;
+    int NUMSTATES = 64;
+    int RENORMALIZE_THRESHOLD = 210;
+
+    int s, i;
+    for (s = 0; s < nbits; s++) {
+        void* tmp;
+        for (i = 0; i < NUMSTATES / 2; i++) {
+            BFLY(i, s, syms, Y, X, (decision_t*)dec, Branchtab);
+        }
+
+        renormalize(Y, RENORMALIZE_THRESHOLD);
+
+        ///     Swap pointers to old and new metrics
+        tmp = (void*)X;
+        X = Y;
+        Y = (unsigned char*)tmp;
      }
-
-    renormalize(Y, RENORMALIZE_THRESHOLD);
-
-    ///     Swap pointers to old and new metrics
-    tmp = (void *)X;
-    X = Y;
-    Y = (unsigned char*)tmp;
-  }
  }
  
  #endif /* LV_HAVE_GENERIC */
diff --git a/lib/kernel_tests.h b/lib/kernel_tests.h

index 855248846090ccf4a03b391f4cf35e454df5ccfd..51be069468cf6e81d9807f77fc4182718fbed467 100644 (file)
--- a/lib/kernel_tests.h
+++ b/lib/kernel_tests.h
@@ -8,13 +8,18 @@
  
  // for puppets we need to get all the func_variants for the puppet and just
  // keep track of the actual function name to write to results
-#define VOLK_INIT_PUPP(func, puppet_master_func, test_params)\
-    volk_test_case_t(func##_get_func_desc(), (void(*)())func##_manual, std::string(#func),\
-    std::string(#puppet_master_func), test_params)
+#define VOLK_INIT_PUPP(func, puppet_master_func, test_params) \
+    volk_test_case_t(func##_get_func_desc(),                  \
+                     (void (*)())func##_manual,               \
+                     std::string(#func),                      \
+                     std::string(#puppet_master_func),        \
+                     test_params)
  
-#define VOLK_INIT_TEST(func, test_params)\
-    volk_test_case_t(func##_get_func_desc(), (void(*)())func##_manual, std::string(#func),\
-    test_params)
+#define VOLK_INIT_TEST(func, test_params)       \
+    volk_test_case_t(func##_get_func_desc(),    \
+                     (void (*)())func##_manual, \
+                     std::string(#func),        \
+                     test_params)
  
  #define QA(test) test_cases.push_back(test);
  std::vector<volk_test_case_t> init_test_list(volk_test_params_t test_params)
@@ -32,127 +37,135 @@ std::vector<volk_test_case_t> init_test_list(volk_test_params_t test_params)
      test_params_rotator.set_tol(1e-3);
  
      std::vector<volk_test_case_t> test_cases;
-    QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt,     test_params))
-    QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt,     test_params))
-    QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt,     test_params))
+    QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params))
+    QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params))
+    QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params))
      QA(VOLK_INIT_PUPP(volk_16u_byteswappuppet_16u, volk_16u_byteswap, test_params))
      QA(VOLK_INIT_PUPP(volk_32u_byteswappuppet_32u, volk_32u_byteswap, test_params))
-    QA(VOLK_INIT_PUPP(volk_32u_popcntpuppet_32u, volk_32u_popcnt_32u,  test_params))
+    QA(VOLK_INIT_PUPP(volk_32u_popcntpuppet_32u, volk_32u_popcnt_32u, test_params))
      QA(VOLK_INIT_PUPP(volk_64u_byteswappuppet_64u, volk_64u_byteswap, test_params))
-    QA(VOLK_INIT_PUPP(volk_32fc_s32fc_rotatorpuppet_32fc, volk_32fc_s32fc_x2_rotator_32fc, test_params_rotator))
-    QA(VOLK_INIT_PUPP(volk_8u_conv_k7_r2puppet_8u, volk_8u_x4_conv_k7_r2_8u, test_params.make_tol(0)))
-    QA(VOLK_INIT_PUPP(volk_32f_x2_fm_detectpuppet_32f, volk_32f_s32f_32f_fm_detect_32f, test_params))
-    QA(VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_real_32f,           test_params))
-    QA(VOLK_INIT_TEST(volk_16ic_deinterleave_real_8i,                 test_params))
-    QA(VOLK_INIT_TEST(volk_16ic_deinterleave_16i_x2,                  test_params))
-    QA(VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_32f_x2,             test_params))
-    QA(VOLK_INIT_TEST(volk_16ic_deinterleave_real_16i,                test_params))
-    QA(VOLK_INIT_TEST(volk_16ic_magnitude_16i,                        test_params))
-    QA(VOLK_INIT_TEST(volk_16ic_s32f_magnitude_32f,                   test_params))
-    QA(VOLK_INIT_TEST(volk_16ic_convert_32fc,                         test_params))
-    QA(VOLK_INIT_TEST(volk_16ic_x2_multiply_16ic,                     test_params))
-    QA(VOLK_INIT_TEST(volk_16ic_x2_dot_prod_16ic,                     test_params))
-    QA(VOLK_INIT_TEST(volk_16i_s32f_convert_32f,                      test_params))
-    QA(VOLK_INIT_TEST(volk_16i_convert_8i,                            test_params))
-    QA(VOLK_INIT_TEST(volk_16i_32fc_dot_prod_32fc,                    test_params_inacc))
-    QA(VOLK_INIT_TEST(volk_32f_accumulator_s32f,                      test_params_inacc))
-    QA(VOLK_INIT_TEST(volk_32f_x2_add_32f,                            test_params))
-    QA(VOLK_INIT_TEST(volk_32f_index_max_16u,                         test_params))
-    QA(VOLK_INIT_TEST(volk_32f_index_max_32u,                         test_params))
-    QA(VOLK_INIT_TEST(volk_32fc_32f_multiply_32fc,                    test_params))
-    QA(VOLK_INIT_TEST(volk_32fc_32f_add_32fc,                         test_params))
-    QA(VOLK_INIT_TEST(volk_32f_log2_32f,                              test_params.make_absolute(1e-5)))
-    QA(VOLK_INIT_TEST(volk_32f_expfast_32f,                           test_params_inacc_tenth))
-    QA(VOLK_INIT_TEST(volk_32f_x2_pow_32f,                            test_params_inacc))
-    QA(VOLK_INIT_TEST(volk_32f_sin_32f,                               test_params_inacc))
-    QA(VOLK_INIT_TEST(volk_32f_cos_32f,                               test_params_inacc))
-    QA(VOLK_INIT_TEST(volk_32f_tan_32f,                               test_params_inacc))
-    QA(VOLK_INIT_TEST(volk_32f_atan_32f,                              test_params_inacc))
-    QA(VOLK_INIT_TEST(volk_32f_asin_32f,                              test_params_inacc))
-    QA(VOLK_INIT_TEST(volk_32f_acos_32f,                              test_params_inacc))
-    QA(VOLK_INIT_TEST(volk_32fc_s32f_power_32fc,                      test_params_power))
-    QA(VOLK_INIT_TEST(volk_32f_s32f_calc_spectral_noise_floor_32f,    test_params_inacc))
-    QA(VOLK_INIT_TEST(volk_32fc_s32f_atan2_32f,                       test_params))
-    QA(VOLK_INIT_TEST(volk_32fc_x2_conjugate_dot_prod_32fc,           test_params_inacc_tenth))
-    QA(VOLK_INIT_TEST(volk_32fc_deinterleave_32f_x2,                  test_params))
-    QA(VOLK_INIT_TEST(volk_32fc_deinterleave_64f_x2,                  test_params))
-    QA(VOLK_INIT_TEST(volk_32fc_s32f_deinterleave_real_16i,           test_params))
-    QA(VOLK_INIT_TEST(volk_32fc_deinterleave_imag_32f,                test_params))
-    QA(VOLK_INIT_TEST(volk_32fc_deinterleave_real_32f,                test_params))
-    QA(VOLK_INIT_TEST(volk_32fc_deinterleave_real_64f,                test_params))
-    QA(VOLK_INIT_TEST(volk_32fc_x2_dot_prod_32fc,                     test_params_inacc))
-    QA(VOLK_INIT_TEST(volk_32fc_32f_dot_prod_32fc,                    test_params_inacc))
-    QA(VOLK_INIT_TEST(volk_32fc_index_max_16u,                        test_params))
-    QA(VOLK_INIT_TEST(volk_32fc_index_max_32u,                        test_params))
-    QA(VOLK_INIT_TEST(volk_32fc_s32f_magnitude_16i,                   test_params))
-    QA(VOLK_INIT_TEST(volk_32fc_magnitude_32f,                        test_params_inacc_tenth))
-    QA(VOLK_INIT_TEST(volk_32fc_magnitude_squared_32f,                test_params))
-    QA(VOLK_INIT_TEST(volk_32fc_x2_add_32fc,                          test_params))
-    QA(VOLK_INIT_TEST(volk_32fc_x2_multiply_32fc,                     test_params))
-    QA(VOLK_INIT_TEST(volk_32fc_x2_multiply_conjugate_32fc,           test_params))
-    QA(VOLK_INIT_TEST(volk_32fc_x2_divide_32fc,                       test_params))
-    QA(VOLK_INIT_TEST(volk_32fc_conjugate_32fc,                       test_params))
-    QA(VOLK_INIT_TEST(volk_32f_s32f_convert_16i,                      test_params))
-    QA(VOLK_INIT_TEST(volk_32f_s32f_convert_32i,                      test_params))
-    QA(VOLK_INIT_TEST(volk_32f_convert_64f,                           test_params))
-    QA(VOLK_INIT_TEST(volk_32f_s32f_convert_8i,                       test_params))
-    QA(VOLK_INIT_TEST(volk_32fc_convert_16ic,                         test_params))
-    QA(VOLK_INIT_TEST(volk_32fc_s32f_power_spectrum_32f,              test_params))
-    QA(VOLK_INIT_TEST(volk_32fc_x2_square_dist_32f,                   test_params))
-    QA(VOLK_INIT_TEST(volk_32fc_x2_s32f_square_dist_scalar_mult_32f,  test_params))
-    QA(VOLK_INIT_TEST(volk_32f_x2_divide_32f,                         test_params))
-    QA(VOLK_INIT_TEST(volk_32f_x2_dot_prod_32f,                       test_params_inacc))
-    QA(VOLK_INIT_TEST(volk_32f_x2_s32f_interleave_16ic,               test_params))
-    QA(VOLK_INIT_TEST(volk_32f_x2_interleave_32fc,                    test_params))
-    QA(VOLK_INIT_TEST(volk_32f_x2_max_32f,                            test_params))
-    QA(VOLK_INIT_TEST(volk_32f_x2_min_32f,                            test_params))
-    QA(VOLK_INIT_TEST(volk_32f_x2_multiply_32f,                       test_params))
-    QA(VOLK_INIT_TEST(volk_32f_64f_multiply_64f,                      test_params))
-    QA(VOLK_INIT_TEST(volk_32f_64f_add_64f,                           test_params))
-    QA(VOLK_INIT_TEST(volk_32f_s32f_normalize,                        test_params))
-    QA(VOLK_INIT_TEST(volk_32f_s32f_power_32f,                        test_params))
-    QA(VOLK_INIT_TEST(volk_32f_sqrt_32f,                              test_params_inacc))
-    QA(VOLK_INIT_TEST(volk_32f_s32f_stddev_32f,                       test_params_inacc))
-    QA(VOLK_INIT_TEST(volk_32f_stddev_and_mean_32f_x2,                test_params_inacc))
-    QA(VOLK_INIT_TEST(volk_32f_x2_subtract_32f,                       test_params))
-    QA(VOLK_INIT_TEST(volk_32f_x3_sum_of_poly_32f,                    test_params_inacc))
-    QA(VOLK_INIT_TEST(volk_32i_x2_and_32i,                            test_params))
-    QA(VOLK_INIT_TEST(volk_32i_s32f_convert_32f,                      test_params))
-    QA(VOLK_INIT_TEST(volk_32i_x2_or_32i,                             test_params))
-    QA(VOLK_INIT_TEST(volk_32f_x2_dot_prod_16i,                       test_params))
-    QA(VOLK_INIT_TEST(volk_64f_convert_32f,                           test_params))
-    QA(VOLK_INIT_TEST(volk_64f_x2_max_64f,                            test_params))
-    QA(VOLK_INIT_TEST(volk_64f_x2_min_64f,                            test_params))
-    QA(VOLK_INIT_TEST(volk_64f_x2_multiply_64f,                       test_params))
-    QA(VOLK_INIT_TEST(volk_64f_x2_add_64f,                            test_params))
-    QA(VOLK_INIT_TEST(volk_8ic_deinterleave_16i_x2,                   test_params))
-    QA(VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_32f_x2,              test_params))
-    QA(VOLK_INIT_TEST(volk_8ic_deinterleave_real_16i,                 test_params))
-    QA(VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_real_32f,            test_params))
-    QA(VOLK_INIT_TEST(volk_8ic_deinterleave_real_8i,                  test_params))
-    QA(VOLK_INIT_TEST(volk_8ic_x2_multiply_conjugate_16ic,            test_params))
-    QA(VOLK_INIT_TEST(volk_8ic_x2_s32f_multiply_conjugate_32fc,       test_params))
-    QA(VOLK_INIT_TEST(volk_8i_convert_16i,                            test_params))
-    QA(VOLK_INIT_TEST(volk_8i_s32f_convert_32f,                       test_params))
-    QA(VOLK_INIT_TEST(volk_32fc_s32fc_multiply_32fc,                  test_params))
-    QA(VOLK_INIT_TEST(volk_32f_s32f_multiply_32f,                     test_params))
-    QA(VOLK_INIT_TEST(volk_32f_binary_slicer_32i,                     test_params))
-    QA(VOLK_INIT_TEST(volk_32f_binary_slicer_8i,                      test_params))
-    QA(VOLK_INIT_TEST(volk_32u_reverse_32u,                            test_params))
-    QA(VOLK_INIT_TEST(volk_32f_tanh_32f,                              test_params_inacc))
-    QA(VOLK_INIT_TEST(volk_32f_s32f_mod_rangepuppet_32f,              test_params))
+    QA(VOLK_INIT_PUPP(volk_32fc_s32fc_rotatorpuppet_32fc,
+                      volk_32fc_s32fc_x2_rotator_32fc,
+                      test_params_rotator))
+    QA(VOLK_INIT_PUPP(
+        volk_8u_conv_k7_r2puppet_8u, volk_8u_x4_conv_k7_r2_8u, test_params.make_tol(0)))
+    QA(VOLK_INIT_PUPP(
+        volk_32f_x2_fm_detectpuppet_32f, volk_32f_s32f_32f_fm_detect_32f, test_params))
+    QA(VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_real_32f, test_params))
+    QA(VOLK_INIT_TEST(volk_16ic_deinterleave_real_8i, test_params))
+    QA(VOLK_INIT_TEST(volk_16ic_deinterleave_16i_x2, test_params))
+    QA(VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_32f_x2, test_params))
+    QA(VOLK_INIT_TEST(volk_16ic_deinterleave_real_16i, test_params))
+    QA(VOLK_INIT_TEST(volk_16ic_magnitude_16i, test_params))
+    QA(VOLK_INIT_TEST(volk_16ic_s32f_magnitude_32f, test_params))
+    QA(VOLK_INIT_TEST(volk_16ic_convert_32fc, test_params))
+    QA(VOLK_INIT_TEST(volk_16ic_x2_multiply_16ic, test_params))
+    QA(VOLK_INIT_TEST(volk_16ic_x2_dot_prod_16ic, test_params))
+    QA(VOLK_INIT_TEST(volk_16i_s32f_convert_32f, test_params))
+    QA(VOLK_INIT_TEST(volk_16i_convert_8i, test_params))
+    QA(VOLK_INIT_TEST(volk_16i_32fc_dot_prod_32fc, test_params_inacc))
+    QA(VOLK_INIT_TEST(volk_32f_accumulator_s32f, test_params_inacc))
+    QA(VOLK_INIT_TEST(volk_32f_x2_add_32f, test_params))
+    QA(VOLK_INIT_TEST(volk_32f_index_max_16u, test_params))
+    QA(VOLK_INIT_TEST(volk_32f_index_max_32u, test_params))
+    QA(VOLK_INIT_TEST(volk_32fc_32f_multiply_32fc, test_params))
+    QA(VOLK_INIT_TEST(volk_32fc_32f_add_32fc, test_params))
+    QA(VOLK_INIT_TEST(volk_32f_log2_32f, test_params.make_absolute(1e-5)))
+    QA(VOLK_INIT_TEST(volk_32f_expfast_32f, test_params_inacc_tenth))
+    QA(VOLK_INIT_TEST(volk_32f_x2_pow_32f, test_params_inacc))
+    QA(VOLK_INIT_TEST(volk_32f_sin_32f, test_params_inacc))
+    QA(VOLK_INIT_TEST(volk_32f_cos_32f, test_params_inacc))
+    QA(VOLK_INIT_TEST(volk_32f_tan_32f, test_params_inacc))
+    QA(VOLK_INIT_TEST(volk_32f_atan_32f, test_params_inacc))
+    QA(VOLK_INIT_TEST(volk_32f_asin_32f, test_params_inacc))
+    QA(VOLK_INIT_TEST(volk_32f_acos_32f, test_params_inacc))
+    QA(VOLK_INIT_TEST(volk_32fc_s32f_power_32fc, test_params_power))
+    QA(VOLK_INIT_TEST(volk_32f_s32f_calc_spectral_noise_floor_32f, test_params_inacc))
+    QA(VOLK_INIT_TEST(volk_32fc_s32f_atan2_32f, test_params))
+    QA(VOLK_INIT_TEST(volk_32fc_x2_conjugate_dot_prod_32fc, test_params_inacc_tenth))
+    QA(VOLK_INIT_TEST(volk_32fc_deinterleave_32f_x2, test_params))
+    QA(VOLK_INIT_TEST(volk_32fc_deinterleave_64f_x2, test_params))
+    QA(VOLK_INIT_TEST(volk_32fc_s32f_deinterleave_real_16i, test_params))
+    QA(VOLK_INIT_TEST(volk_32fc_deinterleave_imag_32f, test_params))
+    QA(VOLK_INIT_TEST(volk_32fc_deinterleave_real_32f, test_params))
+    QA(VOLK_INIT_TEST(volk_32fc_deinterleave_real_64f, test_params))
+    QA(VOLK_INIT_TEST(volk_32fc_x2_dot_prod_32fc, test_params_inacc))
+    QA(VOLK_INIT_TEST(volk_32fc_32f_dot_prod_32fc, test_params_inacc))
+    QA(VOLK_INIT_TEST(volk_32fc_index_max_16u, test_params))
+    QA(VOLK_INIT_TEST(volk_32fc_index_max_32u, test_params))
+    QA(VOLK_INIT_TEST(volk_32fc_s32f_magnitude_16i, test_params))
+    QA(VOLK_INIT_TEST(volk_32fc_magnitude_32f, test_params_inacc_tenth))
+    QA(VOLK_INIT_TEST(volk_32fc_magnitude_squared_32f, test_params))
+    QA(VOLK_INIT_TEST(volk_32fc_x2_add_32fc, test_params))
+    QA(VOLK_INIT_TEST(volk_32fc_x2_multiply_32fc, test_params))
+    QA(VOLK_INIT_TEST(volk_32fc_x2_multiply_conjugate_32fc, test_params))
+    QA(VOLK_INIT_TEST(volk_32fc_x2_divide_32fc, test_params))
+    QA(VOLK_INIT_TEST(volk_32fc_conjugate_32fc, test_params))
+    QA(VOLK_INIT_TEST(volk_32f_s32f_convert_16i, test_params))
+    QA(VOLK_INIT_TEST(volk_32f_s32f_convert_32i, test_params))
+    QA(VOLK_INIT_TEST(volk_32f_convert_64f, test_params))
+    QA(VOLK_INIT_TEST(volk_32f_s32f_convert_8i, test_params))
+    QA(VOLK_INIT_TEST(volk_32fc_convert_16ic, test_params))
+    QA(VOLK_INIT_TEST(volk_32fc_s32f_power_spectrum_32f, test_params))
+    QA(VOLK_INIT_TEST(volk_32fc_x2_square_dist_32f, test_params))
+    QA(VOLK_INIT_TEST(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, test_params))
+    QA(VOLK_INIT_TEST(volk_32f_x2_divide_32f, test_params))
+    QA(VOLK_INIT_TEST(volk_32f_x2_dot_prod_32f, test_params_inacc))
+    QA(VOLK_INIT_TEST(volk_32f_x2_s32f_interleave_16ic, test_params))
+    QA(VOLK_INIT_TEST(volk_32f_x2_interleave_32fc, test_params))
+    QA(VOLK_INIT_TEST(volk_32f_x2_max_32f, test_params))
+    QA(VOLK_INIT_TEST(volk_32f_x2_min_32f, test_params))
+    QA(VOLK_INIT_TEST(volk_32f_x2_multiply_32f, test_params))
+    QA(VOLK_INIT_TEST(volk_32f_64f_multiply_64f, test_params))
+    QA(VOLK_INIT_TEST(volk_32f_64f_add_64f, test_params))
+    QA(VOLK_INIT_TEST(volk_32f_s32f_normalize, test_params))
+    QA(VOLK_INIT_TEST(volk_32f_s32f_power_32f, test_params))
+    QA(VOLK_INIT_TEST(volk_32f_sqrt_32f, test_params_inacc))
+    QA(VOLK_INIT_TEST(volk_32f_s32f_stddev_32f, test_params_inacc))
+    QA(VOLK_INIT_TEST(volk_32f_stddev_and_mean_32f_x2, test_params_inacc))
+    QA(VOLK_INIT_TEST(volk_32f_x2_subtract_32f, test_params))
+    QA(VOLK_INIT_TEST(volk_32f_x3_sum_of_poly_32f, test_params_inacc))
+    QA(VOLK_INIT_TEST(volk_32i_x2_and_32i, test_params))
+    QA(VOLK_INIT_TEST(volk_32i_s32f_convert_32f, test_params))
+    QA(VOLK_INIT_TEST(volk_32i_x2_or_32i, test_params))
+    QA(VOLK_INIT_TEST(volk_32f_x2_dot_prod_16i, test_params))
+    QA(VOLK_INIT_TEST(volk_64f_convert_32f, test_params))
+    QA(VOLK_INIT_TEST(volk_64f_x2_max_64f, test_params))
+    QA(VOLK_INIT_TEST(volk_64f_x2_min_64f, test_params))
+    QA(VOLK_INIT_TEST(volk_64f_x2_multiply_64f, test_params))
+    QA(VOLK_INIT_TEST(volk_64f_x2_add_64f, test_params))
+    QA(VOLK_INIT_TEST(volk_8ic_deinterleave_16i_x2, test_params))
+    QA(VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_32f_x2, test_params))
+    QA(VOLK_INIT_TEST(volk_8ic_deinterleave_real_16i, test_params))
+    QA(VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_real_32f, test_params))
+    QA(VOLK_INIT_TEST(volk_8ic_deinterleave_real_8i, test_params))
+    QA(VOLK_INIT_TEST(volk_8ic_x2_multiply_conjugate_16ic, test_params))
+    QA(VOLK_INIT_TEST(volk_8ic_x2_s32f_multiply_conjugate_32fc, test_params))
+    QA(VOLK_INIT_TEST(volk_8i_convert_16i, test_params))
+    QA(VOLK_INIT_TEST(volk_8i_s32f_convert_32f, test_params))
+    QA(VOLK_INIT_TEST(volk_32fc_s32fc_multiply_32fc, test_params))
+    QA(VOLK_INIT_TEST(volk_32f_s32f_multiply_32f, test_params))
+    QA(VOLK_INIT_TEST(volk_32f_binary_slicer_32i, test_params))
+    QA(VOLK_INIT_TEST(volk_32f_binary_slicer_8i, test_params))
+    QA(VOLK_INIT_TEST(volk_32u_reverse_32u, test_params))
+    QA(VOLK_INIT_TEST(volk_32f_tanh_32f, test_params_inacc))
+    QA(VOLK_INIT_TEST(volk_32f_s32f_mod_rangepuppet_32f, test_params))
      QA(VOLK_INIT_TEST(volk_32fc_x2_s32fc_multiply_conjugate_add_32fc, test_params))
-    QA(VOLK_INIT_PUPP(volk_8u_x3_encodepolarpuppet_8u, volk_8u_x3_encodepolar_8u_x2, test_params))
-    QA(VOLK_INIT_PUPP(volk_32f_8u_polarbutterflypuppet_32f, volk_32f_8u_polarbutterfly_32f, test_params))
-    QA(VOLK_INIT_TEST(volk_32f_exp_32f,                               test_params))
-
+    QA(VOLK_INIT_PUPP(
+        volk_8u_x3_encodepolarpuppet_8u, volk_8u_x3_encodepolar_8u_x2, test_params))
+    QA(VOLK_INIT_PUPP(volk_32f_8u_polarbutterflypuppet_32f,
+                      volk_32f_8u_polarbutterfly_32f,
+                      test_params))
      // no one uses these, so don't test them
-    //VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex);
-    //VOLK_PROFILE(volk_16i_branch_4_state_8, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex);
-    //VOLK_PROFILE(volk_16i_max_star_16i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
-    //VOLK_PROFILE(volk_16i_max_star_horizontal_16i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
-    //VOLK_PROFILE(volk_16i_permute_and_scalar_add, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
-    //VOLK_PROFILE(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
+    // VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results,
+    // benchmark_mode, kernel_regex); VOLK_PROFILE(volk_16i_branch_4_state_8, 1e-4, 2046,
+    // 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_16i_max_star_16i,
+    // 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
+    // VOLK_PROFILE(volk_16i_max_star_horizontal_16i, 0, 0, 204602, 10000, &results,
+    // benchmark_mode, kernel_regex); VOLK_PROFILE(volk_16i_permute_and_scalar_add, 1e-4,
+    // 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
+    // VOLK_PROFILE(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 10000, &results,
+    // benchmark_mode, kernel_regex);
      // we need a puppet for this one
      //(VOLK_INIT_TEST(volk_32fc_s32f_x2_power_spectral_density_32f,   test_params))
  
diff --git a/lib/qa_utils.cc b/lib/qa_utils.cc

index 76df0696ae75a3e026c0882411bd99b9a95ee0e5..1dcee6e0b26d88c390cf3e84cf82671652f28978 100644 (file)
--- a/lib/qa_utils.cc
+++ b/lib/qa_utils.cc
@@ -1,79 +1,94 @@
-#include <volk/volk.h>
  #include "qa_utils.h"
+#include <volk/volk.h>
  
-#include <volk/volk.h>                              // for volk_func_desc_t
-#include <volk/volk_malloc.h>                       // for volk_free, volk_m...
+#include <volk/volk.h>        // for volk_func_desc_t
+#include <volk/volk_malloc.h> // for volk_free, volk_m...
  
-#include <assert.h>                                 // for assert
-#include <stdint.h>                                 // for uint16_t, uint64_t
-#include <sys/time.h>                               // for CLOCKS_PER_SEC
-#include <sys/types.h>                              // for int16_t, int32_t
+#include <assert.h>    // for assert
+#include <stdint.h>    // for uint16_t, uint64_t
+#include <sys/time.h>  // for CLOCKS_PER_SEC
+#include <sys/types.h> // for int16_t, int32_t
  #include <chrono>
-#include <cmath>                                    // for sqrt, fabs, abs
-#include <cstring>                                  // for memcpy, memset
-#include <ctime>                                    // for clock
-#include <fstream>                                  // for operator<<, basic...
-#include <iostream>                                 // for cout, cerr
-#include <limits>                                   // for numeric_limits
-#include <map>                                      // for map, map<>::mappe...
+#include <cmath>    // for sqrt, fabs, abs
+#include <cstring>  // for memcpy, memset
+#include <ctime>    // for clock
+#include <fstream>  // for operator<<, basic...
+#include <iostream> // for cout, cerr
+#include <limits>   // for numeric_limits
+#include <map>      // for map, map<>::mappe...
  #include <random>
-#include <vector>                                   // for vector, _Bit_refe...
+#include <vector> // for vector, _Bit_refe...
  
  template <typename T>
-void random_floats(void *buf, unsigned int n, std::default_random_engine& rnd_engine)
+void random_floats(void* buf, unsigned int n, std::default_random_engine& rnd_engine)
  {
-    T *array = static_cast<T*>(buf);
+    T* array = static_cast<T*>(buf);
      std::uniform_real_distribution<T> uniform_dist(T(-1), T(1));
-    for(unsigned int i = 0; i < n; i++) {
+    for (unsigned int i = 0; i < n; i++) {
          array[i] = uniform_dist(rnd_engine);
      }
  }
  
-void load_random_data(void *data, volk_type_t type, unsigned int n) {
+void load_random_data(void* data, volk_type_t type, unsigned int n)
+{
      std::random_device rnd_device;
      std::default_random_engine rnd_engine(rnd_device());
-    if(type.is_complex) n *= 2;
-    if(type.is_float) {
-        if(type.size == 8) {
+    if (type.is_complex)
+        n *= 2;
+    if (type.is_float) {
+        if (type.size == 8) {
              random_floats<double>(data, n, rnd_engine);
          } else {
-            random_floats<float> (data, n, rnd_engine);
+            random_floats<float>(data, n, rnd_engine);
          }
      } else {
-        float int_max = float(uint64_t(2) << (type.size*8));
-        if(type.is_signed) int_max /= 2.0;
+        float int_max = float(uint64_t(2) << (type.size * 8));
+        if (type.is_signed)
+            int_max /= 2.0;
          std::uniform_real_distribution<float> uniform_dist(-int_max, int_max);
-        for(unsigned int i=0; i<n; i++) {
+        for (unsigned int i = 0; i < n; i++) {
              float scaled_rand = uniform_dist(rnd_engine);
-            //man i really don't know how to do this in a more clever way, you have to cast down at some point
-            switch(type.size) {
+            // man i really don't know how to do this in a more clever way, you have to
+            // cast down at some point
+            switch (type.size) {
              case 8:
-                if(type.is_signed) ((int64_t *)data)[i] = (int64_t) scaled_rand;
-                else ((uint64_t *)data)[i] = (uint64_t) scaled_rand;
-            break;
+                if (type.is_signed)
+                    ((int64_t*)data)[i] = (int64_t)scaled_rand;
+                else
+                    ((uint64_t*)data)[i] = (uint64_t)scaled_rand;
+                break;
              case 4:
-                if(type.is_signed) ((int32_t *)data)[i] = (int32_t) scaled_rand;
-                else ((uint32_t *)data)[i] = (uint32_t) scaled_rand;
-            break;
+                if (type.is_signed)
+                    ((int32_t*)data)[i] = (int32_t)scaled_rand;
+                else
+                    ((uint32_t*)data)[i] = (uint32_t)scaled_rand;
+                break;
              case 2:
-                if(type.is_signed) ((int16_t *)data)[i] = (int16_t)((int16_t) scaled_rand % 8);
-                else ((uint16_t *)data)[i] = (uint16_t) ((int16_t) scaled_rand % 8);
-            break;
+                if (type.is_signed)
+                    ((int16_t*)data)[i] = (int16_t)((int16_t)scaled_rand % 8);
+                else
+                    ((uint16_t*)data)[i] = (uint16_t)((int16_t)scaled_rand % 8);
+                break;
              case 1:
-                if(type.is_signed) ((int8_t *)data)[i] = (int8_t) scaled_rand;
-                else ((uint8_t *)data)[i] = (uint8_t) scaled_rand;
-            break;
+                if (type.is_signed)
+                    ((int8_t*)data)[i] = (int8_t)scaled_rand;
+                else
+                    ((uint8_t*)data)[i] = (uint8_t)scaled_rand;
+                break;
              default:
-                throw "load_random_data: no support for data size > 8 or < 1"; //no shenanigans here
+                throw "load_random_data: no support for data size > 8 or < 1"; // no
+                                                                               // shenanigans
+                                                                               // here
              }
          }
      }
  }
  
-static std::vector<std::string> get_arch_list(volk_func_desc_t desc) {
+static std::vector<std::string> get_arch_list(volk_func_desc_t desc)
+{
      std::vector<std::string> archlist;
  
-    for(size_t i = 0; i < desc.n_impls; i++) {
+    for (size_t i = 0; i < desc.n_impls; i++) {
          archlist.push_back(std::string(desc.impl_names[i]));
      }
  
@@ -96,7 +111,8 @@ T volk_lexical_cast(const std::string& str)
      return var;
  }
  
-volk_type_t volk_type_from_string(std::string name) {
+volk_type_t volk_type_from_string(std::string name)
+{
      volk_type_t type;
      type.is_float = false;
      type.is_scalar = false;
@@ -105,28 +121,28 @@ volk_type_t volk_type_from_string(std::string name) {
      type.size = 0;
      type.str = name;
  
-    if(name.size() < 2) {
+    if (name.size() < 2) {
          throw std::string("name too short to be a datatype");
      }
  
-    //is it a scalar?
-    if(name[0] == 's') {
+    // is it a scalar?
+    if (name[0] == 's') {
          type.is_scalar = true;
-        name = name.substr(1, name.size()-1);
+        name = name.substr(1, name.size() - 1);
      }
  
-    //get the data size
+    // get the data size
      size_t last_size_pos = name.find_last_of("0123456789");
-    if(last_size_pos == std::string::npos) {
+    if (last_size_pos == std::string::npos) {
          throw std::string("no size spec in type ").append(name);
      }
-    //will throw if malformed
-    int size = volk_lexical_cast<int>(name.substr(0, last_size_pos+1));
+    // will throw if malformed
+    int size = volk_lexical_cast<int>(name.substr(0, last_size_pos + 1));
  
      assert(((size % 8) == 0) && (size <= 64) && (size != 0));
-    type.size = size/8; //in bytes
+    type.size = size / 8; // in bytes
  
-    for(size_t i=last_size_pos+1; i < name.size(); i++) {
+    for (size_t i = last_size_pos + 1; i < name.size(); i++) {
          switch (name[i]) {
          case 'f':
              type.is_float = true;
@@ -148,7 +164,8 @@ volk_type_t volk_type_from_string(std::string name) {
      return type;
  }
  
-std::vector<std::string> split_signature(const std::string &protokernel_signature) {
+std::vector<std::string> split_signature(const std::string& protokernel_signature)
+{
      std::vector<std::string> signature_tokens;
      std::string token;
      for (unsigned int loc = 0; loc < protokernel_signature.size(); ++loc) {
@@ -165,16 +182,17 @@ std::vector<std::string> split_signature(const std::string &protokernel_signatur
      return signature_tokens;
  }
  
-static void get_signatures_from_name(std::vector<volk_type_t> &inputsig,
-                                   std::vector<volk_type_t> &outputsig,
-                                   std::string name) {
+static void get_signatures_from_name(std::vector<volk_type_t>& inputsig,
+                                     std::vector<volk_type_t>& outputsig,
+                                     std::string name)
+{
  
      std::vector<std::string> toked = split_signature(name);
  
      assert(toked[0] == "volk");
      toked.erase(toked.begin());
  
-    //ok. we're assuming a string in the form
+    // ok. we're assuming a string in the form
      //(sig)_(multiplier-opt)_..._(name)_(sig)_(multiplier-opt)_..._(alignment)
  
      enum { SIDE_INPUT, SIDE_NAME, SIDE_OUTPUT } side = SIDE_INPUT;
@@ -184,106 +202,184 @@ static void get_signatures_from_name(std::vector<volk_type_t> &inputsig,
          std::string token = toked[token_index];
          try {
              type = volk_type_from_string(token);
-            if(side == SIDE_NAME) side = SIDE_OUTPUT; //if this is the first one after the name...
-
-            if(side == SIDE_INPUT) inputsig.push_back(type);
-            else outputsig.push_back(type);
-        } catch (...){
-            if(token[0] == 'x' && (token.size() > 1) && (token[1] > '0' && token[1] < '9')) { //it's a multiplier
-                if(side == SIDE_INPUT) assert(inputsig.size() > 0);
-                else assert(outputsig.size() > 0);
-                int multiplier = volk_lexical_cast<int>(token.substr(1, token.size()-1)); //will throw if invalid
-                for(int i=1; i<multiplier; i++) {
-                    if(side == SIDE_INPUT) inputsig.push_back(inputsig.back());
-                    else outputsig.push_back(outputsig.back());
+            if (side == SIDE_NAME)
+                side = SIDE_OUTPUT; // if this is the first one after the name...
+
+            if (side == SIDE_INPUT)
+                inputsig.push_back(type);
+            else
+                outputsig.push_back(type);
+        } catch (...) {
+            if (token[0] == 'x' && (token.size() > 1) &&
+                (token[1] > '0' && token[1] < '9')) { // it's a multiplier
+                if (side == SIDE_INPUT)
+                    assert(inputsig.size() > 0);
+                else
+                    assert(outputsig.size() > 0);
+                int multiplier = volk_lexical_cast<int>(
+                    token.substr(1, token.size() - 1)); // will throw if invalid
+                for (int i = 1; i < multiplier; i++) {
+                    if (side == SIDE_INPUT)
+                        inputsig.push_back(inputsig.back());
+                    else
+                        outputsig.push_back(outputsig.back());
                  }
-            }
-            else if(side == SIDE_INPUT) { //it's the function name, at least it better be
+            } else if (side ==
+                       SIDE_INPUT) { // it's the function name, at least it better be
                  side = SIDE_NAME;
                  fn_name.append("_");
                  fn_name.append(token);
-            }
-            else if(side == SIDE_OUTPUT) {
-                if(token != toked.back()) throw; //the last token in the name is the alignment
+            } else if (side == SIDE_OUTPUT) {
+                if (token != toked.back())
+                    throw; // the last token in the name is the alignment
              }
          }
      }
-    //we don't need an output signature (some fn's operate on the input data, "in place"), but we do need at least one input!
+    // we don't need an output signature (some fn's operate on the input data, "in
+    // place"), but we do need at least one input!
      assert(inputsig.size() != 0);
-
  }
  
-inline void run_cast_test1(volk_fn_1arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
-    while(iter--) func(buffs[0], vlen, arch.c_str());
+inline void run_cast_test1(volk_fn_1arg func,
+                           std::vector<void*>& buffs,
+                           unsigned int vlen,
+                           unsigned int iter,
+                           std::string arch)
+{
+    while (iter--)
+        func(buffs[0], vlen, arch.c_str());
  }
  
-inline void run_cast_test2(volk_fn_2arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
-    while(iter--) func(buffs[0], buffs[1], vlen, arch.c_str());
+inline void run_cast_test2(volk_fn_2arg func,
+                           std::vector<void*>& buffs,
+                           unsigned int vlen,
+                           unsigned int iter,
+                           std::string arch)
+{
+    while (iter--)
+        func(buffs[0], buffs[1], vlen, arch.c_str());
  }
  
-inline void run_cast_test3(volk_fn_3arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
-    while(iter--) func(buffs[0], buffs[1], buffs[2], vlen, arch.c_str());
+inline void run_cast_test3(volk_fn_3arg func,
+                           std::vector<void*>& buffs,
+                           unsigned int vlen,
+                           unsigned int iter,
+                           std::string arch)
+{
+    while (iter--)
+        func(buffs[0], buffs[1], buffs[2], vlen, arch.c_str());
  }
  
-inline void run_cast_test4(volk_fn_4arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
-    while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], vlen, arch.c_str());
+inline void run_cast_test4(volk_fn_4arg func,
+                           std::vector<void*>& buffs,
+                           unsigned int vlen,
+                           unsigned int iter,
+                           std::string arch)
+{
+    while (iter--)
+        func(buffs[0], buffs[1], buffs[2], buffs[3], vlen, arch.c_str());
  }
  
-inline void run_cast_test1_s32f(volk_fn_1arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
-    while(iter--) func(buffs[0], scalar, vlen, arch.c_str());
+inline void run_cast_test1_s32f(volk_fn_1arg_s32f func,
+                                std::vector<void*>& buffs,
+                                float scalar,
+                                unsigned int vlen,
+                                unsigned int iter,
+                                std::string arch)
+{
+    while (iter--)
+        func(buffs[0], scalar, vlen, arch.c_str());
  }
  
-inline void run_cast_test2_s32f(volk_fn_2arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
-    while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
+inline void run_cast_test2_s32f(volk_fn_2arg_s32f func,
+                                std::vector<void*>& buffs,
+                                float scalar,
+                                unsigned int vlen,
+                                unsigned int iter,
+                                std::string arch)
+{
+    while (iter--)
+        func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
  }
  
-inline void run_cast_test3_s32f(volk_fn_3arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
-    while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
+inline void run_cast_test3_s32f(volk_fn_3arg_s32f func,
+                                std::vector<void*>& buffs,
+                                float scalar,
+                                unsigned int vlen,
+                                unsigned int iter,
+                                std::string arch)
+{
+    while (iter--)
+        func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
  }
  
-inline void run_cast_test1_s32fc(volk_fn_1arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
-    while(iter--) func(buffs[0], scalar, vlen, arch.c_str());
+inline void run_cast_test1_s32fc(volk_fn_1arg_s32fc func,
+                                 std::vector<void*>& buffs,
+                                 lv_32fc_t scalar,
+                                 unsigned int vlen,
+                                 unsigned int iter,
+                                 std::string arch)
+{
+    while (iter--)
+        func(buffs[0], scalar, vlen, arch.c_str());
  }
  
-inline void run_cast_test2_s32fc(volk_fn_2arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
-    while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
+inline void run_cast_test2_s32fc(volk_fn_2arg_s32fc func,
+                                 std::vector<void*>& buffs,
+                                 lv_32fc_t scalar,
+                                 unsigned int vlen,
+                                 unsigned int iter,
+                                 std::string arch)
+{
+    while (iter--)
+        func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
  }
  
-inline void run_cast_test3_s32fc(volk_fn_3arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
-    while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
+inline void run_cast_test3_s32fc(volk_fn_3arg_s32fc func,
+                                 std::vector<void*>& buffs,
+                                 lv_32fc_t scalar,
+                                 unsigned int vlen,
+                                 unsigned int iter,
+                                 std::string arch)
+{
+    while (iter--)
+        func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
  }
  
  template <class t>
-bool fcompare(t *in1, t *in2, unsigned int vlen, float tol, bool absolute_mode) {
+bool fcompare(t* in1, t* in2, unsigned int vlen, float tol, bool absolute_mode)
+{
      bool fail = false;
      int print_max_errs = 10;
-    for(unsigned int i=0; i<vlen; i++) {
+    for (unsigned int i = 0; i < vlen; i++) {
          if (absolute_mode) {
-            if (fabs(((t *)(in1))[i] - ((t *)(in2))[i]) > tol) {
-                fail=true;
-                if(print_max_errs-- > 0) {
-                    std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]);
+            if (fabs(((t*)(in1))[i] - ((t*)(in2))[i]) > tol) {
+                fail = true;
+                if (print_max_errs-- > 0) {
+                    std::cout << "offset " << i << " in1: " << t(((t*)(in1))[i])
+                              << " in2: " << t(((t*)(in2))[i]);
                      std::cout << " tolerance was: " << tol << std::endl;
                  }
              }
          } else {
              // for very small numbers we'll see round off errors due to limited
              // precision. So a special test case...
-            if(fabs(((t *)(in1))[i]) < 1e-30) {
-                if( fabs( ((t *)(in2))[i] ) > tol )
-                {
-                    fail=true;
-                    if(print_max_errs-- > 0) {
-                    std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]);
+            if (fabs(((t*)(in1))[i]) < 1e-30) {
+                if (fabs(((t*)(in2))[i]) > tol) {
+                    fail = true;
+                    if (print_max_errs-- > 0) {
+                        std::cout << "offset " << i << " in1: " << t(((t*)(in1))[i])
+                                  << " in2: " << t(((t*)(in2))[i]);
                          std::cout << " tolerance was: " << tol << std::endl;
                      }
                  }
              }
              // the primary test is the percent different greater than given tol
-            else if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/fabs(((t *)in1)[i]) > tol) {
-                fail=true;
-                if(print_max_errs-- > 0) {
-                    std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]);
+            else if (fabs(((t*)(in1))[i] - ((t*)(in2))[i]) / fabs(((t*)in1)[i]) > tol) {
+                fail = true;
+                if (print_max_errs-- > 0) {
+                    std::cout << "offset " << i << " in1: " << t(((t*)(in1))[i])
+                              << " in2: " << t(((t*)(in2))[i]);
                      std::cout << " tolerance was: " << tol << std::endl;
                  }
              }
@@ -294,43 +390,50 @@ bool fcompare(t *in1, t *in2, unsigned int vlen, float tol, bool absolute_mode)
  }
  
  template <class t>
-bool ccompare(t *in1, t *in2, unsigned int vlen, float tol, bool absolute_mode) {
+bool ccompare(t* in1, t* in2, unsigned int vlen, float tol, bool absolute_mode)
+{
      if (absolute_mode) {
-      std::cout << "ccompare does not support absolute mode" << std::endl;
-      return true;
+        std::cout << "ccompare does not support absolute mode" << std::endl;
+        return true;
      }
      bool fail = false;
      int print_max_errs = 10;
-    for(unsigned int i=0; i<2*vlen; i+=2) {
-        if (std::isnan(in1[i]) || std::isnan(in1[i+1]) || std::isnan(in2[i]) || std::isnan(in2[i+1])
-                || std::isinf(in1[i]) || std::isinf(in1[i+1]) || std::isinf(in2[i]) || std::isinf(in2[i+1])) {
-            fail=true;
-            if(print_max_errs-- > 0) {
-                std::cout << "offset " << i/2 << " in1: " << in1[i] << " + " << in1[i+1] << "j  in2: " << in2[i] << " + " << in2[i+1] << "j";
+    for (unsigned int i = 0; i < 2 * vlen; i += 2) {
+        if (std::isnan(in1[i]) || std::isnan(in1[i + 1]) || std::isnan(in2[i]) ||
+            std::isnan(in2[i + 1]) || std::isinf(in1[i]) || std::isinf(in1[i + 1]) ||
+            std::isinf(in2[i]) || std::isinf(in2[i + 1])) {
+            fail = true;
+            if (print_max_errs-- > 0) {
+                std::cout << "offset " << i / 2 << " in1: " << in1[i] << " + "
+                          << in1[i + 1] << "j  in2: " << in2[i] << " + " << in2[i + 1]
+                          << "j";
                  std::cout << " tolerance was: " << tol << std::endl;
              }
          }
-        t diff[2] = { in1[i] - in2[i], in1[i+1] - in2[i+1] };
-        t err  = std::sqrt(diff[0] * diff[0] + diff[1] * diff[1]);
-        t norm = std::sqrt(in1[i] * in1[i] + in1[i+1] * in1[i+1]);
+        t diff[2] = { in1[i] - in2[i], in1[i + 1] - in2[i + 1] };
+        t err = std::sqrt(diff[0] * diff[0] + diff[1] * diff[1]);
+        t norm = std::sqrt(in1[i] * in1[i] + in1[i + 1] * in1[i + 1]);
  
          // for very small numbers we'll see round off errors due to limited
          // precision. So a special test case...
          if (norm < 1e-30) {
-            if (err > tol)
-            {
-                fail=true;
-                if(print_max_errs-- > 0) {
-                    std::cout << "offset " << i/2 << " in1: " << in1[i] << " + " << in1[i+1] << "j  in2: " << in2[i] << " + " << in2[i+1] << "j";
+            if (err > tol) {
+                fail = true;
+                if (print_max_errs-- > 0) {
+                    std::cout << "offset " << i / 2 << " in1: " << in1[i] << " + "
+                              << in1[i + 1] << "j  in2: " << in2[i] << " + " << in2[i + 1]
+                              << "j";
                      std::cout << " tolerance was: " << tol << std::endl;
                  }
              }
          }
          // the primary test is the percent different greater than given tol
-        else if((err / norm) > tol) {
-            fail=true;
-            if(print_max_errs-- > 0) {
-                std::cout << "offset " << i/2 << " in1: " << in1[i] << " + " << in1[i+1] << "j  in2: " << in2[i] << " + " << in2[i+1] << "j";
+        else if ((err / norm) > tol) {
+            fail = true;
+            if (print_max_errs-- > 0) {
+                std::cout << "offset " << i / 2 << " in1: " << in1[i] << " + "
+                          << in1[i + 1] << "j  in2: " << in2[i] << " + " << in2[i + 1]
+                          << "j";
                  std::cout << " tolerance was: " << tol << std::endl;
              }
          }
@@ -340,18 +443,21 @@ bool ccompare(t *in1, t *in2, unsigned int vlen, float tol, bool absolute_mode)
  }
  
  template <class t>
-bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol, bool absolute_mode) {
+bool icompare(t* in1, t* in2, unsigned int vlen, unsigned int tol, bool absolute_mode)
+{
      if (absolute_mode) {
-      std::cout << "icompare does not support absolute mode" << std::endl;
-      return true;
+        std::cout << "icompare does not support absolute mode" << std::endl;
+        return true;
      }
      bool fail = false;
      int print_max_errs = 10;
-    for(unsigned int i=0; i<vlen; i++) {
-      if(((unsigned int)abs(int(((t *)(in1))[i]) - int(((t *)(in2))[i]))) > tol) {
-            fail=true;
-            if(print_max_errs-- > 0) {
-                std::cout << "offset " << i << " in1: " << static_cast<int>(t(((t *)(in1))[i])) << " in2: " << static_cast<int>(t(((t *)(in2))[i]));
+    for (unsigned int i = 0; i < vlen; i++) {
+        if (((unsigned int)abs(int(((t*)(in1))[i]) - int(((t*)(in2))[i]))) > tol) {
+            fail = true;
+            if (print_max_errs-- > 0) {
+                std::cout << "offset " << i
+                          << " in1: " << static_cast<int>(t(((t*)(in1))[i]))
+                          << " in2: " << static_cast<int>(t(((t*)(in2))[i]));
                  std::cout << " tolerance was: " << tol << std::endl;
              }
          }
@@ -360,34 +466,46 @@ bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol, bool absolute
      return fail;
  }
  
-class volk_qa_aligned_mem_pool{
+class volk_qa_aligned_mem_pool
+{
  public:
-    void *get_new(size_t size){
+    void* get_new(size_t size)
+    {
          size_t alignment = volk_get_alignment();
          void* ptr = volk_malloc(size, alignment);
          memset(ptr, 0x00, size);
          _mems.push_back(ptr);
          return ptr;
      }
-    ~volk_qa_aligned_mem_pool() {
-        for(unsigned int ii = 0; ii < _mems.size(); ++ii) {
+    ~volk_qa_aligned_mem_pool()
+    {
+        for (unsigned int ii = 0; ii < _mems.size(); ++ii) {
              volk_free(_mems[ii]);
          }
      }
-private: std::vector<void * > _mems;
+
+private:
+    std::vector<void*> _mems;
  };
  
  bool run_volk_tests(volk_func_desc_t desc,
                      void (*manual_func)(),
                      std::string name,
                      volk_test_params_t test_params,
-                    std::vector<volk_test_results_t> *results,
-                    std::string puppet_master_name
-)
+                    std::vector<volk_test_results_t>* results,
+                    std::string puppet_master_name)
  {
-    return run_volk_tests(desc, manual_func, name, test_params.tol(), test_params.scalar(),
-        test_params.vlen(), test_params.iter(), results, puppet_master_name,
-        test_params.absolute_mode(), test_params.benchmark_mode());
+    return run_volk_tests(desc,
+                          manual_func,
+                          name,
+                          test_params.tol(),
+                          test_params.scalar(),
+                          test_params.vlen(),
+                          test_params.iter(),
+                          results,
+                          puppet_master_name,
+                          test_params.absolute_mode(),
+                          test_params.benchmark_mode());
  }
  
  bool run_volk_tests(volk_func_desc_t desc,
@@ -397,17 +515,18 @@ bool run_volk_tests(volk_func_desc_t desc,
                      lv_32fc_t scalar,
                      unsigned int vlen,
                      unsigned int iter,
-                    std::vector<volk_test_results_t> *results,
+                    std::vector<volk_test_results_t>* results,
                      std::string puppet_master_name,
                      bool absolute_mode,
-                    bool benchmark_mode
-) {
+                    bool benchmark_mode)
+{
      // Initialize this entry in results vector
      results->push_back(volk_test_results_t());
      results->back().name = name;
      results->back().vlen = vlen;
      results->back().iter = iter;
-    std::cout << "RUN_VOLK_TESTS: " << name << "(" << vlen << "," << iter << ")" << std::endl;
+    std::cout << "RUN_VOLK_TESTS: " << name << "(" << vlen << "," << iter << ")"
+              << std::endl;
  
      // vlen_twiddle will increase vlen for malloc and data generation
      // but kernels will still be called with the user provided vlen.
@@ -418,57 +537,64 @@ bool run_volk_tests(volk_func_desc_t desc,
      const float tol_f = tol;
      const unsigned int tol_i = static_cast<const unsigned int>(tol);
  
-    //first let's get a list of available architectures for the test
+    // first let's get a list of available architectures for the test
      std::vector<std::string> arch_list = get_arch_list(desc);
  
-    if((!benchmark_mode) && (arch_list.size() < 2)) {
+    if ((!benchmark_mode) && (arch_list.size() < 2)) {
          std::cout << "no architectures to test" << std::endl;
          return false;
      }
  
-    //something that can hang onto memory and cleanup when this function exits
+    // something that can hang onto memory and cleanup when this function exits
      volk_qa_aligned_mem_pool mem_pool;
  
-    //now we have to get a function signature by parsing the name
+    // now we have to get a function signature by parsing the name
      std::vector<volk_type_t> inputsig, outputsig;
      try {
          get_signatures_from_name(inputsig, outputsig, name);
-    }
-    catch (std::exception &error) {
-        std::cerr << "Error: unable to get function signature from kernel name" << std::endl;
+    } catch (std::exception& error) {
+        std::cerr << "Error: unable to get function signature from kernel name"
+                  << std::endl;
          std::cerr << "  - " << name << std::endl;
          return false;
      }
  
-    //pull the input scalars into their own vector
+    // pull the input scalars into their own vector
      std::vector<volk_type_t> inputsc;
-    for(size_t i=0; i<inputsig.size(); i++) {
-        if(inputsig[i].is_scalar) {
+    for (size_t i = 0; i < inputsig.size(); i++) {
+        if (inputsig[i].is_scalar) {
              inputsc.push_back(inputsig[i]);
              inputsig.erase(inputsig.begin() + i);
              i -= 1;
          }
      }
-    std::vector<void *> inbuffs;
-    for (unsigned int inputsig_index = 0; inputsig_index < inputsig.size(); ++ inputsig_index) {
+    std::vector<void*> inbuffs;
+    for (unsigned int inputsig_index = 0; inputsig_index < inputsig.size();
+         ++inputsig_index) {
          volk_type_t sig = inputsig[inputsig_index];
-        if(!sig.is_scalar) //we don't make buffers for scalars
-          inbuffs.push_back(mem_pool.get_new(vlen*sig.size*(sig.is_complex ? 2 : 1)));
+        if (!sig.is_scalar) // we don't make buffers for scalars
+            inbuffs.push_back(
+                mem_pool.get_new(vlen * sig.size * (sig.is_complex ? 2 : 1)));
      }
-    for(size_t i=0; i<inbuffs.size(); i++) {
+    for (size_t i = 0; i < inbuffs.size(); i++) {
          load_random_data(inbuffs[i], inputsig[i], vlen);
      }
  
-    //ok let's make a vector of vector of void buffers, which holds the input/output vectors for each arch
-    std::vector<std::vector<void *> > test_data;
-    for(size_t i=0; i<arch_list.size(); i++) {
-        std::vector<void *> arch_buffs;
-        for(size_t j=0; j<outputsig.size(); j++) {
-            arch_buffs.push_back(mem_pool.get_new(vlen*outputsig[j].size*(outputsig[j].is_complex ? 2 : 1)));
+    // ok let's make a vector of vector of void buffers, which holds the input/output
+    // vectors for each arch
+    std::vector<std::vector<void*>> test_data;
+    for (size_t i = 0; i < arch_list.size(); i++) {
+        std::vector<void*> arch_buffs;
+        for (size_t j = 0; j < outputsig.size(); j++) {
+            arch_buffs.push_back(mem_pool.get_new(vlen * outputsig[j].size *
+                                                  (outputsig[j].is_complex ? 2 : 1)));
          }
-        for(size_t j=0; j<inputsig.size(); j++) {
-            void *arch_inbuff = mem_pool.get_new(vlen*inputsig[j].size*(inputsig[j].is_complex ? 2 : 1));
-            memcpy(arch_inbuff, inbuffs[j], vlen * inputsig[j].size * (inputsig[j].is_complex ? 2 : 1));
+        for (size_t j = 0; j < inputsig.size(); j++) {
+            void* arch_inbuff = mem_pool.get_new(vlen * inputsig[j].size *
+                                                 (inputsig[j].is_complex ? 2 : 1));
+            memcpy(arch_inbuff,
+                   inbuffs[j],
+                   vlen * inputsig[j].size * (inputsig[j].is_complex ? 2 : 1));
              arch_buffs.push_back(arch_inbuff);
          }
          test_data.push_back(arch_buffs);
@@ -478,53 +604,90 @@ bool run_volk_tests(volk_func_desc_t desc,
      both_sigs.insert(both_sigs.end(), outputsig.begin(), outputsig.end());
      both_sigs.insert(both_sigs.end(), inputsig.begin(), inputsig.end());
  
-    //now run the test
+    // now run the test
      vlen = vlen - vlen_twiddle;
      std::chrono::time_point<std::chrono::system_clock> start, end;
      std::vector<double> profile_times;
-    for(size_t i = 0; i < arch_list.size(); i++) {
+    for (size_t i = 0; i < arch_list.size(); i++) {
          start = std::chrono::system_clock::now();
  
-        switch(both_sigs.size()) {
-            case 1:
-                if(inputsc.size() == 0) {
-                    run_cast_test1((volk_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
-                } else if(inputsc.size() == 1 && inputsc[0].is_float) {
-                    if(inputsc[0].is_complex) {
-                        run_cast_test1_s32fc((volk_fn_1arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
-                    } else {
-                        run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
-                    }
-                } else throw "unsupported 1 arg function >1 scalars";
-                break;
-            case 2:
-                if(inputsc.size() == 0) {
-                    run_cast_test2((volk_fn_2arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
-                } else if(inputsc.size() == 1 && inputsc[0].is_float) {
-                    if(inputsc[0].is_complex) {
-                        run_cast_test2_s32fc((volk_fn_2arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
-                    } else {
-                        run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
-                    }
-                } else throw "unsupported 2 arg function >1 scalars";
-                break;
-            case 3:
-                if(inputsc.size() == 0) {
-                    run_cast_test3((volk_fn_3arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
-                } else if(inputsc.size() == 1 && inputsc[0].is_float) {
-                    if(inputsc[0].is_complex) {
-                        run_cast_test3_s32fc((volk_fn_3arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
-                    } else {
-                        run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
-                    }
-                } else throw "unsupported 3 arg function >1 scalars";
-                break;
-            case 4:
-                run_cast_test4((volk_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
-                break;
-            default:
-                throw "no function handler for this signature";
-                break;
+        switch (both_sigs.size()) {
+        case 1:
+            if (inputsc.size() == 0) {
+                run_cast_test1(
+                    (volk_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+            } else if (inputsc.size() == 1 && inputsc[0].is_float) {
+                if (inputsc[0].is_complex) {
+                    run_cast_test1_s32fc((volk_fn_1arg_s32fc)(manual_func),
+                                         test_data[i],
+                                         scalar,
+                                         vlen,
+                                         iter,
+                                         arch_list[i]);
+                } else {
+                    run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func),
+                                        test_data[i],
+                                        scalar.real(),
+                                        vlen,
+                                        iter,
+                                        arch_list[i]);
+                }
+            } else
+                throw "unsupported 1 arg function >1 scalars";
+            break;
+        case 2:
+            if (inputsc.size() == 0) {
+                run_cast_test2(
+                    (volk_fn_2arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+            } else if (inputsc.size() == 1 && inputsc[0].is_float) {
+                if (inputsc[0].is_complex) {
+                    run_cast_test2_s32fc((volk_fn_2arg_s32fc)(manual_func),
+                                         test_data[i],
+                                         scalar,
+                                         vlen,
+                                         iter,
+                                         arch_list[i]);
+                } else {
+                    run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func),
+                                        test_data[i],
+                                        scalar.real(),
+                                        vlen,
+                                        iter,
+                                        arch_list[i]);
+                }
+            } else
+                throw "unsupported 2 arg function >1 scalars";
+            break;
+        case 3:
+            if (inputsc.size() == 0) {
+                run_cast_test3(
+                    (volk_fn_3arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+            } else if (inputsc.size() == 1 && inputsc[0].is_float) {
+                if (inputsc[0].is_complex) {
+                    run_cast_test3_s32fc((volk_fn_3arg_s32fc)(manual_func),
+                                         test_data[i],
+                                         scalar,
+                                         vlen,
+                                         iter,
+                                         arch_list[i]);
+                } else {
+                    run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func),
+                                        test_data[i],
+                                        scalar.real(),
+                                        vlen,
+                                        iter,
+                                        arch_list[i]);
+                }
+            } else
+                throw "unsupported 3 arg function >1 scalars";
+            break;
+        case 4:
+            run_cast_test4(
+                (volk_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+            break;
+        default:
+            throw "no function handler for this signature";
+            break;
          }
  
          end = std::chrono::system_clock::now();
@@ -541,10 +704,10 @@ bool run_volk_tests(volk_func_desc_t desc,
          profile_times.push_back(arch_time);
      }
  
-    //and now compare each output to the generic output
-    //first we have to know which output is the generic one, they aren't in order...
-    size_t generic_offset=0;
-    for(size_t i=0; i<arch_list.size(); i++) {
+    // and now compare each output to the generic output
+    // first we have to know which output is the generic one, they aren't in order...
+    size_t generic_offset = 0;
+    for (size_t i = 0; i < arch_list.size(); i++) {
          if (arch_list[i] == "generic") {
              generic_offset = i;
          }
@@ -555,72 +718,126 @@ bool run_volk_tests(volk_func_desc_t desc,
      bool fail;
      bool fail_global = false;
      std::vector<bool> arch_results;
-    for(size_t i=0; i<arch_list.size(); i++) {
+    for (size_t i = 0; i < arch_list.size(); i++) {
          fail = false;
-        if(i != generic_offset) {
-            for(size_t j=0; j<both_sigs.size(); j++) {
-                if(both_sigs[j].is_float) {
-                    if(both_sigs[j].size == 8) {
+        if (i != generic_offset) {
+            for (size_t j = 0; j < both_sigs.size(); j++) {
+                if (both_sigs[j].is_float) {
+                    if (both_sigs[j].size == 8) {
                          if (both_sigs[j].is_complex) {
-                            fail = ccompare((double *) test_data[generic_offset][j], (double *) test_data[i][j], vlen, tol_f, absolute_mode);
+                            fail = ccompare((double*)test_data[generic_offset][j],
+                                            (double*)test_data[i][j],
+                                            vlen,
+                                            tol_f,
+                                            absolute_mode);
                          } else {
-                            fail = fcompare((double *) test_data[generic_offset][j], (double *) test_data[i][j], vlen, tol_f, absolute_mode);
+                            fail = fcompare((double*)test_data[generic_offset][j],
+                                            (double*)test_data[i][j],
+                                            vlen,
+                                            tol_f,
+                                            absolute_mode);
                          }
                      } else {
                          if (both_sigs[j].is_complex) {
-                            fail = ccompare((float *) test_data[generic_offset][j], (float *) test_data[i][j], vlen, tol_f, absolute_mode);
+                            fail = ccompare((float*)test_data[generic_offset][j],
+                                            (float*)test_data[i][j],
+                                            vlen,
+                                            tol_f,
+                                            absolute_mode);
                          } else {
-                            fail = fcompare((float *) test_data[generic_offset][j], (float *) test_data[i][j], vlen, tol_f, absolute_mode);
+                            fail = fcompare((float*)test_data[generic_offset][j],
+                                            (float*)test_data[i][j],
+                                            vlen,
+                                            tol_f,
+                                            absolute_mode);
                          }
                      }
                  } else {
-                    //i could replace this whole switch statement with a memcmp if i wasn't interested in printing the outputs where they differ
-                    switch(both_sigs[j].size) {
+                    // i could replace this whole switch statement with a memcmp if i
+                    // wasn't interested in printing the outputs where they differ
+                    switch (both_sigs[j].size) {
                      case 8:
-                        if(both_sigs[j].is_signed) {
-                            fail = icompare((int64_t *) test_data[generic_offset][j], (int64_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i, absolute_mode);
+                        if (both_sigs[j].is_signed) {
+                            fail = icompare((int64_t*)test_data[generic_offset][j],
+                                            (int64_t*)test_data[i][j],
+                                            vlen * (both_sigs[j].is_complex ? 2 : 1),
+                                            tol_i,
+                                            absolute_mode);
                          } else {
-                            fail = icompare((uint64_t *) test_data[generic_offset][j], (uint64_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i, absolute_mode);
+                            fail = icompare((uint64_t*)test_data[generic_offset][j],
+                                            (uint64_t*)test_data[i][j],
+                                            vlen * (both_sigs[j].is_complex ? 2 : 1),
+                                            tol_i,
+                                            absolute_mode);
                          }
                          break;
                      case 4:
-                        if(both_sigs[j].is_complex) {
-                            if(both_sigs[j].is_signed) {
-                                fail = icompare((int16_t *) test_data[generic_offset][j], (int16_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i, absolute_mode);
+                        if (both_sigs[j].is_complex) {
+                            if (both_sigs[j].is_signed) {
+                                fail = icompare((int16_t*)test_data[generic_offset][j],
+                                                (int16_t*)test_data[i][j],
+                                                vlen * (both_sigs[j].is_complex ? 2 : 1),
+                                                tol_i,
+                                                absolute_mode);
                              } else {
-                                fail = icompare((uint16_t *) test_data[generic_offset][j], (uint16_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i, absolute_mode);
+                                fail = icompare((uint16_t*)test_data[generic_offset][j],
+                                                (uint16_t*)test_data[i][j],
+                                                vlen * (both_sigs[j].is_complex ? 2 : 1),
+                                                tol_i,
+                                                absolute_mode);
                              }
-                        }
-                        else {
+                        } else {
                              if (both_sigs[j].is_signed) {
-                                fail = icompare((int32_t *) test_data[generic_offset][j], (int32_t *) test_data[i][j],
-                                                vlen * (both_sigs[j].is_complex ? 2 : 1), tol_i, absolute_mode);
+                                fail = icompare((int32_t*)test_data[generic_offset][j],
+                                                (int32_t*)test_data[i][j],
+                                                vlen * (both_sigs[j].is_complex ? 2 : 1),
+                                                tol_i,
+                                                absolute_mode);
                              } else {
-                                fail = icompare((uint32_t *) test_data[generic_offset][j], (uint32_t *) test_data[i][j],
-                                                vlen * (both_sigs[j].is_complex ? 2 : 1), tol_i, absolute_mode);
+                                fail = icompare((uint32_t*)test_data[generic_offset][j],
+                                                (uint32_t*)test_data[i][j],
+                                                vlen * (both_sigs[j].is_complex ? 2 : 1),
+                                                tol_i,
+                                                absolute_mode);
                              }
                          }
                          break;
                      case 2:
-                        if(both_sigs[j].is_signed) {
-                            fail = icompare((int16_t *) test_data[generic_offset][j], (int16_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i, absolute_mode);
+                        if (both_sigs[j].is_signed) {
+                            fail = icompare((int16_t*)test_data[generic_offset][j],
+                                            (int16_t*)test_data[i][j],
+                                            vlen * (both_sigs[j].is_complex ? 2 : 1),
+                                            tol_i,
+                                            absolute_mode);
                          } else {
-                            fail = icompare((uint16_t *) test_data[generic_offset][j], (uint16_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i, absolute_mode);
+                            fail = icompare((uint16_t*)test_data[generic_offset][j],
+                                            (uint16_t*)test_data[i][j],
+                                            vlen * (both_sigs[j].is_complex ? 2 : 1),
+                                            tol_i,
+                                            absolute_mode);
                          }
                          break;
                      case 1:
-                        if(both_sigs[j].is_signed) {
-                            fail = icompare((int8_t *) test_data[generic_offset][j], (int8_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i, absolute_mode);
+                        if (both_sigs[j].is_signed) {
+                            fail = icompare((int8_t*)test_data[generic_offset][j],
+                                            (int8_t*)test_data[i][j],
+                                            vlen * (both_sigs[j].is_complex ? 2 : 1),
+                                            tol_i,
+                                            absolute_mode);
                          } else {
-                            fail = icompare((uint8_t *) test_data[generic_offset][j], (uint8_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i, absolute_mode);
+                            fail = icompare((uint8_t*)test_data[generic_offset][j],
+                                            (uint8_t*)test_data[i][j],
+                                            vlen * (both_sigs[j].is_complex ? 2 : 1),
+                                            tol_i,
+                                            absolute_mode);
                          }
                          break;
                      default:
-                        fail=1;
+                        fail = 1;
                      }
                  }
-                if(fail) {
-                    volk_test_time_t *result = &results->back().results[arch_list[i]];
+                if (fail) {
+                    volk_test_time_t* result = &results->back().results[arch_list[i]];
                      result->pass = false;
                      fail_global = true;
                      std::cout << name << ": fail on arch " << arch_list[i] << std::endl;
@@ -634,15 +851,13 @@ bool run_volk_tests(volk_func_desc_t desc,
      double best_time_u = std::numeric_limits<double>::max();
      std::string best_arch_a = "generic";
      std::string best_arch_u = "generic";
-    for(size_t i=0; i < arch_list.size(); i++)
-    {
-        if((profile_times[i] < best_time_u) && arch_results[i] && desc.impl_alignment[i] == 0)
-        {
+    for (size_t i = 0; i < arch_list.size(); i++) {
+        if ((profile_times[i] < best_time_u) && arch_results[i] &&
+            desc.impl_alignment[i] == 0) {
              best_time_u = profile_times[i];
              best_arch_u = arch_list[i];
          }
-        if((profile_times[i] < best_time_a) && arch_results[i])
-        {
+        if ((profile_times[i] < best_time_a) && arch_results[i]) {
              best_time_a = profile_times[i];
              best_arch_a = arch_list[i];
          }
@@ -651,7 +866,7 @@ bool run_volk_tests(volk_func_desc_t desc,
      std::cout << "Best aligned arch: " << best_arch_a << std::endl;
      std::cout << "Best unaligned arch: " << best_arch_u << std::endl;
  
-    if(puppet_master_name == "NULL") {
+    if (puppet_master_name == "NULL") {
          results->back().config_name = name;
      } else {
          results->back().config_name = puppet_master_name;
diff --git a/lib/qa_utils.h b/lib/qa_utils.h

index 2d8458b915e046e3eb2d42003c734a6d07859640..74c3db4b04aebf4a96b94b855f4775509d3a1ab3 100644 (file)
--- a/lib/qa_utils.h
+++ b/lib/qa_utils.h
@@ -1,14 +1,14 @@
  #ifndef VOLK_QA_UTILS_H
  #define VOLK_QA_UTILS_H
  
-#include <stdbool.h>            // for bool, false
-#include <volk/volk.h>          // for volk_func_desc_t
-#include <cstdlib>              // for NULL
-#include <map>                  // for map
-#include <string>               // for string, basic_string
-#include <vector>               // for vector
+#include <stdbool.h>   // for bool, false
+#include <volk/volk.h> // for volk_func_desc_t
+#include <cstdlib>     // for NULL
+#include <map>         // for map
+#include <string>      // for string, basic_string
+#include <vector>      // for vector
  
-#include "volk/volk_complex.h"  // for lv_32fc_t
+#include "volk/volk_complex.h" // for lv_32fc_t
  
  /************************************************
   * VOLK QA type definitions                     *
@@ -22,93 +22,119 @@ struct volk_type_t {
      std::string str;
  };
  
-class volk_test_time_t {
-    public:
-        std::string name;
-        double time;
-        std::string units;
-        bool pass;
+class volk_test_time_t
+{
+public:
+    std::string name;
+    double time;
+    std::string units;
+    bool pass;
  };
  
-class volk_test_results_t {
-    public:
-        std::string name;
-        std::string config_name;
-        unsigned int vlen;
-        unsigned int iter;
-        std::map<std::string, volk_test_time_t> results;
-        std::string best_arch_a;
-        std::string best_arch_u;
+class volk_test_results_t
+{
+public:
+    std::string name;
+    std::string config_name;
+    unsigned int vlen;
+    unsigned int iter;
+    std::map<std::string, volk_test_time_t> results;
+    std::string best_arch_a;
+    std::string best_arch_u;
  };
  
-class volk_test_params_t {
-    private:
-        float _tol;
-        lv_32fc_t _scalar;
-        unsigned int _vlen;
-        unsigned int _iter;
-        bool _benchmark_mode;
-        bool _absolute_mode;
-        std::string _kernel_regex;
-    public:
-        // ctor
-        volk_test_params_t(float tol, lv_32fc_t scalar, unsigned int vlen, unsigned int iter,
-                           bool benchmark_mode, std::string kernel_regex) :
-            _tol(tol), _scalar(scalar), _vlen(vlen), _iter(iter),
-            _benchmark_mode(benchmark_mode), _absolute_mode(false), _kernel_regex(kernel_regex) {};
-        // setters
-        void set_tol(float tol) {_tol=tol;};
-        void set_scalar(lv_32fc_t scalar) {_scalar=scalar;};
-        void set_vlen(unsigned int vlen) {_vlen=vlen;};
-        void set_iter(unsigned int iter) {_iter=iter;};
-        void set_benchmark(bool benchmark) {_benchmark_mode=benchmark;};
-        void set_regex(std::string regex) {_kernel_regex=regex;};
-        // getters
-        float tol() {return _tol;};
-        lv_32fc_t scalar() {return _scalar;};
-        unsigned int vlen() {return _vlen;};
-        unsigned int iter() {return _iter;};
-        bool benchmark_mode() {return _benchmark_mode;};
-        bool absolute_mode() {return _absolute_mode;};
-        std::string kernel_regex() {return _kernel_regex;};
-        volk_test_params_t make_absolute(float tol) {
-          volk_test_params_t t(*this);
-          t._tol = tol;
-          t._absolute_mode = true;
-          return t;
-        }
-        volk_test_params_t make_tol(float tol) {
-          volk_test_params_t t(*this);
-          t._tol = tol;
-          return t;
-        }
+class volk_test_params_t
+{
+private:
+    float _tol;
+    lv_32fc_t _scalar;
+    unsigned int _vlen;
+    unsigned int _iter;
+    bool _benchmark_mode;
+    bool _absolute_mode;
+    std::string _kernel_regex;
+
+public:
+    // ctor
+    volk_test_params_t(float tol,
+                       lv_32fc_t scalar,
+                       unsigned int vlen,
+                       unsigned int iter,
+                       bool benchmark_mode,
+                       std::string kernel_regex)
+        : _tol(tol),
+          _scalar(scalar),
+          _vlen(vlen),
+          _iter(iter),
+          _benchmark_mode(benchmark_mode),
+          _absolute_mode(false),
+          _kernel_regex(kernel_regex){};
+    // setters
+    void set_tol(float tol) { _tol = tol; };
+    void set_scalar(lv_32fc_t scalar) { _scalar = scalar; };
+    void set_vlen(unsigned int vlen) { _vlen = vlen; };
+    void set_iter(unsigned int iter) { _iter = iter; };
+    void set_benchmark(bool benchmark) { _benchmark_mode = benchmark; };
+    void set_regex(std::string regex) { _kernel_regex = regex; };
+    // getters
+    float tol() { return _tol; };
+    lv_32fc_t scalar() { return _scalar; };
+    unsigned int vlen() { return _vlen; };
+    unsigned int iter() { return _iter; };
+    bool benchmark_mode() { return _benchmark_mode; };
+    bool absolute_mode() { return _absolute_mode; };
+    std::string kernel_regex() { return _kernel_regex; };
+    volk_test_params_t make_absolute(float tol)
+    {
+        volk_test_params_t t(*this);
+        t._tol = tol;
+        t._absolute_mode = true;
+        return t;
+    }
+    volk_test_params_t make_tol(float tol)
+    {
+        volk_test_params_t t(*this);
+        t._tol = tol;
+        return t;
+    }
  };
  
-class volk_test_case_t {
-    private:
-        volk_func_desc_t _desc;
-        void(*_kernel_ptr)();
-        std::string _name;
-        volk_test_params_t _test_parameters;
-        std::string _puppet_master_name;
-    public:
-        volk_func_desc_t desc() {return _desc;};
-        void (*kernel_ptr()) () {return _kernel_ptr;};
-        std::string name() {return _name;};
-        std::string puppet_master_name() {return _puppet_master_name;};
-        volk_test_params_t test_parameters() {return _test_parameters;};
-        // normal ctor
-        volk_test_case_t(volk_func_desc_t desc, void(*kernel_ptr)(), std::string name,
-            volk_test_params_t test_parameters) :
-            _desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters),
-            _puppet_master_name("NULL")
-            {};
-        // ctor for puppets
-        volk_test_case_t(volk_func_desc_t desc, void(*kernel_ptr)(), std::string name,
-            std::string puppet_master_name, volk_test_params_t test_parameters) :
-            _desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters),
-            _puppet_master_name(puppet_master_name)
-            {};
+class volk_test_case_t
+{
+private:
+    volk_func_desc_t _desc;
+    void (*_kernel_ptr)();
+    std::string _name;
+    volk_test_params_t _test_parameters;
+    std::string _puppet_master_name;
+
+public:
+    volk_func_desc_t desc() { return _desc; };
+    void (*kernel_ptr())() { return _kernel_ptr; };
+    std::string name() { return _name; };
+    std::string puppet_master_name() { return _puppet_master_name; };
+    volk_test_params_t test_parameters() { return _test_parameters; };
+    // normal ctor
+    volk_test_case_t(volk_func_desc_t desc,
+                     void (*kernel_ptr)(),
+                     std::string name,
+                     volk_test_params_t test_parameters)
+        : _desc(desc),
+          _kernel_ptr(kernel_ptr),
+          _name(name),
+          _test_parameters(test_parameters),
+          _puppet_master_name("NULL"){};
+    // ctor for puppets
+    volk_test_case_t(volk_func_desc_t desc,
+                     void (*kernel_ptr)(),
+                     std::string name,
+                     std::string puppet_master_name,
+                     volk_test_params_t test_parameters)
+        : _desc(desc),
+          _kernel_ptr(kernel_ptr),
+          _name(name),
+          _test_parameters(test_parameters),
+          _puppet_master_name(puppet_master_name){};
  };
  
  /************************************************
@@ -117,42 +143,58 @@ class volk_test_case_t {
  volk_type_t volk_type_from_string(std::string);
  
  float uniform(void);
-void random_floats(float *buf, unsigned n);
+void random_floats(float* buf, unsigned n);
  
-bool run_volk_tests(
-    volk_func_desc_t,
-    void(*)(),
-    std::string,
-    volk_test_params_t,
-    std::vector<volk_test_results_t> *results = NULL,
-    std::string puppet_master_name = "NULL"
-    );
+bool run_volk_tests(volk_func_desc_t,
+                    void (*)(),
+                    std::string,
+                    volk_test_params_t,
+                    std::vector<volk_test_results_t>* results = NULL,
+                    std::string puppet_master_name = "NULL");
  
-bool run_volk_tests(
-        volk_func_desc_t,
-        void(*)(),
-        std::string,
-        float,
-        lv_32fc_t,
-        unsigned int,
-        unsigned int,
-        std::vector<volk_test_results_t> *results = NULL,
-        std::string puppet_master_name = "NULL",
-        bool absolute_mode = false,
-        bool benchmark_mode = false
-);
+bool run_volk_tests(volk_func_desc_t,
+                    void (*)(),
+                    std::string,
+                    float,
+                    lv_32fc_t,
+                    unsigned int,
+                    unsigned int,
+                    std::vector<volk_test_results_t>* results = NULL,
+                    std::string puppet_master_name = "NULL",
+                    bool absolute_mode = false,
+                    bool benchmark_mode = false);
  
-#define VOLK_PROFILE(func, test_params, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), test_params, results, "NULL")
-#define VOLK_PUPPET_PROFILE(func, puppet_master_func, test_params, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), test_params, results, std::string(#puppet_master_func))
-typedef void (*volk_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place
-typedef void (*volk_fn_2arg)(void *, void *, unsigned int, const char*);
-typedef void (*volk_fn_3arg)(void *, void *, void *, unsigned int, const char*);
-typedef void (*volk_fn_4arg)(void *, void *, void *, void *, unsigned int, const char*);
-typedef void (*volk_fn_1arg_s32f)(void *, float, unsigned int, const char*); //one input vector, one scalar float input
-typedef void (*volk_fn_2arg_s32f)(void *, void *, float, unsigned int, const char*);
-typedef void (*volk_fn_3arg_s32f)(void *, void *, void *, float, unsigned int, const char*);
-typedef void (*volk_fn_1arg_s32fc)(void *, lv_32fc_t, unsigned int, const char*); //one input vector, one scalar float input
-typedef void (*volk_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char*);
-typedef void (*volk_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char*);
+#define VOLK_PROFILE(func, test_params, results) \
+    run_volk_tests(func##_get_func_desc(),       \
+                   (void (*)())func##_manual,    \
+                   std::string(#func),           \
+                   test_params,                  \
+                   results,                      \
+                   "NULL")
+#define VOLK_PUPPET_PROFILE(func, puppet_master_func, test_params, results) \
+    run_volk_tests(func##_get_func_desc(),                                  \
+                   (void (*)())func##_manual,                               \
+                   std::string(#func),                                      \
+                   test_params,                                             \
+                   results,                                                 \
+                   std::string(#puppet_master_func))
+typedef void (*volk_fn_1arg)(void*,
+                             unsigned int,
+                             const char*); // one input, operate in place
+typedef void (*volk_fn_2arg)(void*, void*, unsigned int, const char*);
+typedef void (*volk_fn_3arg)(void*, void*, void*, unsigned int, const char*);
+typedef void (*volk_fn_4arg)(void*, void*, void*, void*, unsigned int, const char*);
+typedef void (*volk_fn_1arg_s32f)(
+    void*, float, unsigned int, const char*); // one input vector, one scalar float input
+typedef void (*volk_fn_2arg_s32f)(void*, void*, float, unsigned int, const char*);
+typedef void (*volk_fn_3arg_s32f)(void*, void*, void*, float, unsigned int, const char*);
+typedef void (*volk_fn_1arg_s32fc)(
+    void*,
+    lv_32fc_t,
+    unsigned int,
+    const char*); // one input vector, one scalar float input
+typedef void (*volk_fn_2arg_s32fc)(void*, void*, lv_32fc_t, unsigned int, const char*);
+typedef void (*volk_fn_3arg_s32fc)(
+    void*, void*, void*, lv_32fc_t, unsigned int, const char*);
  
-#endif //VOLK_QA_UTILS_H
+#endif // VOLK_QA_UTILS_H
diff --git a/lib/testqa.cc b/lib/testqa.cc

index 8b0f4d69778d4eca94ada583217d5908ee7636fe..c8853835f0884b688cd1dea65e07b160d0ba39fd 100644 (file)
--- a/lib/testqa.cc
+++ b/lib/testqa.cc
@@ -20,18 +20,18 @@
   * Boston, MA 02110-1301, USA.
   */
  
-#include <stdbool.h>            // for bool, false, true
-#include <iostream>             // for operator<<, basic_ostream, endl, char...
-#include <fstream>             // IWYU pragma: keep
-#include <map>                  // for map, map<>::iterator, _Rb_tree_iterator
-#include <string>               // for string, operator<<
-#include <utility>              // for pair
-#include <vector>               // for vector
-
+#include <stdbool.h> // for bool, false, true
+#include <fstream>   // IWYU pragma: keep
+#include <iostream>  // for operator<<, basic_ostream, endl, char...
+#include <map>       // for map, map<>::iterator, _Rb_tree_iterator
+#include <string>    // for string, operator<<
+#include <utility>   // for pair
+#include <vector>    // for vector
+
+#include "kernel_tests.h"      // for init_test_list
+#include "qa_utils.h"          // for volk_test_case_t, volk_test_results_t
+#include "volk/volk_complex.h" // for lv_32fc_t
  #include <volk/volk.h>
-#include "kernel_tests.h"       // for init_test_list
-#include "qa_utils.h"           // for volk_test_case_t, volk_test_results_t
-#include "volk/volk_complex.h"  // for lv_32fc_t
  
  void print_qa_xml(std::vector<volk_test_results_t> results, unsigned int nfails);
  
@@ -46,45 +46,52 @@ int main(int argc, char* argv[])
      bool def_benchmark_mode = true;
      std::string def_kernel_regex = "";
  
-    volk_test_params_t test_params(def_tol, def_scalar, def_vlen, def_iter,
-        def_benchmark_mode, def_kernel_regex);
+    volk_test_params_t test_params(
+        def_tol, def_scalar, def_vlen, def_iter, def_benchmark_mode, def_kernel_regex);
      std::vector<volk_test_case_t> test_cases = init_test_list(test_params);
      std::vector<volk_test_results_t> results;
  
-    if (argc > 1){
-        for(unsigned int ii = 0; ii < test_cases.size(); ++ii){
-            if (std::string(argv[1]) == test_cases[ii].name()){
+    if (argc > 1) {
+        for (unsigned int ii = 0; ii < test_cases.size(); ++ii) {
+            if (std::string(argv[1]) == test_cases[ii].name()) {
                  volk_test_case_t test_case = test_cases[ii];
-                if (run_volk_tests(test_case.desc(), test_case.kernel_ptr(),
+                if (run_volk_tests(test_case.desc(),
+                                   test_case.kernel_ptr(),
                                     test_case.name(),
-                                   test_case.test_parameters(), &results,
+                                   test_case.test_parameters(),
+                                   &results,
                                     test_case.puppet_master_name())) {
-                  return 1;
+                    return 1;
                  } else {
-                  return 0;
+                    return 0;
                  }
              }
          }
-        std::cerr << "Did not run a test for kernel: " << std::string(argv[1]) << " !" << std::endl;
+        std::cerr << "Did not run a test for kernel: " << std::string(argv[1]) << " !"
+                  << std::endl;
          return 0;
  
-    }else{
+    } else {
          std::vector<std::string> qa_failures;
          // Test every kernel reporting failures when they occur
-        for(unsigned int ii = 0; ii < test_cases.size(); ++ii) {
+        for (unsigned int ii = 0; ii < test_cases.size(); ++ii) {
              bool qa_result = false;
              volk_test_case_t test_case = test_cases[ii];
              try {
-                qa_result = run_volk_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(),
-                                           test_case.test_parameters(), &results, test_case.puppet_master_name());
-            }
-            catch(...) {
+                qa_result = run_volk_tests(test_case.desc(),
+                                           test_case.kernel_ptr(),
+                                           test_case.name(),
+                                           test_case.test_parameters(),
+                                           &results,
+                                           test_case.puppet_master_name());
+            } catch (...) {
                  // TODO: what exceptions might we need to catch and how do we handle them?
-                std::cerr << "Exception found on kernel: " << test_case.name() << std::endl;
+                std::cerr << "Exception found on kernel: " << test_case.name()
+                          << std::endl;
                  qa_result = false;
              }
  
-            if(qa_result) {
+            if (qa_result) {
                  std::cerr << "Failure on " << test_case.name() << std::endl;
                  qa_failures.push_back(test_case.name());
              }
@@ -96,9 +103,9 @@ int main(int argc, char* argv[])
          // Summarize QA results
          std::cerr << "Kernel QA finished: " << qa_failures.size() << " failures out of "
                    << test_cases.size() << " tests." << std::endl;
-        if(qa_failures.size() > 0) {
+        if (qa_failures.size() > 0) {
              std::cerr << "The following kernels failed QA:" << std::endl;
-            for(unsigned int ii = 0; ii < qa_failures.size(); ++ii) {
+            for (unsigned int ii = 0; ii < qa_failures.size(); ++ii) {
                  std::cerr << "    " << qa_failures[ii] << std::endl;
              }
              qa_ret_val = 1;
@@ -118,26 +125,28 @@ void print_qa_xml(std::vector<volk_test_results_t> results, unsigned int nfails)
      qa_file.open(".unittest/kernels.xml");
  
      qa_file << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" << std::endl;
-    qa_file << "<testsuites name=\"kernels\" " <<
-        "tests=\"" << results.size() << "\" " <<
-        "failures=\"" << nfails << "\" id=\"1\">" << std::endl;
+    qa_file << "<testsuites name=\"kernels\" "
+            << "tests=\"" << results.size() << "\" "
+            << "failures=\"" << nfails << "\" id=\"1\">" << std::endl;
  
      // Results are in a vector by kernel. Each element has a result
      // map containing time and arch name with test result
-    for(unsigned int ii=0; ii < results.size(); ++ii) {
+    for (unsigned int ii = 0; ii < results.size(); ++ii) {
          volk_test_results_t result = results[ii];
          qa_file << "  <testsuite name=\"" << result.name << "\">" << std::endl;
  
          std::map<std::string, volk_test_time_t>::iterator kernel_time_pair;
-        for(kernel_time_pair = result.results.begin(); kernel_time_pair != result.results.end(); ++kernel_time_pair) {
+        for (kernel_time_pair = result.results.begin();
+             kernel_time_pair != result.results.end();
+             ++kernel_time_pair) {
              volk_test_time_t test_time = kernel_time_pair->second;
-            qa_file << "    <testcase name=\"" << test_time.name << "\" " <<
-                "classname=\"" << result.name << "\" " <<
-                "time=\"" << test_time.time << "\">" << std::endl;
-            if(!test_time.pass)
-                qa_file << "      <failure " <<
-                    "message=\"fail on arch " <<  test_time.name << "\">" <<
-                    "</failure>" << std::endl;
+            qa_file << "    <testcase name=\"" << test_time.name << "\" "
+                    << "classname=\"" << result.name << "\" "
+                    << "time=\"" << test_time.time << "\">" << std::endl;
+            if (!test_time.pass)
+                qa_file << "      <failure "
+                        << "message=\"fail on arch " << test_time.name << "\">"
+                        << "</failure>" << std::endl;
              qa_file << "    </testcase>" << std::endl;
          }
          qa_file << "  </testsuite>" << std::endl;
@@ -146,5 +155,4 @@ void print_qa_xml(std::vector<volk_test_results_t> results, unsigned int nfails)
  
      qa_file << "</testsuites>" << std::endl;
      qa_file.close();
-
  }
diff --git a/lib/volk_malloc.c b/lib/volk_malloc.c

index df36240bf411f32a37d3123af15a2dff80e364a7..b3779e1b2eb007b4346d41bb7a2f112d7d1a986b 100644 (file)
--- a/lib/volk_malloc.c
+++ b/lib/volk_malloc.c
@@ -31,7 +31,8 @@
   * see: https://en.cppreference.com/w/c/memory/aligned_alloc
   *
   * MSVC is broken
- * see: https://docs.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=vs-2019
+ * see:
+ * https://docs.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=vs-2019
   * This section:
   * C11 The Universal CRT implemented the parts of the
   * C11 Standard Library that are required by C++17,
@@ -46,39 +47,43 @@
   * We must work around this problem because MSVC is non-compliant!
   */
  
-void *volk_malloc(size_t size, size_t alignment)
+
+void* volk_malloc(size_t size, size_t alignment)
  {
  #if HAVE_POSIX_MEMALIGN
-  // quoting posix_memalign() man page:
-  // "alignment must be a power of two and a multiple of sizeof(void *)"
-  // volk_get_alignment() could return 1 for some machines (e.g. generic_orc)
-  if (alignment == 1){
-    return malloc(size);
-  }
-  void *ptr;
-  int err = posix_memalign(&ptr, alignment, size);
-  if(err != 0) {
-    ptr = NULL;
-    fprintf(stderr,
-            "VOLK: Error allocating memory "
-            "(posix_memalign: error %d: %s)\n", err, strerror(err));
-  }
+    // quoting posix_memalign() man page:
+    // "alignment must be a power of two and a multiple of sizeof(void *)"
+    // volk_get_alignment() could return 1 for some machines (e.g. generic_orc)
+    if (alignment == 1) {
+        return malloc(size);
+    }
+    void* ptr;
+    int err = posix_memalign(&ptr, alignment, size);
+    if (err != 0) {
+        ptr = NULL;
+        fprintf(stderr,
+                "VOLK: Error allocating memory "
+                "(posix_memalign: error %d: %s)\n",
+                err,
+                strerror(err));
+    }
  #elif defined(_MSC_VER)
-  void *ptr = _aligned_malloc(size, alignment);
+    void* ptr = _aligned_malloc(size, alignment);
  #else
-  void *ptr = aligned_alloc(alignment, size);
+    void* ptr = aligned_alloc(alignment, size);
  #endif
-  if(ptr == NULL) {
-    fprintf(stderr, "VOLK: Error allocating memory (aligned_alloc/_aligned_malloc)\n");
-  }
-  return ptr;
+    if (ptr == NULL) {
+        fprintf(stderr,
+                "VOLK: Error allocating memory (aligned_alloc/_aligned_malloc)\n");
+    }
+    return ptr;
  }
  
-void volk_free(void *ptr)
+void volk_free(void* ptr)
  {
  #if defined(_MSC_VER)
-  _aligned_free(ptr);
+    _aligned_free(ptr);
  #else
-  free(ptr);
+    free(ptr);
  #endif
  }
diff --git a/lib/volk_prefs.c b/lib/volk_prefs.c

index 0b5fe8eef493ed922e61d125f4a130952f7a2f88..8934bf7d4fe2a9f6fbd316d3ada3ffc0000ccd26 100644 (file)
--- a/lib/volk_prefs.c
+++ b/lib/volk_prefs.c
@@ -1,6 +1,6 @@
+#include <stdbool.h>
  #include <stdio.h>
  #include <stdlib.h>
-#include <stdbool.h>
  #include <string.h>
  #if defined(_MSC_VER)
  #include <io.h>
@@ -11,82 +11,84 @@
  #endif
  #include <volk/volk_prefs.h>
  
-void volk_get_config_path(char *path, bool read)
+void volk_get_config_path(char* path, bool read)
  {
-    if (!path) return;
-    const char *suffix = "/.volk/volk_config";
-    const char *suffix2 = "/volk/volk_config"; //non-hidden
-    char *home = NULL;
+    if (!path)
+        return;
+    const char* suffix = "/.volk/volk_config";
+    const char* suffix2 = "/volk/volk_config"; // non-hidden
+    char* home = NULL;
  
-    //allows config redirection via env variable
+    // allows config redirection via env variable
      home = getenv("VOLK_CONFIGPATH");
-    if(home!=NULL){
-        strncpy(path,home,512);
-        strcat(path,suffix2);
-        if (!read || access(path, F_OK) != -1){
+    if (home != NULL) {
+        strncpy(path, home, 512);
+        strcat(path, suffix2);
+        if (!read || access(path, F_OK) != -1) {
              return;
          }
      }
  
-    //check for user-local config file
+    // check for user-local config file
      home = getenv("HOME");
-    if (home != NULL){
+    if (home != NULL) {
          strncpy(path, home, 512);
          strcat(path, suffix);
-        if (!read || (access(path, F_OK) != -1)){
+        if (!read || (access(path, F_OK) != -1)) {
              return;
          }
      }
  
-    //check for config file in APPDATA (Windows)
+    // check for config file in APPDATA (Windows)
      home = getenv("APPDATA");
-    if (home != NULL){
+    if (home != NULL) {
          strncpy(path, home, 512);
          strcat(path, suffix);
-        if (!read || (access(path, F_OK) != -1)){
+        if (!read || (access(path, F_OK) != -1)) {
              return;
          }
      }
  
-    //check for system-wide config file
-    if (access("/etc/volk/volk_config", F_OK) != -1){
+    // check for system-wide config file
+    if (access("/etc/volk/volk_config", F_OK) != -1) {
          strncpy(path, "/etc", 512);
          strcat(path, suffix2);
-        if (!read || (access(path, F_OK) != -1)){
+        if (!read || (access(path, F_OK) != -1)) {
              return;
          }
      }
  
-    //If still no path was found set path[0] to '0' and fall through
+    // If still no path was found set path[0] to '0' and fall through
      path[0] = 0;
      return;
  }
  
-size_t volk_load_preferences(volk_arch_pref_t **prefs_res)
+size_t volk_load_preferences(volk_arch_pref_t** prefs_res)
  {
-    FILE *config_file;
+    FILE* config_file;
      char path[512], line[512];
      size_t n_arch_prefs = 0;
-    volk_arch_pref_t *prefs = NULL;
+    volk_arch_pref_t* prefs = NULL;
  
-    //get the config path
+    // get the config path
      volk_get_config_path(path, true);
-    if (!path[0]) return n_arch_prefs; //no prefs found
+    if (!path[0])
+        return n_arch_prefs; // no prefs found
      config_file = fopen(path, "r");
-    if(!config_file) return n_arch_prefs; //no prefs found
+    if (!config_file)
+        return n_arch_prefs; // no prefs found
  
-    //reset the file pointer and write the prefs into volk_arch_prefs
-    while(fgets(line, sizeof(line), config_file) != NULL)
-    {
-        void *new_prefs = realloc(prefs, (n_arch_prefs + 1) * sizeof(*prefs));
+    // reset the file pointer and write the prefs into volk_arch_prefs
+    while (fgets(line, sizeof(line), config_file) != NULL) {
+        void* new_prefs = realloc(prefs, (n_arch_prefs + 1) * sizeof(*prefs));
          if (!new_prefs) {
-            printf ("volk_load_preferences: bad malloc\n");
+            printf("volk_load_preferences: bad malloc\n");
              break;
          }
-        prefs = (volk_arch_pref_t *) new_prefs;
-        volk_arch_pref_t *p = prefs + n_arch_prefs;
-        if(sscanf(line, "%s %s %s", p->name, p->impl_a, p->impl_u) == 3 && !strncmp(p->name, "volk_", 5))
-        {
+        prefs = (volk_arch_pref_t*)new_prefs;
+        volk_arch_pref_t* p = prefs + n_arch_prefs;
+        if (sscanf(line, "%s %s %s", p->name, p->impl_a, p->impl_u) == 3 &&
+            !strncmp(p->name, "volk_", 5)) {
              n_arch_prefs++;
          }
      }
diff --git a/lib/volk_rank_archs.c b/lib/volk_rank_archs.c

index 346619e254fe75e6b81ea37fb6369870be64c7ec..7cf3fd71328c0ad1161e39a34bb3ec5134f6f1aa 100644 (file)
--- a/lib/volk_rank_archs.c
+++ b/lib/volk_rank_archs.c
@@ -24,84 +24,83 @@
  #include <stdlib.h>
  #include <string.h>
  
-#include <volk_rank_archs.h>
  #include <volk/volk_prefs.h>
+#include <volk_rank_archs.h>
  
-int volk_get_index(
-    const char *impl_names[], //list of implementations by name
-    const size_t n_impls,     //number of implementations available
-    const char *impl_name     //the implementation name to find
-){
+int volk_get_index(const char* impl_names[], // list of implementations by name
+                   const size_t n_impls,     // number of implementations available
+                   const char* impl_name     // the implementation name to find
+)
+{
      unsigned int i;
      for (i = 0; i < n_impls; i++) {
-        if(!strncmp(impl_names[i], impl_name, 20)) {
+        if (!strncmp(impl_names[i], impl_name, 20)) {
              return i;
          }
      }
-    //TODO return -1;
-    //something terrible should happen here
+    // TODO return -1;
+    // something terrible should happen here
      fprintf(stderr, "Volk warning: no arch found, returning generic impl\n");
-    return volk_get_index(impl_names, n_impls, "generic"); //but we'll fake it for now
+    return volk_get_index(impl_names, n_impls, "generic"); // but we'll fake it for now
  }
  
-int volk_rank_archs(
-    const char *kern_name,    //name of the kernel to rank
-    const char *impl_names[], //list of implementations by name
-    const int* impl_deps,     //requirement mask per implementation
-    const bool* alignment,    //alignment status of each implementation
-    size_t n_impls,            //number of implementations available
-    const bool align          //if false, filter aligned implementations
+int volk_rank_archs(const char* kern_name,    // name of the kernel to rank
+                    const char* impl_names[], // list of implementations by name
+                    const int* impl_deps,     // requirement mask per implementation
+                    const bool* alignment,    // alignment status of each implementation
+                    size_t n_impls,           // number of implementations available
+                    const bool align          // if false, filter aligned implementations
  )
  {
      size_t i;
-    static volk_arch_pref_t *volk_arch_prefs;
+    static volk_arch_pref_t* volk_arch_prefs;
      static size_t n_arch_prefs = 0;
      static int prefs_loaded = 0;
-    if(!prefs_loaded) {
+    if (!prefs_loaded) {
          n_arch_prefs = volk_load_preferences(&volk_arch_prefs);
          prefs_loaded = 1;
      }
  
      // If we've defined VOLK_GENERIC to be anything, always return the
      // 'generic' kernel. Used in GR's QA code.
-    char *gen_env = getenv("VOLK_GENERIC");
-    if(gen_env) {
-      return volk_get_index(impl_names, n_impls, "generic");
+    char* gen_env = getenv("VOLK_GENERIC");
+    if (gen_env) {
+        return volk_get_index(impl_names, n_impls, "generic");
      }
  
-    //now look for the function name in the prefs list
-    for(i = 0; i < n_arch_prefs; i++)
-    {
-        if(!strncmp(kern_name, volk_arch_prefs[i].name, sizeof(volk_arch_prefs[i].name))) //found it
+    // now look for the function name in the prefs list
+    for (i = 0; i < n_arch_prefs; i++) {
+        if (!strncmp(kern_name,
+                     volk_arch_prefs[i].name,
+                     sizeof(volk_arch_prefs[i].name))) // found it
          {
-            const char *impl_name = align? volk_arch_prefs[i].impl_a : volk_arch_prefs[i].impl_u;
+            const char* impl_name =
+                align ? volk_arch_prefs[i].impl_a : volk_arch_prefs[i].impl_u;
              return volk_get_index(impl_names, n_impls, impl_name);
          }
      }
  
-    //return the best index with the largest deps
+    // return the best index with the largest deps
      size_t best_index_a = 0;
      size_t best_index_u = 0;
      int best_value_a = -1;
      int best_value_u = -1;
-    for(i = 0; i < n_impls; i++)
-    {
+    for (i = 0; i < n_impls; i++) {
          const signed val = impl_deps[i];
-        if (alignment[i] && val > best_value_a)
-        {
+        if (alignment[i] && val > best_value_a) {
              best_index_a = i;
              best_value_a = val;
          }
-        if (!alignment[i] && val > best_value_u)
-        {
+        if (!alignment[i] && val > best_value_u) {
              best_index_u = i;
              best_value_u = val;
          }
      }
  
-    //when align and we found a best aligned, use it
-    if (align && best_value_a != -1) return best_index_a;
+    // when align and we found a best aligned, use it
+    if (align && best_value_a != -1)
+        return best_index_a;
  
-    //otherwise return the best unaligned
+    // otherwise return the best unaligned
      return best_index_u;
  }
diff --git a/lib/volk_rank_archs.h b/lib/volk_rank_archs.h

index b3bf8ff17cf570b692304c9207383cfc6045cd4e..9434778f3d2efe04b3ebde32a929b6698901c4b5 100644 (file)
--- a/lib/volk_rank_archs.h
+++ b/lib/volk_rank_archs.h
@@ -22,26 +22,24 @@
  #ifndef INCLUDED_VOLK_RANK_ARCHS_H
  #define INCLUDED_VOLK_RANK_ARCHS_H
  
-#include <stdlib.h>
  #include <stdbool.h>
+#include <stdlib.h>
  
  #ifdef __cplusplus
  extern "C" {
  #endif
  
-int volk_get_index(
-    const char *impl_names[], //list of implementations by name
-    const size_t n_impls,     //number of implementations available
-    const char *impl_name     //the implementation name to find
+int volk_get_index(const char* impl_names[], // list of implementations by name
+                   const size_t n_impls,     // number of implementations available
+                   const char* impl_name     // the implementation name to find
  );
  
-int volk_rank_archs(
-    const char *kern_name,    //name of the kernel to rank
-    const char *impl_names[], //list of implementations by name
-    const int* impl_deps,     //requirement mask per implementation
-    const bool* alignment,    //alignment status of each implementation
-    size_t n_impls,            //number of implementations available
-    const bool align          //if false, filter aligned implementations
+int volk_rank_archs(const char* kern_name,    // name of the kernel to rank
+                    const char* impl_names[], // list of implementations by name
+                    const int* impl_deps,     // requirement mask per implementation
+                    const bool* alignment,    // alignment status of each implementation
+                    size_t n_impls,           // number of implementations available
+                    const bool align          // if false, filter aligned implementations
  );
  
  #ifdef __cplusplus
author	Johannes Demel <demel@uni-bremen.de>
	Sun, 23 Feb 2020 14:03:47 +0000 (15:03 +0100)
committer	A. Maitland Bottoms <bottoms@debian.org>
	Sat, 28 Mar 2020 01:48:10 +0000 (01:48 +0000)
.clang-format	[new file with mode: 0644]	patch \| blob
apps/volk-config-info.cc		patch \| blob \| history
apps/volk_option_helpers.cc		patch \| blob \| history
apps/volk_option_helpers.h		patch \| blob \| history
apps/volk_profile.cc		patch \| blob \| history
apps/volk_profile.h		patch \| blob \| history
cmake/msvc/config.h		patch \| blob \| history
cmake/msvc/sys/time.h		patch \| blob \| history
include/volk/saturation_arithmetic.h		patch \| blob \| history
include/volk/volk_alloc.hh		patch \| blob \| history
include/volk/volk_avx2_intrinsics.h		patch \| blob \| history
include/volk/volk_avx_intrinsics.h		patch \| blob \| history
include/volk/volk_common.h		patch \| blob \| history
include/volk/volk_complex.h		patch \| blob \| history
include/volk/volk_malloc.h		patch \| blob \| history
include/volk/volk_neon_intrinsics.h		patch \| blob \| history
include/volk/volk_prefs.h		patch \| blob \| history
include/volk/volk_sse3_intrinsics.h		patch \| blob \| history
include/volk/volk_sse_intrinsics.h		patch \| blob \| history
kernels/volk/volk_16i_32fc_dot_prod_32fc.h		patch \| blob \| history
kernels/volk/volk_16i_branch_4_state_8.h		patch \| blob \| history
kernels/volk/volk_16i_convert_8i.h		patch \| blob \| history
kernels/volk/volk_16i_max_star_16i.h		patch \| blob \| history
kernels/volk/volk_16i_max_star_horizontal_16i.h		patch \| blob \| history
kernels/volk/volk_16i_permute_and_scalar_add.h		patch \| blob \| history
kernels/volk/volk_16i_s32f_convert_32f.h		patch \| blob \| history
kernels/volk/volk_16i_x4_quad_max_star_16i.h		patch \| blob \| history
kernels/volk/volk_16i_x5_add_quad_16i_x4.h		patch \| blob \| history
kernels/volk/volk_16ic_convert_32fc.h		patch \| blob \| history
kernels/volk/volk_16ic_deinterleave_16i_x2.h		patch \| blob \| history
kernels/volk/volk_16ic_deinterleave_real_16i.h		patch \| blob \| history
kernels/volk/volk_16ic_deinterleave_real_8i.h		patch \| blob \| history
kernels/volk/volk_16ic_magnitude_16i.h		patch \| blob \| history
kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h		patch \| blob \| history
kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h		patch \| blob \| history
kernels/volk/volk_16ic_s32f_magnitude_32f.h		patch \| blob \| history
kernels/volk/volk_16ic_x2_dot_prod_16ic.h		patch \| blob \| history
kernels/volk/volk_16ic_x2_multiply_16ic.h		patch \| blob \| history
kernels/volk/volk_16u_byteswap.h		patch \| blob \| history
kernels/volk/volk_16u_byteswappuppet_16u.h		patch \| blob \| history
kernels/volk/volk_32f_64f_add_64f.h		patch \| blob \| history
kernels/volk/volk_32f_64f_multiply_64f.h		patch \| blob \| history
kernels/volk/volk_32f_8u_polarbutterfly_32f.h		patch \| blob \| history
kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h		patch \| blob \| history
kernels/volk/volk_32f_accumulator_s32f.h		patch \| blob \| history
kernels/volk/volk_32f_acos_32f.h		patch \| blob \| history
kernels/volk/volk_32f_asin_32f.h		patch \| blob \| history
kernels/volk/volk_32f_atan_32f.h		patch \| blob \| history
kernels/volk/volk_32f_binary_slicer_32i.h		patch \| blob \| history
kernels/volk/volk_32f_binary_slicer_8i.h		patch \| blob \| history
kernels/volk/volk_32f_convert_64f.h		patch \| blob \| history
kernels/volk/volk_32f_cos_32f.h		patch \| blob \| history
kernels/volk/volk_32f_expfast_32f.h		patch \| blob \| history
kernels/volk/volk_32f_index_max_16u.h		patch \| blob \| history
kernels/volk/volk_32f_index_max_32u.h		patch \| blob \| history
kernels/volk/volk_32f_invsqrt_32f.h		patch \| blob \| history
kernels/volk/volk_32f_log2_32f.h		patch \| blob \| history
kernels/volk/volk_32f_null_32f.h		patch \| blob \| history
kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h		patch \| blob \| history
kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h		patch \| blob \| history
kernels/volk/volk_32f_s32f_convert_16i.h		patch \| blob \| history
kernels/volk/volk_32f_s32f_convert_32i.h		patch \| blob \| history
kernels/volk/volk_32f_s32f_convert_8i.h		patch \| blob \| history
kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h		patch \| blob \| history
kernels/volk/volk_32f_s32f_multiply_32f.h		patch \| blob \| history
kernels/volk/volk_32f_s32f_normalize.h		patch \| blob \| history
kernels/volk/volk_32f_s32f_power_32f.h		patch \| blob \| history
kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h		patch \| blob \| history
kernels/volk/volk_32f_s32f_stddev_32f.h		patch \| blob \| history
kernels/volk/volk_32f_sin_32f.h		patch \| blob \| history
kernels/volk/volk_32f_sqrt_32f.h		patch \| blob \| history
kernels/volk/volk_32f_stddev_and_mean_32f_x2.h		patch \| blob \| history
kernels/volk/volk_32f_tan_32f.h		patch \| blob \| history
kernels/volk/volk_32f_tanh_32f.h		patch \| blob \| history
kernels/volk/volk_32f_x2_add_32f.h		patch \| blob \| history
kernels/volk/volk_32f_x2_divide_32f.h		patch \| blob \| history
kernels/volk/volk_32f_x2_dot_prod_16i.h		patch \| blob \| history
kernels/volk/volk_32f_x2_dot_prod_32f.h		patch \| blob \| history
kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h		patch \| blob \| history
kernels/volk/volk_32f_x2_interleave_32fc.h		patch \| blob \| history
kernels/volk/volk_32f_x2_max_32f.h		patch \| blob \| history
kernels/volk/volk_32f_x2_min_32f.h		patch \| blob \| history
kernels/volk/volk_32f_x2_multiply_32f.h		patch \| blob \| history
kernels/volk/volk_32f_x2_pow_32f.h		patch \| blob \| history
kernels/volk/volk_32f_x2_s32f_interleave_16ic.h		patch \| blob \| history
kernels/volk/volk_32f_x2_subtract_32f.h		patch \| blob \| history
kernels/volk/volk_32f_x3_sum_of_poly_32f.h		patch \| blob \| history
kernels/volk/volk_32fc_32f_add_32fc.h		patch \| blob \| history
kernels/volk/volk_32fc_32f_dot_prod_32fc.h		patch \| blob \| history
kernels/volk/volk_32fc_32f_multiply_32fc.h		patch \| blob \| history
kernels/volk/volk_32fc_conjugate_32fc.h		patch \| blob \| history
kernels/volk/volk_32fc_convert_16ic.h		patch \| blob \| history
kernels/volk/volk_32fc_deinterleave_32f_x2.h		patch \| blob \| history
kernels/volk/volk_32fc_deinterleave_64f_x2.h		patch \| blob \| history
kernels/volk/volk_32fc_deinterleave_imag_32f.h		patch \| blob \| history
kernels/volk/volk_32fc_deinterleave_real_32f.h		patch \| blob \| history
kernels/volk/volk_32fc_deinterleave_real_64f.h		patch \| blob \| history
kernels/volk/volk_32fc_index_max_16u.h		patch \| blob \| history
kernels/volk/volk_32fc_index_max_32u.h		patch \| blob \| history
kernels/volk/volk_32fc_magnitude_32f.h		patch \| blob \| history
kernels/volk/volk_32fc_magnitude_squared_32f.h		patch \| blob \| history
kernels/volk/volk_32fc_s32f_atan2_32f.h		patch \| blob \| history
kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h		patch \| blob \| history
kernels/volk/volk_32fc_s32f_magnitude_16i.h		patch \| blob \| history
kernels/volk/volk_32fc_s32f_power_32fc.h		patch \| blob \| history
kernels/volk/volk_32fc_s32f_power_spectrum_32f.h		patch \| blob \| history
kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h		patch \| blob \| history
kernels/volk/volk_32fc_s32fc_multiply_32fc.h		patch \| blob \| history
kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h		patch \| blob \| history
kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h		patch \| blob \| history
kernels/volk/volk_32fc_x2_add_32fc.h		patch \| blob \| history
kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h		patch \| blob \| history
kernels/volk/volk_32fc_x2_divide_32fc.h		patch \| blob \| history
kernels/volk/volk_32fc_x2_dot_prod_32fc.h		patch \| blob \| history
kernels/volk/volk_32fc_x2_multiply_32fc.h		patch \| blob \| history
kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h		patch \| blob \| history
kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h		patch \| blob \| history
kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h		patch \| blob \| history
kernels/volk/volk_32fc_x2_square_dist_32f.h		patch \| blob \| history
kernels/volk/volk_32i_s32f_convert_32f.h		patch \| blob \| history
kernels/volk/volk_32i_x2_and_32i.h		patch \| blob \| history
kernels/volk/volk_32i_x2_or_32i.h		patch \| blob \| history
kernels/volk/volk_32u_byteswap.h		patch \| blob \| history
kernels/volk/volk_32u_byteswappuppet_32u.h		patch \| blob \| history
kernels/volk/volk_32u_popcnt.h		patch \| blob \| history
kernels/volk/volk_32u_popcntpuppet_32u.h		patch \| blob \| history
kernels/volk/volk_32u_reverse_32u.h		patch \| blob \| history
kernels/volk/volk_64f_convert_32f.h		patch \| blob \| history
kernels/volk/volk_64f_x2_add_64f.h		patch \| blob \| history
kernels/volk/volk_64f_x2_max_64f.h		patch \| blob \| history
kernels/volk/volk_64f_x2_min_64f.h		patch \| blob \| history
kernels/volk/volk_64f_x2_multiply_64f.h		patch \| blob \| history
kernels/volk/volk_64u_byteswap.h		patch \| blob \| history
kernels/volk/volk_64u_byteswappuppet_64u.h		patch \| blob \| history
kernels/volk/volk_64u_popcnt.h		patch \| blob \| history
kernels/volk/volk_64u_popcntpuppet_64u.h		patch \| blob \| history
kernels/volk/volk_8i_convert_16i.h		patch \| blob \| history
kernels/volk/volk_8i_s32f_convert_32f.h		patch \| blob \| history
kernels/volk/volk_8ic_deinterleave_16i_x2.h		patch \| blob \| history
kernels/volk/volk_8ic_deinterleave_real_16i.h		patch \| blob \| history
kernels/volk/volk_8ic_deinterleave_real_8i.h		patch \| blob \| history
kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h		patch \| blob \| history
kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h		patch \| blob \| history
kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h		patch \| blob \| history
kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h		patch \| blob \| history
kernels/volk/volk_8u_conv_k7_r2puppet_8u.h		patch \| blob \| history
kernels/volk/volk_8u_x2_encodeframepolar_8u.h		patch \| blob \| history
kernels/volk/volk_8u_x3_encodepolar_8u_x2.h		patch \| blob \| history
kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h		patch \| blob \| history
kernels/volk/volk_8u_x4_conv_k7_r2_8u.h		patch \| blob \| history
lib/kernel_tests.h		patch \| blob \| history
lib/qa_utils.cc		patch \| blob \| history
lib/qa_utils.h		patch \| blob \| history
lib/testqa.cc		patch \| blob \| history
lib/volk_malloc.c		patch \| blob \| history
lib/volk_prefs.c		patch \| blob \| history
lib/volk_rank_archs.c		patch \| blob \| history
lib/volk_rank_archs.h		patch \| blob \| history