This commit adds the `.clang-format` file from GNU Radio and applies clang-format.
Run:
`find . -regex '.*\.\(c\|cc\|cpp\|cxx\|h\|hh\)' -exec clang-format \
-style=file -i {} \;`
in `.`.
Gbp-Pq: Name 0003-clang-format-Apply-clang-format.patch
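For illustration only (not part of the patch): under the configuration below, clang-format
produces code in roughly the following shape, with 4-space indents, a 90-column limit,
one parameter per line once a declaration no longer fits (BinPackParameters: false),
left-aligned pointers, short functions allowed on a single line, and opening braces on
their own line for functions and classes. The snippet is a hypothetical sketch, not code
from this repository:

    // hypothetical example of the resulting style
    class widget
    {
    public:
        widget(std::string name,
               std::string description,
               void (*callback)(int));

        int count() const { return count_; } // short functions may stay on one line

    private:
        std::string name_;
        int count_ = 0;
    };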
--- /dev/null
+---
+Language: Cpp
+# BasedOnStyle: LLVM
+AccessModifierOffset: -4
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlinesLeft: true
+AlignOperands: true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: true
+BinPackArguments: false
+BinPackParameters: false
+BreakBeforeBraces: Custom
+BraceWrapping:
+ AfterClass: true
+ AfterControlStatement: false
+ AfterEnum: false
+ AfterFunction: true
+ AfterNamespace: false
+ AfterObjCDeclaration: false
+ AfterStruct: false
+ AfterUnion: false
+ BeforeCatch: false
+ BeforeElse: false
+ IndentBraces: false
+BreakBeforeBinaryOperators: None
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: true
+ColumnLimit: 90
+CommentPragmas: '^ IWYU pragma:'
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: false
+DerivePointerAlignment: false
+DisableFormat: false
+ExperimentalAutoDetectBinPacking: false
+ForEachMacros:
+ - foreach
+ - Q_FOREACH
+ - BOOST_FOREACH
+IncludeCategories:
+ - Regex: '^"(gnuradio)/'
+ Priority: 1
+ - Regex: '^<(gnuradio)/'
+ Priority: 2
+ - Regex: '^<(boost)/'
+ Priority: 98
+ - Regex: '^<[a-z]*>$'
+ Priority: 99
+ - Regex: '^".*"$'
+ Priority: 0
+ - Regex: '.*'
+ Priority: 10
+
+IncludeIsMainRegex: '(Test)?$'
+IndentCaseLabels: false
+IndentWidth: 4
+IndentWrappedFunctionNames: false
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: true
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+MaxEmptyLinesToKeep: 2
+NamespaceIndentation: None
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PenaltyBreakBeforeFirstCallParameter: 19
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 60
+PointerAlignment: Left
+ReflowComments: true
+SortIncludes: true
+SpaceAfterCStyleCast: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeParens: ControlStatements
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles: false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: Cpp11
+TabWidth: 8
+UseTab: Never
+
+
#include <config.h>
#endif
-#include <volk/constants.h> // for volk_available_machines, volk_c_com...
-#include <iostream> // for operator<<, endl, cout, ostream
-#include <string> // for string
+#include <volk/constants.h> // for volk_available_machines, volk_c_com...
+#include <iostream> // for operator<<, endl, cout, ostream
+#include <string> // for string
-#include "volk/volk.h" // for volk_get_alignment, volk_get_machine
-#include "volk_option_helpers.h" // for option_list, option_t
+#include "volk/volk.h" // for volk_get_alignment, volk_get_machine
+#include "volk_option_helpers.h" // for option_list, option_t
void print_alignment()
{
- std::cout << "Alignment in bytes: " << volk_get_alignment() << std::endl;
+ std::cout << "Alignment in bytes: " << volk_get_alignment() << std::endl;
}
void print_malloc()
{
- // You don't want to change the volk_malloc code, so just copy the if/else
- // structure from there and give an explanation for the implementations
- std::cout << "Used malloc implementation: ";
- #if HAVE_POSIX_MEMALIGN
- std::cout << "posix_memalign" << std::endl;
- #elif defined(_MSC_VER)
- std::cout << "_aligned_malloc" << std::endl;
- #else
- std::cout << "C11 aligned_alloc" << std::endl;
- #endif
+ // You don't want to change the volk_malloc code, so just copy the if/else
+ // structure from there and give an explanation for the implementations
+ std::cout << "Used malloc implementation: ";
+#if HAVE_POSIX_MEMALIGN
+ std::cout << "posix_memalign" << std::endl;
+#elif defined(_MSC_VER)
+ std::cout << "_aligned_malloc" << std::endl;
+#else
+ std::cout << "C11 aligned_alloc" << std::endl;
+#endif
}
-int
-main(int argc, char **argv)
+int main(int argc, char** argv)
{
- option_list our_options("volk-config-info");
- our_options.add(option_t("prefix", "", "print the VOLK installation prefix", volk_prefix()));
- our_options.add(option_t("cc", "", "print the VOLK C compiler version", volk_c_compiler()));
- our_options.add(option_t("cflags", "", "print the VOLK CFLAGS", volk_compiler_flags()));
- our_options.add(option_t("all-machines", "", "print VOLK machines built", volk_available_machines()));
- our_options.add(option_t("avail-machines", "", "print VOLK machines on the current "
- "platform", volk_list_machines));
- our_options.add(option_t("machine", "", "print the current VOLK machine that will be used",
- volk_get_machine()));
- our_options.add(option_t("alignment", "", "print the memory alignment", print_alignment));
- our_options.add(option_t("malloc", "", "print the malloc implementation used in volk_malloc",
- print_malloc));
- our_options.add(option_t("version", "v", "print the VOLK version", volk_version()));
+ option_list our_options("volk-config-info");
+ our_options.add(
+ option_t("prefix", "", "print the VOLK installation prefix", volk_prefix()));
+ our_options.add(
+ option_t("cc", "", "print the VOLK C compiler version", volk_c_compiler()));
+ our_options.add(
+ option_t("cflags", "", "print the VOLK CFLAGS", volk_compiler_flags()));
+ our_options.add(option_t(
+ "all-machines", "", "print VOLK machines built", volk_available_machines()));
+ our_options.add(option_t("avail-machines",
+ "",
+ "print VOLK machines on the current "
+ "platform",
+ volk_list_machines));
+ our_options.add(option_t("machine",
+ "",
+ "print the current VOLK machine that will be used",
+ volk_get_machine()));
+ our_options.add(
+ option_t("alignment", "", "print the memory alignment", print_alignment));
+ our_options.add(option_t("malloc",
+ "",
+ "print the malloc implementation used in volk_malloc",
+ print_malloc));
+ our_options.add(option_t("version", "v", "print the VOLK version", volk_version()));
- our_options.parse(argc, argv);
+ our_options.parse(argc, argv);
- return 0;
+ return 0;
}
#include "volk_option_helpers.h"
-#include <exception> // for exception
-#include <iostream> // for operator<<, endl, basic_ostream, cout, ostream
-#include <utility> // for pair
-#include <limits.h> // IWYU pragma: keep
-#include <cstring> // IWYU pragma: keep
-#include <cstdlib> // IWYU pragma: keep
+#include <limits.h> // IWYU pragma: keep
+#include <cstdlib> // IWYU pragma: keep
+#include <cstring> // IWYU pragma: keep
+#include <exception> // for exception
+#include <iostream> // for operator<<, endl, basic_ostream, cout, ostream
+#include <utility> // for pair
/*
* Option type
*/
-option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)())
- : longform("--" + longform),
- shortform("-" + shortform),
- msg(msg),
- callback(callback) { option_type = VOID_CALLBACK; }
-
-option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(int))
- : longform("--" + longform),
- shortform("-" + shortform),
- msg(msg),
- callback((void (*)()) callback) { option_type = INT_CALLBACK; }
-
-option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(float))
- : longform("--" + longform),
- shortform("-" + shortform),
- msg(msg),
- callback((void (*)()) callback) { option_type = FLOAT_CALLBACK; }
-
-option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(bool))
- : longform("--" + longform),
- shortform("-" + shortform),
- msg(msg),
- callback((void (*)()) callback) { option_type = BOOL_CALLBACK; }
-
-option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(std::string))
- : longform("--" + longform),
- shortform("-" + shortform),
- msg(msg),
- callback((void (*)()) callback) { option_type = STRING_CALLBACK; }
-
-option_t::option_t(std::string longform, std::string shortform, std::string msg, std::string printval)
- : longform("--" + longform),
- shortform("-" + shortform),
- msg(msg),
- printval(printval) { option_type = STRING; }
+option_t::option_t(std::string longform,
+ std::string shortform,
+ std::string msg,
+ void (*callback)())
+ : longform("--" + longform), shortform("-" + shortform), msg(msg), callback(callback)
+{
+ option_type = VOID_CALLBACK;
+}
+
+option_t::option_t(std::string longform,
+ std::string shortform,
+ std::string msg,
+ void (*callback)(int))
+ : longform("--" + longform),
+ shortform("-" + shortform),
+ msg(msg),
+ callback((void (*)())callback)
+{
+ option_type = INT_CALLBACK;
+}
+
+option_t::option_t(std::string longform,
+ std::string shortform,
+ std::string msg,
+ void (*callback)(float))
+ : longform("--" + longform),
+ shortform("-" + shortform),
+ msg(msg),
+ callback((void (*)())callback)
+{
+ option_type = FLOAT_CALLBACK;
+}
+
+option_t::option_t(std::string longform,
+ std::string shortform,
+ std::string msg,
+ void (*callback)(bool))
+ : longform("--" + longform),
+ shortform("-" + shortform),
+ msg(msg),
+ callback((void (*)())callback)
+{
+ option_type = BOOL_CALLBACK;
+}
+
+option_t::option_t(std::string longform,
+ std::string shortform,
+ std::string msg,
+ void (*callback)(std::string))
+ : longform("--" + longform),
+ shortform("-" + shortform),
+ msg(msg),
+ callback((void (*)())callback)
+{
+ option_type = STRING_CALLBACK;
+}
+
+option_t::option_t(std::string longform,
+ std::string shortform,
+ std::string msg,
+ std::string printval)
+ : longform("--" + longform), shortform("-" + shortform), msg(msg), printval(printval)
+{
+ option_type = STRING;
+}
/*
* Option List
*/
-option_list::option_list(std::string program_name) :
- program_name(program_name) {
+option_list::option_list(std::string program_name) : program_name(program_name)
+{
internal_list = std::vector<option_t>();
}
void option_list::add(option_t opt) { internal_list.push_back(opt); }
-void option_list::parse(int argc, char **argv) {
+void option_list::parse(int argc, char** argv)
+{
for (int arg_number = 0; arg_number < argc; ++arg_number) {
for (std::vector<option_t>::iterator this_option = internal_list.begin();
this_option != internal_list.end();
this_option->shortform == std::string(argv[arg_number])) {
if (present_options.count(this_option->longform) == 0) {
- present_options.insert(std::pair<std::string, int>(this_option->longform, 1));
+ present_options.insert(
+ std::pair<std::string, int>(this_option->longform, 1));
} else {
present_options[this_option->longform] += 1;
}
switch (this_option->option_type) {
- case VOID_CALLBACK:
- this_option->callback();
- break;
- case INT_CALLBACK:
- try {
- int_val = atoi(argv[++arg_number]);
- ((void (*)(int)) this_option->callback)(int_val);
- } catch (std::exception &exc) {
- std::cout << "An int option can only receive a number" << std::endl;
- throw std::exception();
- };
- break;
- case FLOAT_CALLBACK:
- try {
- double double_val = atof(argv[++arg_number]);
- ((void (*)(float)) this_option->callback)(double_val);
- } catch (std::exception &exc) {
- std::cout << "A float option can only receive a number" << std::endl;
- throw std::exception();
- };
- break;
- case BOOL_CALLBACK:
- try {
- if (arg_number == (argc - 1)) { // this is the last arg
+ case VOID_CALLBACK:
+ this_option->callback();
+ break;
+ case INT_CALLBACK:
+ try {
+ int_val = atoi(argv[++arg_number]);
+ ((void (*)(int))this_option->callback)(int_val);
+ } catch (std::exception& exc) {
+ std::cout << "An int option can only receive a number"
+ << std::endl;
+ throw std::exception();
+ };
+ break;
+ case FLOAT_CALLBACK:
+ try {
+ double double_val = atof(argv[++arg_number]);
+ ((void (*)(float))this_option->callback)(double_val);
+ } catch (std::exception& exc) {
+ std::cout << "A float option can only receive a number"
+ << std::endl;
+ throw std::exception();
+ };
+ break;
+ case BOOL_CALLBACK:
+ try {
+ if (arg_number == (argc - 1)) { // this is the last arg
+ int_val = 1;
+ } else { // sneak a look at the next arg since it's present
+ char* next_arg = argv[arg_number + 1];
+ if ((strncmp(next_arg, "-", 1) == 0) ||
+ (strncmp(next_arg, "--", 2) == 0)) {
+ // the next arg is actually an arg, the bool is just
+ // present, set to true
+ int_val = 1;
+ } else if (strncmp(next_arg, "true", 4) == 0) {
int_val = 1;
- } else { // sneak a look at the next arg since it's present
- char *next_arg = argv[arg_number + 1];
- if ((strncmp(next_arg, "-", 1) == 0) || (strncmp(next_arg, "--", 2) == 0)) {
- // the next arg is actually an arg, the bool is just present, set to true
- int_val = 1;
- } else if (strncmp(next_arg, "true", 4) == 0) {
- int_val = 1;
- } else if (strncmp(next_arg, "false", 5) == 0) {
- int_val = 0;
- } else {
- // we got a number or a string.
- // convert it to a number and depend on the catch to report an error condition
- int_val = (bool) atoi(argv[++arg_number]);
- }
+ } else if (strncmp(next_arg, "false", 5) == 0) {
+ int_val = 0;
+ } else {
+ // we got a number or a string.
+ // convert it to a number and depend on the catch to
+ // report an error condition
+ int_val = (bool)atoi(argv[++arg_number]);
}
- } catch (std::exception &e) {
- int_val = INT_MIN;
- };
- if (int_val == INT_MIN) {
- std::cout << "option: '" << argv[arg_number - 1] << "' -> received an unknown value. Boolean "
- "options should receive one of '0', '1', 'true', 'false'." << std::endl;
- throw std::exception();
- } else if (int_val) {
- ((void (*)(bool)) this_option->callback)(int_val);
}
- break;
- case STRING_CALLBACK:
- try {
- ((void (*)(std::string)) this_option->callback)(argv[++arg_number]);
- } catch (std::exception &exc) {
- throw std::exception();
- };
- case STRING:
- std::cout << this_option->printval << std::endl;
- break;
+ } catch (std::exception& e) {
+ int_val = INT_MIN;
+ };
+ if (int_val == INT_MIN) {
+ std::cout
+ << "option: '" << argv[arg_number - 1]
+ << "' -> received an unknown value. Boolean "
+ "options should receive one of '0', '1', 'true', 'false'."
+ << std::endl;
+ throw std::exception();
+ } else if (int_val) {
+ ((void (*)(bool))this_option->callback)(int_val);
+ }
+ break;
+ case STRING_CALLBACK:
+ try {
+ ((void (*)(std::string))this_option->callback)(
+ argv[++arg_number]);
+ } catch (std::exception& exc) {
+ throw std::exception();
+ };
+ case STRING:
+ std::cout << this_option->printval << std::endl;
+ break;
}
}
-
}
if (std::string("--help") == std::string(argv[arg_number]) ||
std::string("-h") == std::string(argv[arg_number])) {
}
}
-bool option_list::present(std::string option_name) {
+bool option_list::present(std::string option_name)
+{
if (present_options.count("--" + option_name)) {
return true;
} else {
}
}
-void option_list::help() {
+void option_list::help()
+{
std::cout << program_name << std::endl;
std::cout << " -h [ --help ] \t\tdisplay this help message" << std::endl;
for (std::vector<option_t>::iterator this_option = internal_list.begin();
}
switch (help_line.size() / 8) {
- case 0:
- help_line += "\t";
- case 1:
- help_line += "\t";
- case 2:
- help_line += "\t";
- case 3:
- help_line += "\t";
+ case 0:
+ help_line += "\t";
+ case 1:
+ help_line += "\t";
+ case 2:
+ help_line += "\t";
+ case 3:
+ help_line += "\t";
}
help_line += this_option->msg;
std::cout << help_line << std::endl;
#ifndef VOLK_VOLK_OPTION_HELPERS_H
#define VOLK_VOLK_OPTION_HELPERS_H
-#include <string>
-#include <cstring>
#include <limits.h>
-#include <vector>
+#include <cstring>
#include <map>
+#include <string>
+#include <vector>
-typedef enum
-{
- VOID_CALLBACK,
+typedef enum {
+ VOID_CALLBACK,
INT_CALLBACK,
BOOL_CALLBACK,
STRING_CALLBACK,
FLOAT_CALLBACK,
- STRING,
+ STRING,
} VOLK_OPTYPE;
-class option_t {
- public:
- option_t(std::string longform, std::string shortform, std::string msg, void (*callback)());
- option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(int));
- option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(float));
- option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(bool));
- option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(std::string));
- option_t(std::string longform, std::string shortform, std::string msg, std::string printval);
-
- std::string longform;
- std::string shortform;
- std::string msg;
- VOLK_OPTYPE option_type;
- std::string printval;
- void (*callback)();
+class option_t
+{
+public:
+ option_t(std::string longform,
+ std::string shortform,
+ std::string msg,
+ void (*callback)());
+ option_t(std::string longform,
+ std::string shortform,
+ std::string msg,
+ void (*callback)(int));
+ option_t(std::string longform,
+ std::string shortform,
+ std::string msg,
+ void (*callback)(float));
+ option_t(std::string longform,
+ std::string shortform,
+ std::string msg,
+ void (*callback)(bool));
+ option_t(std::string longform,
+ std::string shortform,
+ std::string msg,
+ void (*callback)(std::string));
+ option_t(std::string longform,
+ std::string shortform,
+ std::string msg,
+ std::string printval);
+ std::string longform;
+ std::string shortform;
+ std::string msg;
+ VOLK_OPTYPE option_type;
+ std::string printval;
+ void (*callback)();
};
class option_list
{
- public:
- option_list(std::string program_name);
- bool present(std::string option_name);
+public:
+ option_list(std::string program_name);
+ bool present(std::string option_name);
+
+ void add(option_t opt);
- void add(option_t opt);
+ void parse(int argc, char** argv);
- void parse(int argc, char **argv);
+ void help();
- void help();
- private:
- std::string program_name;
- std::vector<option_t> internal_list;
- std::map<std::string, int> present_options;
+private:
+ std::string program_name;
+ std::vector<option_t> internal_list;
+ std::map<std::string, int> present_options;
};
-#endif //VOLK_VOLK_OPTION_HELPERS_H
+#endif // VOLK_VOLK_OPTION_HELPERS_H
#include <filesystem>
#endif
#else
-#include <boost/filesystem/operations.hpp> // for create_directories, exists
-#include <boost/filesystem/path.hpp> // for path, operator<<
-#include <boost/filesystem/path_traits.hpp> // for filesystem
+#include <boost/filesystem/operations.hpp> // for create_directories, exists
+#include <boost/filesystem/path.hpp> // for path, operator<<
+#include <boost/filesystem/path_traits.hpp> // for filesystem
#endif
-#include <stddef.h> // for size_t
-#include <sys/stat.h> // for stat
-#include <volk/volk_prefs.h> // for volk_get_config_path
-#include <iostream> // for operator<<, basic_ostream
-#include <fstream> // IWYU pragma: keep
-#include <map> // for map, map<>::iterator
-#include <utility> // for pair
-#include <vector> // for vector, vector<>::const_...
-
-#include "kernel_tests.h" // for init_test_list
-#include "qa_utils.h" // for volk_test_results_t, vol...
-#include "volk/volk_complex.h" // for lv_32fc_t
-#include "volk_option_helpers.h" // for option_list, option_t
+#include <stddef.h> // for size_t
+#include <sys/stat.h> // for stat
+#include <volk/volk_prefs.h> // for volk_get_config_path
+#include <fstream> // IWYU pragma: keep
+#include <iostream> // for operator<<, basic_ostream
+#include <map> // for map, map<>::iterator
+#include <utility> // for pair
+#include <vector> // for vector, vector<>::const_...
+
+#include "kernel_tests.h" // for init_test_list
+#include "qa_utils.h" // for volk_test_results_t, vol...
+#include "volk/volk_complex.h" // for lv_32fc_t
+#include "volk_option_helpers.h" // for option_list, option_t
#include "volk_profile.h"
#if HAS_STD_FILESYSTEM
std::string volk_config_path("");
void set_volk_config(std::string val) { volk_config_path = val; }
-int main(int argc, char *argv[]) {
+int main(int argc, char* argv[])
+{
option_list profile_options("volk_profile");
- profile_options.add(option_t("benchmark", "b", "Run all kernels (benchmark mode)", set_benchmark));
- profile_options.add(option_t("tol", "t", "Set the default tolerance for all tests", set_tolerance));
- profile_options.add(option_t("vlen", "v", "Set the default vector length for tests", set_vlen));
- profile_options.add((option_t("iter", "i", "Set the default number of test iterations per kernel", set_iter)));
- profile_options.add((option_t("tests-substr", "R", "Run tests matching substring", set_substr)));
- profile_options.add((option_t("update", "u", "Run only kernels missing from config", set_update)));
- profile_options.add((option_t("dry-run", "n", "Dry run. Respect other options, but don't write to file", set_dryrun)));
- profile_options.add((option_t("json", "j", "Write results to JSON file named as argument value", set_json)));
- profile_options.add((option_t("path", "p", "Specify the volk_config path", set_volk_config)));
+ profile_options.add(
+ option_t("benchmark", "b", "Run all kernels (benchmark mode)", set_benchmark));
+ profile_options.add(
+ option_t("tol", "t", "Set the default tolerance for all tests", set_tolerance));
+ profile_options.add(
+ option_t("vlen", "v", "Set the default vector length for tests", set_vlen));
+ profile_options.add((option_t(
+ "iter", "i", "Set the default number of test iterations per kernel", set_iter)));
+ profile_options.add(
+ (option_t("tests-substr", "R", "Run tests matching substring", set_substr)));
+ profile_options.add(
+ (option_t("update", "u", "Run only kernels missing from config", set_update)));
+ profile_options.add(
+ (option_t("dry-run",
+ "n",
+ "Dry run. Respect other options, but don't write to file",
+ set_dryrun)));
+ profile_options.add((option_t(
+ "json", "j", "Write results to JSON file named as argument value", set_json)));
+ profile_options.add(
+ (option_t("path", "p", "Specify the volk_config path", set_volk_config)));
profile_options.parse(argc, argv);
if (profile_options.present("help")) {
return 0;
}
- if(dry_run) {
- std::cout << "Warning: this IS a dry-run. Config will not be written!" << std::endl;
+ if (dry_run) {
+ std::cout << "Warning: this IS a dry-run. Config will not be written!"
+ << std::endl;
}
// Adding program options
std::ofstream json_file;
std::string config_file;
- if ( json_filename != "" ) {
- json_file.open( json_filename.c_str() );
+ if (json_filename != "") {
+ json_file.open(json_filename.c_str());
}
- if ( volk_config_path != "" ) {
+ if (volk_config_path != "") {
config_file = volk_config_path + "/volk_config";
}
// Run tests
std::vector<volk_test_results_t> results;
- if(update_mode) {
- if( config_file != "" ) read_results(&results, config_file);
- else read_results(&results);
+ if (update_mode) {
+ if (config_file != "")
+ read_results(&results, config_file);
+ else
+ read_results(&results);
}
// Initialize the list of tests
// Iterate through list of tests running each one
std::string substr_to_match(test_params.kernel_regex());
- for(unsigned int ii = 0; ii < test_cases.size(); ++ii) {
+ for (unsigned int ii = 0; ii < test_cases.size(); ++ii) {
bool regex_match = true;
volk_test_case_t test_case = test_cases[ii];
// if the kernel name matches regex then do the test
std::string test_case_name = test_case.name();
- if(test_case_name.find(substr_to_match) == std::string::npos) {
+ if (test_case_name.find(substr_to_match) == std::string::npos) {
regex_match = false;
}
// if we are in update mode check if we've already got results
// if we have any, then no need to test that kernel
bool update = true;
- if(update_mode) {
- for(unsigned int jj=0; jj < results.size(); ++jj) {
- if(results[jj].name == test_case.name() ||
+ if (update_mode) {
+ for (unsigned int jj = 0; jj < results.size(); ++jj) {
+ if (results[jj].name == test_case.name() ||
results[jj].name == test_case.puppet_master_name()) {
update = false;
break;
}
}
- if( regex_match && update ) {
+ if (regex_match && update) {
try {
- run_volk_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(),
- test_case.test_parameters(), &results, test_case.puppet_master_name());
- }
- catch (std::string &error) {
- std::cerr << "Caught Exception in 'run_volk_tests': " << error << std::endl;
+ run_volk_tests(test_case.desc(),
+ test_case.kernel_ptr(),
+ test_case.name(),
+ test_case.test_parameters(),
+ &results,
+ test_case.puppet_master_name());
+ } catch (std::string& error) {
+ std::cerr << "Caught Exception in 'run_volk_tests': " << error
+ << std::endl;
}
}
}
// Output results according to provided options
- if(json_filename != "") {
+ if (json_filename != "") {
write_json(json_file, results);
json_file.close();
}
- if(!dry_run) {
- if(config_file != "") write_results(&results, false, config_file);
- else write_results(&results, false);
- }
- else {
+ if (!dry_run) {
+ if (config_file != "")
+ write_results(&results, false, config_file);
+ else
+ write_results(&results, false);
+ } else {
std::cout << "Warning: this was a dry-run. Config not generated" << std::endl;
}
return 0;
}
-void read_results(std::vector<volk_test_results_t> *results)
+void read_results(std::vector<volk_test_results_t>* results)
{
char path[1024];
volk_get_config_path(path, true);
- if(path[0] == 0){
+ if (path[0] == 0) {
std::cout << "No prior test results found ..." << std::endl;
return;
}
read_results(results, std::string(path));
}
-void read_results(std::vector<volk_test_results_t> *results, std::string path)
+void read_results(std::vector<volk_test_results_t>* results, std::string path)
{
struct stat buffer;
- bool config_status = (stat (path.c_str(), &buffer) == 0);
+ bool config_status = (stat(path.c_str(), &buffer) == 0);
- if( config_status ) {
+ if (config_status) {
// a config exists and we are reading results from it
std::ifstream config(path.c_str());
char config_line[256];
- while(config.getline(config_line, 255)) {
+ while (config.getline(config_line, 255)) {
// tokenize the input line by kernel_name unaligned aligned
// then push back in the results vector with fields filled in
std::string config_str(config_line);
std::size_t str_size = config_str.size();
std::size_t found = config_str.find(' ');
-
+
// Split line by spaces
- while(found && found < str_size) {
+ while (found && found < str_size) {
found = config_str.find(' ');
// kernel names MUST be less than 128 chars, which is
// a length restricted by volk/volk_prefs.c
// on the last token in the parsed string we won't find a space
// so make sure we copy at most 128 chars.
- if(found > 127) {
+ if (found > 127) {
found = 127;
}
str_size = config_str.size();
- char buffer[128] = {'\0'};
+ char buffer[128] = { '\0' };
config_str.copy(buffer, found + 1, 0);
buffer[found] = '\0';
single_kernel_result.push_back(std::string(buffer));
- config_str.erase(0, found+1);
+ config_str.erase(0, found + 1);
}
- if(single_kernel_result.size() == 3) {
+ if (single_kernel_result.size() == 3) {
volk_test_results_t kernel_result;
kernel_result.name = std::string(single_kernel_result[0]);
kernel_result.config_name = std::string(single_kernel_result[0]);
}
}
-void write_results(const std::vector<volk_test_results_t> *results, bool update_result)
+void write_results(const std::vector<volk_test_results_t>* results, bool update_result)
{
char path[1024];
volk_get_config_path(path, false);
- if(path[0] == 0){
+ if (path[0] == 0) {
std::cout << "Aborting 'No config save path found' ..." << std::endl;
return;
}
- write_results( results, update_result, std::string(path));
+ write_results(results, update_result, std::string(path));
}
-void write_results(const std::vector<volk_test_results_t> *results, bool update_result, const std::string path)
+void write_results(const std::vector<volk_test_results_t>* results,
+ bool update_result,
+ const std::string path)
{
-// struct stat buffer;
-// bool config_status = (stat (path.c_str(), &buffer) == 0);
+ // struct stat buffer;
+ // bool config_status = (stat (path.c_str(), &buffer) == 0);
/*
* These
*/
const fs::path config_path(path);
- if (! fs::exists(config_path.parent_path()))
- {
+ if (!fs::exists(config_path.parent_path())) {
std::cout << "Creating " << config_path.parent_path() << "..." << std::endl;
fs::create_directories(config_path.parent_path());
}
std::ofstream config;
- if(update_result) {
+ if (update_result) {
std::cout << "Updating " << path << "..." << std::endl;
config.open(path.c_str(), std::ofstream::app);
- if (!config.is_open()) { //either we don't have write access or we don't have the dir yet
+ if (!config.is_open()) { // either we don't have write access or we don't have the
+ // dir yet
std::cout << "Error opening file " << path << std::endl;
}
- }
- else {
+ } else {
std::cout << "Writing " << path << "..." << std::endl;
config.open(path.c_str());
- if (!config.is_open()) { //either we don't have write access or we don't have the dir yet
+ if (!config.is_open()) { // either we don't have write access or we don't have the
+ // dir yet
std::cout << "Error opening file " << path << std::endl;
}
}
std::vector<volk_test_results_t>::const_iterator profile_results;
- for(profile_results = results->begin(); profile_results != results->end(); ++profile_results) {
- config << profile_results->config_name << " "
- << profile_results->best_arch_a << " "
- << profile_results->best_arch_u << std::endl;
+ for (profile_results = results->begin(); profile_results != results->end();
+ ++profile_results) {
+ config << profile_results->config_name << " " << profile_results->best_arch_a
+ << " " << profile_results->best_arch_u << std::endl;
}
config.close();
}
-void write_json(std::ofstream &json_file, std::vector<volk_test_results_t> results)
+void write_json(std::ofstream& json_file, std::vector<volk_test_results_t> results)
{
json_file << "{" << std::endl;
json_file << " \"volk_tests\": [" << std::endl;
size_t len = results.size();
size_t i = 0;
std::vector<volk_test_results_t>::iterator result;
- for(result = results.begin(); result != results.end(); ++result) {
+ for (result = results.begin(); result != results.end(); ++result) {
json_file << " {" << std::endl;
json_file << " \"name\": \"" << result->name << "\"," << std::endl;
json_file << " \"vlen\": " << (int)(result->vlen) << "," << std::endl;
json_file << " \"iter\": " << result->iter << "," << std::endl;
- json_file << " \"best_arch_a\": \"" << result->best_arch_a
- << "\"," << std::endl;
- json_file << " \"best_arch_u\": \"" << result->best_arch_u
- << "\"," << std::endl;
+ json_file << " \"best_arch_a\": \"" << result->best_arch_a << "\","
+ << std::endl;
+ json_file << " \"best_arch_u\": \"" << result->best_arch_u << "\","
+ << std::endl;
json_file << " \"results\": {" << std::endl;
size_t results_len = result->results.size();
size_t ri = 0;
std::map<std::string, volk_test_time_t>::iterator kernel_time_pair;
- for(kernel_time_pair = result->results.begin(); kernel_time_pair != result->results.end(); ++kernel_time_pair) {
+ for (kernel_time_pair = result->results.begin();
+ kernel_time_pair != result->results.end();
+ ++kernel_time_pair) {
volk_test_time_t time = kernel_time_pair->second;
json_file << " \"" << time.name << "\": {" << std::endl;
json_file << " \"name\": \"" << time.name << "\"," << std::endl;
json_file << " \"time\": " << time.time << "," << std::endl;
json_file << " \"units\": \"" << time.units << "\"" << std::endl;
- json_file << " }" ;
- if(ri+1 != results_len) {
+ json_file << " }";
+ if (ri + 1 != results_len) {
json_file << ",";
}
json_file << std::endl;
}
json_file << " }" << std::endl;
json_file << " }";
- if(i+1 != len) {
+ if (i + 1 != len) {
json_file << ",";
}
json_file << std::endl;
-#include <stdbool.h> // for bool
-#include <iosfwd> // for ofstream
-#include <string> // for string
-#include <vector> // for vector
+#include <stdbool.h> // for bool
+#include <iosfwd> // for ofstream
+#include <string> // for string
+#include <vector> // for vector
class volk_test_results_t;
-void read_results(std::vector<volk_test_results_t> *results);
-void read_results(std::vector<volk_test_results_t> *results, std::string path);
-void write_results(const std::vector<volk_test_results_t> *results, bool update_result);
-void write_results(const std::vector<volk_test_results_t> *results, bool update_result, const std::string path);
-void write_json(std::ofstream &json_file, std::vector<volk_test_results_t> results);
+void read_results(std::vector<volk_test_results_t>* results);
+void read_results(std::vector<volk_test_results_t>* results, std::string path);
+void write_results(const std::vector<volk_test_results_t>* results, bool update_result);
+void write_results(const std::vector<volk_test_results_t>* results,
+ bool update_result,
+ const std::string path);
+void write_json(std::ofstream& json_file, std::vector<volk_test_results_t> results);
// enable inline functions for C code
////////////////////////////////////////////////////////////////////////
#ifndef __cplusplus
-# define inline __inline
+#define inline __inline
#endif
////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////
#if _MSC_VER < 1800
#include <math.h>
-static inline long lrint(double x){return (long)(x > 0.0 ? x + 0.5 : x - 0.5);}
-static inline long lrintf(float x){return (long)(x > 0.0f ? x + 0.5f : x - 0.5f);}
-static inline long long llrint(double x){return (long long)(x > 0.0 ? x + 0.5 : x - 0.5);}
-static inline long long llrintf(float x){return (long long)(x > 0.0f ? x + 0.5f : x - 0.5f);}
-static inline double rint(double x){return (x > 0.0)? floor(x + 0.5) : ceil(x - 0.5);}
-static inline float rintf(float x){return (x > 0.0f)? floorf(x + 0.5f) : ceilf(x - 0.5f);}
+static inline long lrint(double x) { return (long)(x > 0.0 ? x + 0.5 : x - 0.5); }
+static inline long lrintf(float x) { return (long)(x > 0.0f ? x + 0.5f : x - 0.5f); }
+static inline long long llrint(double x)
+{
+ return (long long)(x > 0.0 ? x + 0.5 : x - 0.5);
+}
+static inline long long llrintf(float x)
+{
+ return (long long)(x > 0.0f ? x + 0.5f : x - 0.5f);
+}
+static inline double rint(double x) { return (x > 0.0) ? floor(x + 0.5) : ceil(x - 0.5); }
+static inline float rintf(float x)
+{
+ return (x > 0.0f) ? floorf(x + 0.5f) : ceilf(x - 0.5f);
+}
#endif
////////////////////////////////////////////////////////////////////////
// random and srandom
////////////////////////////////////////////////////////////////////////
#include <stdlib.h>
-static inline long int random (void) { return rand(); }
-static inline void srandom (unsigned int seed) { srand(seed); }
+static inline long int random(void) { return rand(); }
+static inline void srandom(unsigned int seed) { srand(seed); }
#endif // _MSC_CONFIG_H_
#define NOMINMAX
#endif
-//http://social.msdn.microsoft.com/Forums/en/vcgeneral/thread/430449b3-f6dd-4e18-84de-eebd26a8d668
+// http://social.msdn.microsoft.com/Forums/en/vcgeneral/thread/430449b3-f6dd-4e18-84de-eebd26a8d668
#include < time.h >
#include <windows.h> //I've omitted this line.
#if defined(_MSC_VER) || defined(_MSC_EXTENSIONS)
- #define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64
+#define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64
#else
- #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
+#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
#if _MSC_VER < 1900
struct timespec {
-time_t tv_sec; /* Seconds since 00:00:00 GMT, */
+ time_t tv_sec; /* Seconds since 00:00:00 GMT, */
-/* 1 January 1970 */
+ /* 1 January 1970 */
-long tv_nsec; /* Additional nanoseconds since */
-
-/* tv_sec */
+ long tv_nsec; /* Additional nanoseconds since */
+ /* tv_sec */
};
#endif
-struct timezone
-{
- int tz_minuteswest; /* minutes W of Greenwich */
- int tz_dsttime; /* type of dst correction */
+struct timezone {
+ int tz_minuteswest; /* minutes W of Greenwich */
+ int tz_dsttime; /* type of dst correction */
};
-static inline int gettimeofday(struct timeval *tv, struct timezone *tz)
+static inline int gettimeofday(struct timeval* tv, struct timezone* tz)
{
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- if (NULL != tz)
- {
- if (!tzflag)
- {
- _tzset();
- tzflag++;
+ FILETIME ft;
+ unsigned __int64 tmpres = 0;
+ static int tzflag;
+
+ if (NULL != tv) {
+ GetSystemTimeAsFileTime(&ft);
+
+ tmpres |= ft.dwHighDateTime;
+ tmpres <<= 32;
+ tmpres |= ft.dwLowDateTime;
+
+ /*converting file time to unix epoch*/
+ tmpres -= DELTA_EPOCH_IN_MICROSECS;
+ tv->tv_sec = (long)(tmpres / 1000000UL);
+ tv->tv_usec = (long)(tmpres % 1000000UL);
+ }
+
+ if (NULL != tz) {
+ if (!tzflag) {
+ _tzset();
+ tzflag++;
+ }
+ tz->tz_minuteswest = _timezone / 60;
+ tz->tz_dsttime = _daylight;
}
- tz->tz_minuteswest = _timezone / 60;
- tz->tz_dsttime = _daylight;
- }
- return 0;
+ return 0;
}
#endif //_MSC_SYS_TIME_H_
static inline int16_t sat_adds16i(int16_t x, int16_t y)
{
- int32_t res = (int32_t) x + (int32_t) y;
+ int32_t res = (int32_t)x + (int32_t)y;
- if (res < SHRT_MIN) res = SHRT_MIN;
- if (res > SHRT_MAX) res = SHRT_MAX;
+ if (res < SHRT_MIN)
+ res = SHRT_MIN;
+ if (res > SHRT_MAX)
+ res = SHRT_MAX;
return res;
}
static inline int16_t sat_muls16i(int16_t x, int16_t y)
{
- int32_t res = (int32_t) x * (int32_t) y;
+ int32_t res = (int32_t)x * (int32_t)y;
- if (res < SHRT_MIN) res = SHRT_MIN;
- if (res > SHRT_MAX) res = SHRT_MAX;
+ if (res < SHRT_MIN)
+ res = SHRT_MIN;
+ if (res > SHRT_MAX)
+ res = SHRT_MAX;
return res;
}
*/
template <class T>
struct alloc {
- typedef T value_type;
+ typedef T value_type;
- alloc() = default;
+ alloc() = default;
- template <class U> constexpr alloc(alloc<U> const&) noexcept {}
+ template <class U>
+ constexpr alloc(alloc<U> const&) noexcept
+ {
+ }
- T* allocate(std::size_t n) {
- if (n > std::numeric_limits<std::size_t>::max() / sizeof(T)) throw std::bad_alloc();
+ T* allocate(std::size_t n)
+ {
+ if (n > std::numeric_limits<std::size_t>::max() / sizeof(T))
+ throw std::bad_alloc();
- if (auto p = static_cast<T*>(volk_malloc(n*sizeof(T), volk_get_alignment())))
- return p;
+ if (auto p = static_cast<T*>(volk_malloc(n * sizeof(T), volk_get_alignment())))
+ return p;
- throw std::bad_alloc();
- }
+ throw std::bad_alloc();
+ }
- void deallocate(T* p, std::size_t) noexcept { volk_free(p); }
-
-} ;
+ void deallocate(T* p, std::size_t) noexcept { volk_free(p); }
+};
template <class T, class U>
-bool operator==(alloc<T> const&, alloc<U> const&) { return true; }
+bool operator==(alloc<T> const&, alloc<U> const&)
+{
+ return true;
+}
template <class T, class U>
-bool operator!=(alloc<T> const&, alloc<U> const&) { return false; }
+bool operator!=(alloc<T> const&, alloc<U> const&)
+{
+ return false;
+}
/*!
* example code:
* volk::vector<float> v(100); // vector using volk_malloc, volk_free
*/
-template<class T>
-using vector = std::vector<T, alloc<T> >;
+template <class T>
+using vector = std::vector<T, alloc<T>>;
} // namespace volk
#endif // INCLUDED_VOLK_ALLOC_H
/* -*- c++ -*- */
-/*
+/*
* Copyright 2015 Free Software Foundation, Inc.
- *
+ *
* This file is part of GNU Radio
- *
+ *
* GNU Radio is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3, or (at your option)
* any later version.
- *
+ *
* GNU Radio is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
- *
+ *
* You should have received a copy of the GNU General Public License
* along with GNU Radio; see the file COPYING. If not, write to
* the Free Software Foundation, Inc., 51 Franklin Street,
#ifndef INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
-#include <immintrin.h>
#include "volk/volk_avx_intrinsics.h"
+#include <immintrin.h>
-static inline __m256
-_mm256_polar_sign_mask_avx2(__m128i fbits){
- const __m128i zeros = _mm_set1_epi8(0x00);
- const __m128i sign_extract = _mm_set1_epi8(0x80);
- const __m256i shuffle_mask = _mm256_setr_epi8(0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x01, 0xff, 0xff, 0xff, 0x02, 0xff, 0xff, 0xff, 0x03,
- 0xff, 0xff, 0xff, 0x04, 0xff, 0xff, 0xff, 0x05, 0xff, 0xff, 0xff, 0x06, 0xff, 0xff, 0xff, 0x07);
- __m256i sign_bits = _mm256_setzero_si256();
-
- fbits = _mm_cmpgt_epi8(fbits, zeros);
- fbits = _mm_and_si128(fbits, sign_extract);
- sign_bits = _mm256_insertf128_si256(sign_bits,fbits,0);
- sign_bits = _mm256_insertf128_si256(sign_bits,fbits,1);
- sign_bits = _mm256_shuffle_epi8(sign_bits, shuffle_mask);
+static inline __m256 _mm256_polar_sign_mask_avx2(__m128i fbits)
+{
+ const __m128i zeros = _mm_set1_epi8(0x00);
+ const __m128i sign_extract = _mm_set1_epi8(0x80);
+ const __m256i shuffle_mask = _mm256_setr_epi8(0xff,
+ 0xff,
+ 0xff,
+ 0x00,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0x01,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0x02,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0x03,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0x04,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0x05,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0x06,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0x07);
+ __m256i sign_bits = _mm256_setzero_si256();
- return _mm256_castsi256_ps(sign_bits);
+ fbits = _mm_cmpgt_epi8(fbits, zeros);
+ fbits = _mm_and_si128(fbits, sign_extract);
+ sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 0);
+ sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 1);
+ sign_bits = _mm256_shuffle_epi8(sign_bits, shuffle_mask);
+
+ return _mm256_castsi256_ps(sign_bits);
}
static inline __m256
-_mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits){
+_mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits)
+{
// prepare sign mask for correct +-
__m256 sign_mask = _mm256_polar_sign_mask_avx2(fbits);
return dst;
}
-static inline __m256
-_mm256_magnitudesquared_ps_avx2(const __m256 cplxValue0, const __m256 cplxValue1){
- const __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
- const __m256 squared0 = _mm256_mul_ps(cplxValue0, cplxValue0); // Square the values
- const __m256 squared1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the Values
- const __m256 complex_result = _mm256_hadd_ps(squared0, squared1);
- return _mm256_permutevar8x32_ps(complex_result, idx);
+static inline __m256 _mm256_magnitudesquared_ps_avx2(const __m256 cplxValue0,
+ const __m256 cplxValue1)
+{
+ const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+ const __m256 squared0 = _mm256_mul_ps(cplxValue0, cplxValue0); // Square the values
+ const __m256 squared1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the Values
+ const __m256 complex_result = _mm256_hadd_ps(squared0, squared1);
+ return _mm256_permutevar8x32_ps(complex_result, idx);
}
-static inline __m256
-_mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar){
- /*
- * Calculate: |y - x|^2 * SNR_lin
- * Consider 'symbolsX' and 'pointsX' to be complex float
- * 'symbolsX' are 'y' and 'pointsX' are 'x'
- */
- const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
- const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
- const __m256 norms = _mm256_magnitudesquared_ps_avx2(diff0, diff1);
- return _mm256_mul_ps(norms, scalar);
+static inline __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0,
+ const __m256 symbols1,
+ const __m256 points0,
+ const __m256 points1,
+ const __m256 scalar)
+{
+ /*
+ * Calculate: |y - x|^2 * SNR_lin
+ * Consider 'symbolsX' and 'pointsX' to be complex float
+ * 'symbolsX' are 'y' and 'pointsX' are 'x'
+ */
+ const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
+ const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
+ const __m256 norms = _mm256_magnitudesquared_ps_avx2(diff0, diff1);
+ return _mm256_mul_ps(norms, scalar);
}
#endif /* INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ */
/* -*- c++ -*- */
-/*
+/*
* Copyright 2015 Free Software Foundation, Inc.
- *
+ *
* This file is part of GNU Radio
- *
+ *
* GNU Radio is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3, or (at your option)
* any later version.
- *
+ *
* GNU Radio is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
- *
+ *
* You should have received a copy of the GNU General Public License
* along with GNU Radio; see the file COPYING. If not, write to
* the Free Software Foundation, Inc., 51 Franklin Street,
#define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
#include <immintrin.h>
-static inline __m256
-_mm256_complexmul_ps(__m256 x, __m256 y)
+static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
{
- __m256 yl, yh, tmp1, tmp2;
- yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ...
- yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ...
- tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
- x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ...
- tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
- return _mm256_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ __m256 yl, yh, tmp1, tmp2;
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ...
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ...
+ tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
+ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ...
+ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+ return _mm256_addsub_ps(tmp1,
+ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
}
-static inline __m256
-_mm256_conjugate_ps(__m256 x){
- const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
- return _mm256_xor_ps(x, conjugator); // conjugate y
+static inline __m256 _mm256_conjugate_ps(__m256 x)
+{
+ const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
+ return _mm256_xor_ps(x, conjugator); // conjugate y
}
-static inline __m256
-_mm256_complexconjugatemul_ps(__m256 x, __m256 y){
- y = _mm256_conjugate_ps(y);
- return _mm256_complexmul_ps(x, y);
+static inline __m256 _mm256_complexconjugatemul_ps(__m256 x, __m256 y)
+{
+ y = _mm256_conjugate_ps(y);
+ return _mm256_complexmul_ps(x, y);
}
-static inline __m256
-_mm256_normalize_ps(__m256 val)
+static inline __m256 _mm256_normalize_ps(__m256 val)
{
- __m256 tmp1 = _mm256_mul_ps(val, val);
- tmp1 = _mm256_hadd_ps(tmp1, tmp1);
- tmp1 = _mm256_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(3, 1, 2, 0)); // equals 0xD8
- tmp1 = _mm256_sqrt_ps(tmp1);
- return _mm256_div_ps(val, tmp1);
+ __m256 tmp1 = _mm256_mul_ps(val, val);
+ tmp1 = _mm256_hadd_ps(tmp1, tmp1);
+ tmp1 = _mm256_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(3, 1, 2, 0)); // equals 0xD8
+ tmp1 = _mm256_sqrt_ps(tmp1);
+ return _mm256_div_ps(val, tmp1);
}
-static inline __m256
-_mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2){
- __m256 complex1, complex2;
- cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
- cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
- complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
- complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
- return _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values
+static inline __m256 _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2)
+{
+ __m256 complex1, complex2;
+ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+ return _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values
}
-static inline __m256
-_mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2){
- return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2));
+static inline __m256 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2)
+{
+ return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2));
}
-static inline __m256
-_mm256_scaled_norm_dist_ps(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar){
- /*
- * Calculate: |y - x|^2 * SNR_lin
- * Consider 'symbolsX' and 'pointsX' to be complex float
- * 'symbolsX' are 'y' and 'pointsX' are 'x'
- */
- const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
- const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
- const __m256 norms = _mm256_magnitudesquared_ps(diff0, diff1);
- return _mm256_mul_ps(norms, scalar);
+static inline __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0,
+ const __m256 symbols1,
+ const __m256 points0,
+ const __m256 points1,
+ const __m256 scalar)
+{
+ /*
+ * Calculate: |y - x|^2 * SNR_lin
+ * Consider 'symbolsX' and 'pointsX' to be complex float
+ * 'symbolsX' are 'y' and 'pointsX' are 'x'
+ */
+ const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
+ const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
+ const __m256 norms = _mm256_magnitudesquared_ps(diff0, diff1);
+ return _mm256_mul_ps(norms, scalar);
}
-static inline __m256
-_mm256_polar_sign_mask(__m128i fbits){
- __m256 sign_mask_dummy = _mm256_setzero_ps();
- const __m128i zeros = _mm_set1_epi8(0x00);
- const __m128i sign_extract = _mm_set1_epi8(0x80);
- const __m128i shuffle_mask0 = _mm_setr_epi8(0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x01, 0xff, 0xff, 0xff, 0x02, 0xff, 0xff, 0xff, 0x03);
- const __m128i shuffle_mask1 = _mm_setr_epi8(0xff, 0xff, 0xff, 0x04, 0xff, 0xff, 0xff, 0x05, 0xff, 0xff, 0xff, 0x06, 0xff, 0xff, 0xff, 0x07);
-
- fbits = _mm_cmpgt_epi8(fbits, zeros);
- fbits = _mm_and_si128(fbits, sign_extract);
- __m128i sign_bits0 = _mm_shuffle_epi8(fbits, shuffle_mask0);
- __m128i sign_bits1 = _mm_shuffle_epi8(fbits, shuffle_mask1);
-
- __m256 sign_mask = _mm256_insertf128_ps(sign_mask_dummy, _mm_castsi128_ps(sign_bits0), 0x0);
- return _mm256_insertf128_ps(sign_mask, _mm_castsi128_ps(sign_bits1), 0x1);
-// // This is the desired function call. Though it seems to be missing in GCC.
-// // Compare: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#
-// return _mm256_set_m128(_mm_castsi128_ps(sign_bits1), _mm_castsi128_ps(sign_bits0));
+static inline __m256 _mm256_polar_sign_mask(__m128i fbits)
+{
+ __m256 sign_mask_dummy = _mm256_setzero_ps();
+ const __m128i zeros = _mm_set1_epi8(0x00);
+ const __m128i sign_extract = _mm_set1_epi8(0x80);
+ const __m128i shuffle_mask0 = _mm_setr_epi8(0xff,
+ 0xff,
+ 0xff,
+ 0x00,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0x01,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0x02,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0x03);
+ const __m128i shuffle_mask1 = _mm_setr_epi8(0xff,
+ 0xff,
+ 0xff,
+ 0x04,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0x05,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0x06,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0x07);
+
+ fbits = _mm_cmpgt_epi8(fbits, zeros);
+ fbits = _mm_and_si128(fbits, sign_extract);
+ __m128i sign_bits0 = _mm_shuffle_epi8(fbits, shuffle_mask0);
+ __m128i sign_bits1 = _mm_shuffle_epi8(fbits, shuffle_mask1);
+
+ __m256 sign_mask =
+ _mm256_insertf128_ps(sign_mask_dummy, _mm_castsi128_ps(sign_bits0), 0x0);
+ return _mm256_insertf128_ps(sign_mask, _mm_castsi128_ps(sign_bits1), 0x1);
+ // // This is the desired function call. Though it seems to be missing in GCC.
+ // // Compare: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#
+ // return _mm256_set_m128(_mm_castsi128_ps(sign_bits1),
+ // _mm_castsi128_ps(sign_bits0));
}
static inline void
-_mm256_polar_deinterleave(__m256 *llr0, __m256 *llr1, __m256 src0, __m256 src1){
+_mm256_polar_deinterleave(__m256* llr0, __m256* llr1, __m256 src0, __m256 src1)
+{
// deinterleave values
__m256 part0 = _mm256_permute2f128_ps(src0, src1, 0x20);
__m256 part1 = _mm256_permute2f128_ps(src0, src1, 0x31);
*llr1 = _mm256_shuffle_ps(part0, part1, 0xdd);
}
-static inline __m256
-_mm256_polar_minsum_llrs(__m256 src0, __m256 src1){
+static inline __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1)
+{
const __m256 sign_mask = _mm256_set1_ps(-0.0f);
- const __m256 abs_mask = _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff)));
+ const __m256 abs_mask =
+ _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff)));
__m256 llr0, llr1;
_mm256_polar_deinterleave(&llr0, &llr1, src0, src1);
// calculate result
- __m256 sign = _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask));
- __m256 dst = _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask));
+ __m256 sign =
+ _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask));
+ __m256 dst =
+ _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask));
return _mm256_or_ps(dst, sign);
}
-static inline __m256
-_mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits){
+static inline __m256 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits)
+{
// prepare sign mask for correct +-
__m256 sign_mask = _mm256_polar_sign_mask(fbits);
// AppleClang also defines __GNUC__, so do this check first. These
// will probably be the same as for __GNUC__, but let's keep them
// separate just to be safe.
-# define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
-# define __VOLK_ATTR_UNUSED __attribute__((unused))
-# define __VOLK_ATTR_INLINE __attribute__((always_inline))
-# define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
-# define __VOLK_ASM __asm__
-# define __VOLK_VOLATILE __volatile__
-# define __VOLK_ATTR_EXPORT __attribute__((visibility("default")))
-# define __VOLK_ATTR_IMPORT __attribute__((visibility("default")))
-# define __VOLK_PREFETCH(addr) __builtin_prefetch(addr)
-#elif defined(__GNUC__)
-# define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
-# define __VOLK_ATTR_UNUSED __attribute__((unused))
-# define __VOLK_ATTR_INLINE __attribute__((always_inline))
-# define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
-# define __VOLK_ASM __asm__
-# define __VOLK_VOLATILE __volatile__
-# if __GNUC__ >= 4
-# define __VOLK_ATTR_EXPORT __attribute__((visibility("default")))
-# define __VOLK_ATTR_IMPORT __attribute__((visibility("default")))
-# else
-# define __VOLK_ATTR_EXPORT
-# define __VOLK_ATTR_IMPORT
-# endif
-# define __VOLK_PREFETCH(addr) __builtin_prefetch(addr)
+#define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
+#define __VOLK_ATTR_UNUSED __attribute__((unused))
+#define __VOLK_ATTR_INLINE __attribute__((always_inline))
+#define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
+#define __VOLK_ASM __asm__
+#define __VOLK_VOLATILE __volatile__
+#define __VOLK_ATTR_EXPORT __attribute__((visibility("default")))
+#define __VOLK_ATTR_IMPORT __attribute__((visibility("default")))
+#define __VOLK_PREFETCH(addr) __builtin_prefetch(addr)
+#elif defined __GNUC__
+#define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
+#define __VOLK_ATTR_UNUSED __attribute__((unused))
+#define __VOLK_ATTR_INLINE __attribute__((always_inline))
+#define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
+#define __VOLK_ASM __asm__
+#define __VOLK_VOLATILE __volatile__
+#if __GNUC__ >= 4
+#define __VOLK_ATTR_EXPORT __attribute__((visibility("default")))
+#define __VOLK_ATTR_IMPORT __attribute__((visibility("default")))
#else
-# warning "Unknown compiler. Using default VOLK macros, which may or not work."
-# define __VOLK_ATTR_ALIGNED(x)
-# define __VOLK_ATTR_UNUSED
-# define __VOLK_ATTR_INLINE
-# define __VOLK_ATTR_DEPRECATED
-# define __VOLK_ATTR_EXPORT
-# define __VOLK_ATTR_IMPORT
-# define __VOLK_PREFETCH(addr)
-# define __VOLK_ASM __asm__
-# define __VOLK_VOLATILE __volatile__
+#define __VOLK_ATTR_EXPORT
+#define __VOLK_ATTR_IMPORT
+#endif
+#define __VOLK_PREFETCH(addr) __builtin_prefetch(addr)
+#elif _MSC_VER
+#define __VOLK_ATTR_ALIGNED(x) __declspec(align(x))
+#define __VOLK_ATTR_UNUSED
+#define __VOLK_ATTR_INLINE __forceinline
+#define __VOLK_ATTR_DEPRECATED __declspec(deprecated)
+#define __VOLK_ATTR_EXPORT __declspec(dllexport)
+#define __VOLK_ATTR_IMPORT __declspec(dllimport)
+#define __VOLK_PREFETCH(addr)
+#define __VOLK_ASM __asm
+#define __VOLK_VOLATILE
+#else
+#define __VOLK_ATTR_ALIGNED(x)
+#define __VOLK_ATTR_UNUSED
+#define __VOLK_ATTR_INLINE
+#define __VOLK_ATTR_DEPRECATED
+#define __VOLK_ATTR_EXPORT
+#define __VOLK_ATTR_IMPORT
+#define __VOLK_PREFETCH(addr)
+#define __VOLK_ASM __asm__
+#define __VOLK_VOLATILE __volatile__
#endif
////////////////////////////////////////////////////////////////////////
// Ignore annoying warnings in MSVC
////////////////////////////////////////////////////////////////////////
#if defined(_MSC_VER)
-# pragma warning(disable: 4244) //'conversion' conversion from 'type1' to 'type2', possible loss of data
-# pragma warning(disable: 4305) //'identifier' : truncation from 'type1' to 'type2'
+#pragma warning(disable : 4244) //'conversion' conversion from 'type1' to 'type2',
+ //possible loss of data
+#pragma warning(disable : 4305) //'identifier' : truncation from 'type1' to 'type2'
#endif
////////////////////////////////////////////////////////////////////////
// C-linkage declaration macros
// FIXME: due to the usage of complex.h, require gcc for c-linkage
////////////////////////////////////////////////////////////////////////
-#if defined(__cplusplus) && (defined(__GNUC__) || defined(__clang__))
-# define __VOLK_DECL_BEGIN extern "C" {
-# define __VOLK_DECL_END }
+#if defined(__cplusplus) && (__GNUC__)
+#define __VOLK_DECL_BEGIN extern "C" {
+#define __VOLK_DECL_END }
#else
-# define __VOLK_DECL_BEGIN
-# define __VOLK_DECL_END
+#define __VOLK_DECL_BEGIN
+#define __VOLK_DECL_END
#endif
////////////////////////////////////////////////////////////////////////
// http://gcc.gnu.org/wiki/Visibility
////////////////////////////////////////////////////////////////////////
#ifdef volk_EXPORTS
-# define VOLK_API __VOLK_ATTR_EXPORT
+#define VOLK_API __VOLK_ATTR_EXPORT
#else
-# define VOLK_API __VOLK_ATTR_IMPORT
+#define VOLK_API __VOLK_ATTR_IMPORT
#endif
////////////////////////////////////////////////////////////////////////
#endif
#endif
-union bit128{
- uint8_t i8[16];
- uint16_t i16[8];
- uint32_t i[4];
- float f[4];
- double d[2];
+union bit128 {
+ uint8_t i8[16];
+ uint16_t i16[8];
+ uint32_t i[4];
+ float f[4];
+ double d[2];
- #ifdef LV_HAVE_SSE
- __m128 float_vec;
- #endif
+#ifdef LV_HAVE_SSE
+ __m128 float_vec;
+#endif
- #ifdef LV_HAVE_SSE2
- __m128i int_vec;
- __m128d double_vec;
- #endif
+#ifdef LV_HAVE_SSE2
+ __m128i int_vec;
+ __m128d double_vec;
+#endif
};
-union bit256{
- uint8_t i8[32];
- uint16_t i16[16];
- uint32_t i[8];
- float f[8];
- double d[4];
+union bit256 {
+ uint8_t i8[32];
+ uint16_t i16[16];
+ uint32_t i[8];
+ float f[8];
+ double d[4];
- #ifdef LV_HAVE_AVX
- __m256 float_vec;
- __m256i int_vec;
- __m256d double_vec;
- #endif
+#ifdef LV_HAVE_AVX
+ __m256 float_vec;
+ __m256i int_vec;
+ __m256d double_vec;
+#endif
};
-#define bit128_p(x) ((union bit128 *)(x))
-#define bit256_p(x) ((union bit256 *)(x))
+#define bit128_p(x) ((union bit128*)(x))
+#define bit256_p(x) ((union bit256*)(x))
#endif /*INCLUDED_LIBVOLK_COMMON_H*/
#ifdef __cplusplus
-#include <complex>
#include <stdint.h>
+#include <complex>
-typedef std::complex<int8_t> lv_8sc_t;
+typedef std::complex<int8_t> lv_8sc_t;
typedef std::complex<int16_t> lv_16sc_t;
typedef std::complex<int32_t> lv_32sc_t;
typedef std::complex<int64_t> lv_64sc_t;
-typedef std::complex<float> lv_32fc_t;
-typedef std::complex<double> lv_64fc_t;
+typedef std::complex<float> lv_32fc_t;
+typedef std::complex<double> lv_64fc_t;
-template <typename T> inline std::complex<T> lv_cmake(const T &r, const T &i){
+template <typename T>
+inline std::complex<T> lv_cmake(const T& r, const T& i)
+{
return std::complex<T>(r, i);
}
-template <typename T> inline typename T::value_type lv_creal(const T &x){
+template <typename T>
+inline typename T::value_type lv_creal(const T& x)
+{
return x.real();
}
-template <typename T> inline typename T::value_type lv_cimag(const T &x){
+template <typename T>
+inline typename T::value_type lv_cimag(const T& x)
+{
return x.imag();
}
-template <typename T> inline T lv_conj(const T &x){
+template <typename T>
+inline T lv_conj(const T& x)
+{
return std::conj(x);
}
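For reference, a minimal sketch of how these C++ helpers compose (illustrative only; it assumes the typedefs and templates above are reachable via <volk/volk_complex.h>):

#include <volk/volk_complex.h>
#include <cstdio>

int main()
{
    lv_32fc_t a = lv_cmake(3.0f, 4.0f); // 3 + 4i
    lv_32fc_t b = lv_conj(a);           // 3 - 4i
    float re = lv_creal(a);             // 3.0f
    float im = lv_cimag(a);             // 4.0f
    // a * b = |a|^2 = 25
    std::printf("%f %f %f\n", re, im, lv_creal(a * b));
    return 0;
}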
#else /* __cplusplus */
#if __STDC_VERSION__ >= 199901L /* C99 check */
-/* this allows us to conj in lv_conj without the double detour for single-precision floats */
+/* this allows us to conj in lv_conj without the double detour for single-precision floats
+ */
#include <tgmath.h>
#endif /* C99 check */
#include <complex.h>
-typedef char complex lv_8sc_t;
-typedef short complex lv_16sc_t;
-typedef long complex lv_32sc_t;
-typedef long long complex lv_64sc_t;
-typedef float complex lv_32fc_t;
-typedef double complex lv_64fc_t;
+typedef char complex lv_8sc_t;
+typedef short complex lv_16sc_t;
+typedef long complex lv_32sc_t;
+typedef long long complex lv_64sc_t;
+typedef float complex lv_32fc_t;
+typedef double complex lv_64fc_t;
-#define lv_cmake(r, i) ((r) + _Complex_I*(i))
+#define lv_cmake(r, i) ((r) + _Complex_I * (i))
// When GNUC is available, use the complex extensions.
// The extensions always return the correct value type.
#ifndef INCLUDED_VOLK_MALLOC_H
#define INCLUDED_VOLK_MALLOC_H
-#include <volk/volk_common.h>
#include <stdlib.h>
+#include <volk/volk_common.h>
__VOLK_DECL_BEGIN
* For Apple Clang, we fall back to `posix_memalign`.
* see: https://linux.die.net/man/3/aligned_alloc
* For MSVC, we fall back to `_aligned_malloc`.
- * see: https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-malloc?view=vs-2019
+ * see:
+ * https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-malloc?view=vs-2019
*
* Because of the ways in which volk_malloc may allocate memory, it is
* important to always free volk_malloc pointers using volk_free.
* \param alignment The byte alignment of the allocated memory.
* \return pointer to aligned memory.
*/
-VOLK_API void *volk_malloc(size_t size, size_t alignment);
+VOLK_API void* volk_malloc(size_t size, size_t alignment);
/*!
 * \brief Frees memory allocated by volk_malloc.
* Thus, in this case `volk_free` inherits the same behavior `free` exhibits.
* see: https://en.cppreference.com/w/c/memory/free
* In case `_aligned_malloc` was used, we call `_aligned_free`.
- * see: https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-free?view=vs-2019
+ * see:
+ * https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-free?view=vs-2019
*
* \param aptr The aligned pointer allocated by volk_malloc.
*/
-VOLK_API void volk_free(void *aptr);
+VOLK_API void volk_free(void* aptr);
__VOLK_DECL_END
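A minimal usage sketch of the allocation API above (illustrative; it assumes <volk/volk.h> provides volk_get_alignment(), as used elsewhere in the library):

#include <volk/volk.h>
#include <volk/volk_malloc.h>

void example(unsigned int num_points)
{
    // request a buffer aligned for the widest SIMD unit available
    size_t alignment = volk_get_alignment();
    float* buf = (float*)volk_malloc(num_points * sizeof(float), alignment);
    if (buf) {
        for (unsigned int i = 0; i < num_points; ++i)
            buf[i] = 0.0f;
        volk_free(buf); // always pair volk_malloc with volk_free
    }
}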
3. This notice may not be removed or altered from any source distribution.
(this is the zlib license)
-
+
_vsincosq_f32
-
+
*/
/*
/* Magnitude squared for float32x4x2_t */
-static inline float32x4_t
-_vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
+static inline float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
{
float32x4_t iValue, qValue, result;
iValue = vmulq_f32(cmplxValue.val[0], cmplxValue.val[0]); // Square the values
qValue = vmulq_f32(cmplxValue.val[1], cmplxValue.val[1]); // Square the values
- result = vaddq_f32(iValue, qValue); // Add the I2 and Q2 values
+ result = vaddq_f32(iValue, qValue); // Add the I2 and Q2 values
return result;
}
static inline float32x4_t _vinvsqrtq_f32(float32x4_t x)
{
float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
- sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
- sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
-
+ sqrt_reciprocal = vmulq_f32(
+ vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+ sqrt_reciprocal = vmulq_f32(
+ vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+
return sqrt_reciprocal;
}
{
// Newton's method
float32x4_t recip = vrecpeq_f32(x);
- recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
- recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
+ recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
+ recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
return recip;
}
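Both helpers above refine ARM's initial estimates with Newton-Raphson steps: vrecpsq_f32(x, r) returns 2 - x*r, so r*(2 - x*r) is one refinement of 1/x, and vrsqrtsq_f32(x*r, r) returns (3 - x*r*r)/2 for 1/sqrt(x). A scalar sketch of the same refinement steps (illustrative only, not part of the header):

// One Newton-Raphson step for 1/x, mirroring
// recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
static inline float nr_recip_step(float x, float r) { return r * (2.0f - x * r); }

// One Newton-Raphson step for 1/sqrt(x), mirroring the vrsqrtsq_f32 lines above.
static inline float nr_rsqrt_step(float x, float r)
{
    return r * (3.0f - x * r * r) * 0.5f;
}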
/* Complex multiplication for float32x4x2_t */
-static inline float32x4x2_t
-_vmultiply_complexq_f32(float32x4x2_t a_val, float32x4x2_t b_val)
+static inline float32x4x2_t _vmultiply_complexq_f32(float32x4x2_t a_val,
+ float32x4x2_t b_val)
{
float32x4x2_t tmp_real;
float32x4x2_t tmp_imag;
float32x4x2_t c_val;
-
+
// multiply the real*real and imag*imag to get real result
// a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
/* From ARM Compute Library, MIT license */
static inline float32x4_t _vtaylor_polyq_f32(float32x4_t x, const float32x4_t coeffs[8])
{
- float32x4_t cA = vmlaq_f32(coeffs[0], coeffs[4], x);
- float32x4_t cB = vmlaq_f32(coeffs[2], coeffs[6], x);
- float32x4_t cC = vmlaq_f32(coeffs[1], coeffs[5], x);
- float32x4_t cD = vmlaq_f32(coeffs[3], coeffs[7], x);
- float32x4_t x2 = vmulq_f32(x, x);
- float32x4_t x4 = vmulq_f32(x2, x2);
+ float32x4_t cA = vmlaq_f32(coeffs[0], coeffs[4], x);
+ float32x4_t cB = vmlaq_f32(coeffs[2], coeffs[6], x);
+ float32x4_t cC = vmlaq_f32(coeffs[1], coeffs[5], x);
+ float32x4_t cD = vmlaq_f32(coeffs[3], coeffs[7], x);
+ float32x4_t x2 = vmulq_f32(x, x);
+ float32x4_t x4 = vmulq_f32(x2, x2);
float32x4_t res = vmlaq_f32(vmlaq_f32(cA, cB, x2), vmlaq_f32(cC, cD, x2), x4);
return res;
}
static inline float32x4_t _vlogq_f32(float32x4_t x)
{
const float32x4_t log_tab[8] = {
- vdupq_n_f32(-2.29561495781f),
- vdupq_n_f32(-2.47071170807f),
- vdupq_n_f32(-5.68692588806f),
- vdupq_n_f32(-0.165253549814f),
- vdupq_n_f32(5.17591238022f),
- vdupq_n_f32(0.844007015228f),
- vdupq_n_f32(4.58445882797f),
- vdupq_n_f32(0.0141278216615f),
+ vdupq_n_f32(-2.29561495781f), vdupq_n_f32(-2.47071170807f),
+ vdupq_n_f32(-5.68692588806f), vdupq_n_f32(-0.165253549814f),
+ vdupq_n_f32(5.17591238022f), vdupq_n_f32(0.844007015228f),
+ vdupq_n_f32(4.58445882797f), vdupq_n_f32(0.0141278216615f),
};
-
- const int32x4_t CONST_127 = vdupq_n_s32(127); // 127
+
+ const int32x4_t CONST_127 = vdupq_n_s32(127); // 127
const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)
-
+
// Extract exponent
- int32x4_t m = vsubq_s32(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127);
- float32x4_t val = vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));
-
+ int32x4_t m = vsubq_s32(
+ vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127);
+ float32x4_t val =
+ vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));
+
// Polynomial Approximation
float32x4_t poly = _vtaylor_polyq_f32(val, log_tab);
-
+
// Reconstruct
poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);
-
+
return poly;
}
/* Evaluation of 4 sines & cosines at once.
* Optimized from here (zlib license)
* http://gruntthepeon.free.fr/ssemath/ */
-static inline float32x4x2_t _vsincosq_f32(float32x4_t x) {
+static inline float32x4x2_t _vsincosq_f32(float32x4_t x)
+{
const float32x4_t c_minus_cephes_DP1 = vdupq_n_f32(-0.78515625);
const float32x4_t c_minus_cephes_DP2 = vdupq_n_f32(-2.4187564849853515625e-4);
const float32x4_t c_minus_cephes_DP3 = vdupq_n_f32(-3.77489497744594108e-8);
const float32x4_t c_sincof_p0 = vdupq_n_f32(-1.9515295891e-4);
- const float32x4_t c_sincof_p1 = vdupq_n_f32(8.3321608736e-3);
+ const float32x4_t c_sincof_p1 = vdupq_n_f32(8.3321608736e-3);
const float32x4_t c_sincof_p2 = vdupq_n_f32(-1.6666654611e-1);
const float32x4_t c_coscof_p0 = vdupq_n_f32(2.443315711809948e-005);
const float32x4_t c_coscof_p1 = vdupq_n_f32(-1.388731625493765e-003);
const float32x4_t c_coscof_p2 = vdupq_n_f32(4.166664568298827e-002);
const float32x4_t c_cephes_FOPI = vdupq_n_f32(1.27323954473516); // 4 / M_PI
-
+
const float32x4_t CONST_1 = vdupq_n_f32(1.f);
const float32x4_t CONST_1_2 = vdupq_n_f32(0.5f);
const float32x4_t CONST_0 = vdupq_n_f32(0.f);
- const uint32x4_t CONST_2 = vdupq_n_u32(2);
- const uint32x4_t CONST_4 = vdupq_n_u32(4);
-
+ const uint32x4_t CONST_2 = vdupq_n_u32(2);
+ const uint32x4_t CONST_4 = vdupq_n_u32(4);
+
uint32x4_t emm2;
-
+
uint32x4_t sign_mask_sin, sign_mask_cos;
sign_mask_sin = vcltq_f32(x, CONST_0);
x = vabsq_f32(x);
// scale by 4/pi
float32x4_t y = vmulq_f32(x, c_cephes_FOPI);
-
+
// store the integer part of y in mm0
emm2 = vcvtq_u32_f32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
emm2 = vaddq_u32(emm2, vdupq_n_u32(1));
emm2 = vandq_u32(emm2, vdupq_n_u32(~1));
y = vcvtq_f32_u32(emm2);
-
+
    /* get the polynomial selection mask
       there is one polynomial for 0 <= x <= Pi/4
       and another one for Pi/4 < x <= Pi/2
Both branches will be computed. */
const uint32x4_t poly_mask = vtstq_u32(emm2, CONST_2);
-
+
// The magic pass: "Extended precision modular arithmetic"
x = vmlaq_f32(x, y, c_minus_cephes_DP1);
x = vmlaq_f32(x, y, c_minus_cephes_DP2);
x = vmlaq_f32(x, y, c_minus_cephes_DP3);
-
+
sign_mask_sin = veorq_u32(sign_mask_sin, vtstq_u32(emm2, CONST_4));
sign_mask_cos = vtstq_u32(vsubq_u32(emm2, CONST_2), CONST_4);
-
+
    /* Evaluate the first polynomial (0 <= x <= Pi/4) in y1,
       and the second polynomial (Pi/4 < x <= Pi/2) in y2 */
float32x4_t y1, y2;
- float32x4_t z = vmulq_f32(x,x);
-
+ float32x4_t z = vmulq_f32(x, x);
+
y1 = vmlaq_f32(c_coscof_p1, z, c_coscof_p0);
y1 = vmlaq_f32(c_coscof_p2, z, y1);
y1 = vmulq_f32(y1, z);
y1 = vmulq_f32(y1, z);
y1 = vmlsq_f32(y1, z, CONST_1_2);
y1 = vaddq_f32(y1, CONST_1);
-
+
y2 = vmlaq_f32(c_sincof_p1, z, c_sincof_p0);
y2 = vmlaq_f32(c_sincof_p2, z, y2);
y2 = vmulq_f32(y2, z);
y2 = vmlaq_f32(x, x, y2);
-
+
    /* select the correct result from the two polynomials */
const float32x4_t ys = vbslq_f32(poly_mask, y1, y2);
const float32x4_t yc = vbslq_f32(poly_mask, y2, y1);
-
+
float32x4x2_t sincos;
sincos.val[0] = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
sincos.val[1] = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));
-
+
return sincos;
}
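The three c_minus_cephes_DP constants above sum to approximately -pi/4, so the "magic pass" subtracts y*pi/4 in three pieces, a Cody-Waite style range reduction that keeps the reduced argument accurate in single precision. A scalar sketch of that step (illustrative only):

// Cody-Waite range reduction: x - y*(pi/4), accumulated in three pieces.
static inline float reduce_arg(float x, float y)
{
    x += y * -0.78515625f;               // DP1
    x += y * -2.4187564849853515625e-4f; // DP2
    x += y * -3.77489497744594108e-8f;   // DP3
    return x;
}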
-static inline float32x4_t _vsinq_f32(float32x4_t x) {
+static inline float32x4_t _vsinq_f32(float32x4_t x)
+{
const float32x4x2_t sincos = _vsincosq_f32(x);
return sincos.val[0];
}
-static inline float32x4_t _vcosq_f32(float32x4_t x) {
+static inline float32x4_t _vcosq_f32(float32x4_t x)
+{
const float32x4x2_t sincos = _vsincosq_f32(x);
return sincos.val[1];
}
-static inline float32x4_t _vtanq_f32(float32x4_t x) {
+static inline float32x4_t _vtanq_f32(float32x4_t x)
+{
const float32x4x2_t sincos = _vsincosq_f32(x);
return vmulq_f32(sincos.val[0], _vinvq_f32(sincos.val[1]));
}
#ifndef INCLUDED_VOLK_PREFS_H
#define INCLUDED_VOLK_PREFS_H
-#include <volk/volk_common.h>
#include <stdbool.h>
#include <stdlib.h>
+#include <volk/volk_common.h>
__VOLK_DECL_BEGIN
-typedef struct volk_arch_pref
-{
- char name[128]; //name of the kernel
- char impl_a[128]; //best aligned impl
- char impl_u[128]; //best unaligned impl
+typedef struct volk_arch_pref {
+ char name[128]; // name of the kernel
+ char impl_a[128]; // best aligned impl
+ char impl_u[128]; // best unaligned impl
} volk_arch_pref_t;
////////////////////////////////////////////////////////////////////////
// Writes the config file path into the first argument; the bool selects whether
// the file must already exist and be readable. Returns \0 in the argument on failure.
////////////////////////////////////////////////////////////////////////
-VOLK_API void volk_get_config_path(char *, bool);
+VOLK_API void volk_get_config_path(char*, bool);
////////////////////////////////////////////////////////////////////////
// load prefs into global prefs struct
////////////////////////////////////////////////////////////////////////
-VOLK_API size_t volk_load_preferences(volk_arch_pref_t **);
+VOLK_API size_t volk_load_preferences(volk_arch_pref_t**);
__VOLK_DECL_END
-#endif //INCLUDED_VOLK_PREFS_H
+#endif // INCLUDED_VOLK_PREFS_H
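A minimal sketch of reading the preferences declared above (illustrative; the path buffer size is an assumption, and ownership of the returned array follows the library's implementation):

#include <volk/volk_prefs.h>
#include <cstdio>

void dump_prefs()
{
    char path[512]; // assumed large enough for the config path
    volk_get_config_path(path, true);
    if (path[0] == '\0')
        return; // no readable config file

    volk_arch_pref_t* prefs = NULL;
    size_t n = volk_load_preferences(&prefs);
    for (size_t i = 0; i < n; ++i)
        std::printf("%s: aligned=%s unaligned=%s\n",
                    prefs[i].name,
                    prefs[i].impl_a,
                    prefs[i].impl_u);
}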
/* -*- c++ -*- */
-/*
+/*
* Copyright 2015 Free Software Foundation, Inc.
- *
+ *
* This file is part of GNU Radio
- *
+ *
* GNU Radio is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3, or (at your option)
* any later version.
- *
+ *
* GNU Radio is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
- *
+ *
* You should have received a copy of the GNU General Public License
* along with GNU Radio; see the file COPYING. If not, write to
* the Free Software Foundation, Inc., 51 Franklin Street,
#define INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_
#include <pmmintrin.h>
-static inline __m128
-_mm_complexmul_ps(__m128 x, __m128 y)
+static inline __m128 _mm_complexmul_ps(__m128 x, __m128 y)
{
- __m128 yl, yh, tmp1, tmp2;
- yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
- yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
- tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
- x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
- tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
- return _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ __m128 yl, yh, tmp1, tmp2;
+ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+ tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+ x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+ tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+ return _mm_addsub_ps(tmp1,
+ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
}
-static inline __m128
-_mm_complexconjugatemul_ps(__m128 x, __m128 y)
+static inline __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y)
{
- const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
- y = _mm_xor_ps(y, conjugator); // conjugate y
- return _mm_complexmul_ps(x, y);
+ const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
+ y = _mm_xor_ps(y, conjugator); // conjugate y
+ return _mm_complexmul_ps(x, y);
}
-static inline __m128
-_mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2){
- cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
- cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
- return _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+static inline __m128 _mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
+{
+ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+ return _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
}
-static inline __m128
-_mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2){
- return _mm_sqrt_ps(_mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2));
+static inline __m128 _mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
+{
+ return _mm_sqrt_ps(_mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2));
}
-static inline __m128
-_mm_scaled_norm_dist_ps_sse3(const __m128 symbols0, const __m128 symbols1, const __m128 points0, const __m128 points1, const __m128 scalar){
- /*
- * Calculate: |y - x|^2 * SNR_lin
- * Consider 'symbolsX' and 'pointsX' to be complex float
- * 'symbolsX' are 'y' and 'pointsX' are 'x'
- */
- const __m128 diff0 = _mm_sub_ps(symbols0, points0);
- const __m128 diff1 = _mm_sub_ps(symbols1, points1);
- const __m128 norms = _mm_magnitudesquared_ps_sse3(diff0, diff1);
- return _mm_mul_ps(norms, scalar);
+static inline __m128 _mm_scaled_norm_dist_ps_sse3(const __m128 symbols0,
+ const __m128 symbols1,
+ const __m128 points0,
+ const __m128 points1,
+ const __m128 scalar)
+{
+ /*
+ * Calculate: |y - x|^2 * SNR_lin
+ * Consider 'symbolsX' and 'pointsX' to be complex float
+ * 'symbolsX' are 'y' and 'pointsX' are 'x'
+ */
+ const __m128 diff0 = _mm_sub_ps(symbols0, points0);
+ const __m128 diff1 = _mm_sub_ps(symbols1, points1);
+ const __m128 norms = _mm_magnitudesquared_ps_sse3(diff0, diff1);
+ return _mm_mul_ps(norms, scalar);
}
#endif /* INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_ */
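For clarity, a scalar reference of what _mm_scaled_norm_dist_ps_sse3 computes for each complex sample pair, i.e. |y - x|^2 * scalar (illustrative only):

#include <complex>

// Scalar equivalent of one lane of _mm_scaled_norm_dist_ps_sse3.
static inline float scaled_norm_dist(std::complex<float> symbol,
                                     std::complex<float> point,
                                     float scalar)
{
    const std::complex<float> diff = symbol - point;
    return (diff.real() * diff.real() + diff.imag() * diff.imag()) * scalar;
}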
/* -*- c++ -*- */
-/*
+/*
* Copyright 2015 Free Software Foundation, Inc.
- *
+ *
* This file is part of GNU Radio
- *
+ *
* GNU Radio is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3, or (at your option)
* any later version.
- *
+ *
* GNU Radio is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
- *
+ *
* You should have received a copy of the GNU General Public License
* along with GNU Radio; see the file COPYING. If not, write to
* the Free Software Foundation, Inc., 51 Franklin Street,
#define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
#include <xmmintrin.h>
-static inline __m128
-_mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2){
- __m128 iValue, qValue;
- // Arrange in i1i2i3i4 format
- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
- // Arrange in q1q2q3q4 format
- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
- iValue = _mm_mul_ps(iValue, iValue); // Square the I values
- qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
- return _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+static inline __m128 _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2)
+{
+ __m128 iValue, qValue;
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+ // Arrange in q1q2q3q4 format
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
+ iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+ qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+ return _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
}
-static inline __m128
-_mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2){
- return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2));
+static inline __m128 _mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2)
+{
+ return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2));
}
-static inline __m128
-_mm_scaled_norm_dist_ps_sse(const __m128 symbols0, const __m128 symbols1, const __m128 points0, const __m128 points1, const __m128 scalar)
+static inline __m128 _mm_scaled_norm_dist_ps_sse(const __m128 symbols0,
+ const __m128 symbols1,
+ const __m128 points0,
+ const __m128 points1,
+ const __m128 scalar)
{
- // calculate scalar * |x - y|^2
- const __m128 diff0 = _mm_sub_ps(symbols0, points0);
- const __m128 diff1 = _mm_sub_ps(symbols1, points1);
- const __m128 norms = _mm_magnitudesquared_ps(diff0, diff1);
- return _mm_mul_ps(norms, scalar);
+ // calculate scalar * |x - y|^2
+ const __m128 diff0 = _mm_sub_ps(symbols0, points0);
+ const __m128 diff1 = _mm_sub_ps(symbols1, points1);
+ const __m128 norms = _mm_magnitudesquared_ps(diff0, diff1);
+ return _mm_mul_ps(norms, scalar);
}
#endif /* INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_ */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_16i_32fc_dot_prod_32fc(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points)
- * \endcode
+ * void volk_16i_32fc_dot_prod_32fc(lv_32fc_t* result, const short* input, const lv_32fc_t
+ * * taps, unsigned int num_points) \endcode
*
* \b Inputs
* \li input: vector of shorts.
#ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H
#define INCLUDED_volk_16i_32fc_dot_prod_32fc_H
-#include <volk/volk_common.h>
#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_GENERIC
-static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) {
+static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result,
+ const short* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
- static const int N_UNROLL = 4;
+ static const int N_UNROLL = 4;
- lv_32fc_t acc0 = 0;
- lv_32fc_t acc1 = 0;
- lv_32fc_t acc2 = 0;
- lv_32fc_t acc3 = 0;
+ lv_32fc_t acc0 = 0;
+ lv_32fc_t acc1 = 0;
+ lv_32fc_t acc2 = 0;
+ lv_32fc_t acc3 = 0;
- unsigned i = 0;
- unsigned n = (num_points / N_UNROLL) * N_UNROLL;
+ unsigned i = 0;
+ unsigned n = (num_points / N_UNROLL) * N_UNROLL;
- for(i = 0; i < n; i += N_UNROLL) {
- acc0 += taps[i + 0] * (float)input[i + 0];
- acc1 += taps[i + 1] * (float)input[i + 1];
- acc2 += taps[i + 2] * (float)input[i + 2];
- acc3 += taps[i + 3] * (float)input[i + 3];
- }
+ for (i = 0; i < n; i += N_UNROLL) {
+ acc0 += taps[i + 0] * (float)input[i + 0];
+ acc1 += taps[i + 1] * (float)input[i + 1];
+ acc2 += taps[i + 2] * (float)input[i + 2];
+ acc3 += taps[i + 3] * (float)input[i + 3];
+ }
- for(; i < num_points; i++) {
- acc0 += taps[i] * (float)input[i];
- }
+ for (; i < num_points; i++) {
+ acc0 += taps[i] * (float)input[i];
+ }
- *result = acc0 + acc1 + acc2 + acc3;
+ *result = acc0 + acc1 + acc2 + acc3;
}
#endif /*LV_HAVE_GENERIC*/
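A small usage sketch of the generic kernel above, which computes result = sum_i taps[i] * (float)input[i] (illustrative; real code would normally call the volk_16i_32fc_dot_prod_32fc dispatcher rather than one specific implementation, and this assumes LV_HAVE_GENERIC is defined):

#include <volk/volk_16i_32fc_dot_prod_32fc.h>
#include <cstdio>

void dot_prod_example()
{
    const short input[4] = { 1, 2, 3, 4 };
    const lv_32fc_t taps[4] = { lv_cmake(1.0f, 0.0f),
                                lv_cmake(0.0f, 1.0f),
                                lv_cmake(1.0f, 0.0f),
                                lv_cmake(0.0f, 1.0f) };
    lv_32fc_t result;
    volk_16i_32fc_dot_prod_32fc_generic(&result, input, taps, 4);
    // result = 1*1 + 2*i + 3*1 + 4*i = 4 + 6i
    std::printf("%f %f\n", lv_creal(result), lv_cimag(result));
}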
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) {
-
- unsigned ii;
- unsigned quarter_points = num_points / 4;
- lv_32fc_t* tapsPtr = (lv_32fc_t*) taps;
- short* inputPtr = (short*) input;
- lv_32fc_t accumulator_vec[4];
-
- float32x4x2_t tapsVal, accumulator_val;
- int16x4_t input16;
- int32x4_t input32;
- float32x4_t input_float, prod_re, prod_im;
-
- accumulator_val.val[0] = vdupq_n_f32(0.0);
- accumulator_val.val[1] = vdupq_n_f32(0.0);
-
- for(ii = 0; ii < quarter_points; ++ii) {
- tapsVal = vld2q_f32((float*)tapsPtr);
- input16 = vld1_s16(inputPtr);
- // widen 16-bit int to 32-bit int
- input32 = vmovl_s16(input16);
- // convert 32-bit int to float with scale
- input_float = vcvtq_f32_s32(input32);
-
- prod_re = vmulq_f32(input_float, tapsVal.val[0]);
- prod_im = vmulq_f32(input_float, tapsVal.val[1]);
-
- accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]);
- accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]);
-
- tapsPtr += 4;
- inputPtr += 4;
- }
- vst2q_f32((float*)accumulator_vec, accumulator_val);
- accumulator_vec[0] += accumulator_vec[1];
- accumulator_vec[2] += accumulator_vec[3];
- accumulator_vec[0] += accumulator_vec[2];
-
- for(ii = quarter_points * 4; ii < num_points; ++ii) {
- accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++));
- }
-
- *result = accumulator_vec[0];
+static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result,
+ const short* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ unsigned ii;
+ unsigned quarter_points = num_points / 4;
+ lv_32fc_t* tapsPtr = (lv_32fc_t*)taps;
+ short* inputPtr = (short*)input;
+ lv_32fc_t accumulator_vec[4];
+
+ float32x4x2_t tapsVal, accumulator_val;
+ int16x4_t input16;
+ int32x4_t input32;
+ float32x4_t input_float, prod_re, prod_im;
+
+ accumulator_val.val[0] = vdupq_n_f32(0.0);
+ accumulator_val.val[1] = vdupq_n_f32(0.0);
+
+ for (ii = 0; ii < quarter_points; ++ii) {
+ tapsVal = vld2q_f32((float*)tapsPtr);
+ input16 = vld1_s16(inputPtr);
+ // widen 16-bit int to 32-bit int
+ input32 = vmovl_s16(input16);
+ // convert 32-bit int to float with scale
+ input_float = vcvtq_f32_s32(input32);
+
+ prod_re = vmulq_f32(input_float, tapsVal.val[0]);
+ prod_im = vmulq_f32(input_float, tapsVal.val[1]);
+
+ accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]);
+ accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]);
+
+ tapsPtr += 4;
+ inputPtr += 4;
+ }
+ vst2q_f32((float*)accumulator_vec, accumulator_val);
+ accumulator_vec[0] += accumulator_vec[1];
+ accumulator_vec[2] += accumulator_vec[3];
+ accumulator_vec[0] += accumulator_vec[2];
+
+ for (ii = quarter_points * 4; ii < num_points; ++ii) {
+ accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++));
+ }
+
+ *result = accumulator_vec[0];
}
#endif /*LV_HAVE_NEON*/
#if LV_HAVE_SSE && LV_HAVE_MMX
-static inline void volk_16i_32fc_dot_prod_32fc_u_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
-
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 8;
-
- float res[2];
- float *realpt = &res[0], *imagpt = &res[1];
- const short* aPtr = input;
- const float* bPtr = (float*)taps;
-
- __m64 m0, m1;
- __m128 f0, f1, f2, f3;
- __m128 a0Val, a1Val, a2Val, a3Val;
- __m128 b0Val, b1Val, b2Val, b3Val;
- __m128 c0Val, c1Val, c2Val, c3Val;
-
- __m128 dotProdVal0 = _mm_setzero_ps();
- __m128 dotProdVal1 = _mm_setzero_ps();
- __m128 dotProdVal2 = _mm_setzero_ps();
- __m128 dotProdVal3 = _mm_setzero_ps();
-
- for(;number < sixteenthPoints; number++){
-
- m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0));
- m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4));
- f0 = _mm_cvtpi16_ps(m0);
- f1 = _mm_cvtpi16_ps(m0);
- f2 = _mm_cvtpi16_ps(m1);
- f3 = _mm_cvtpi16_ps(m1);
-
- a0Val = _mm_unpacklo_ps(f0, f1);
- a1Val = _mm_unpackhi_ps(f0, f1);
- a2Val = _mm_unpacklo_ps(f2, f3);
- a3Val = _mm_unpackhi_ps(f2, f3);
-
- b0Val = _mm_loadu_ps(bPtr);
- b1Val = _mm_loadu_ps(bPtr+4);
- b2Val = _mm_loadu_ps(bPtr+8);
- b3Val = _mm_loadu_ps(bPtr+12);
-
- c0Val = _mm_mul_ps(a0Val, b0Val);
- c1Val = _mm_mul_ps(a1Val, b1Val);
- c2Val = _mm_mul_ps(a2Val, b2Val);
- c3Val = _mm_mul_ps(a3Val, b3Val);
-
- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
-
- aPtr += 8;
- bPtr += 16;
- }
-
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
-
- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
-
- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
- *realpt = dotProductVector[0];
- *imagpt = dotProductVector[1];
- *realpt += dotProductVector[2];
- *imagpt += dotProductVector[3];
-
- number = sixteenthPoints*8;
- for(;number < num_points; number++){
- *realpt += ((*aPtr) * (*bPtr++));
- *imagpt += ((*aPtr++) * (*bPtr++));
- }
-
- *result = *(lv_32fc_t*)(&res[0]);
+static inline void volk_16i_32fc_dot_prod_32fc_u_sse(lv_32fc_t* result,
+ const short* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 8;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const short* aPtr = input;
+ const float* bPtr = (float*)taps;
+
+ __m64 m0, m1;
+ __m128 f0, f1, f2, f3;
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
+ m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
+ f0 = _mm_cvtpi16_ps(m0);
+ f1 = _mm_cvtpi16_ps(m0);
+ f2 = _mm_cvtpi16_ps(m1);
+ f3 = _mm_cvtpi16_ps(m1);
+
+ a0Val = _mm_unpacklo_ps(f0, f1);
+ a1Val = _mm_unpackhi_ps(f0, f1);
+ a2Val = _mm_unpacklo_ps(f2, f3);
+ a3Val = _mm_unpackhi_ps(f2, f3);
+
+ b0Val = _mm_loadu_ps(bPtr);
+ b1Val = _mm_loadu_ps(bPtr + 4);
+ b2Val = _mm_loadu_ps(bPtr + 8);
+ b3Val = _mm_loadu_ps(bPtr + 12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 8;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+ _mm_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+
+ number = sixteenthPoints * 8;
+ for (; number < num_points; number++) {
+ *realpt += ((*aPtr) * (*bPtr++));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
}
#endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
#if LV_HAVE_AVX2 && LV_HAVE_FMA
-static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
-
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- float res[2];
- float *realpt = &res[0], *imagpt = &res[1];
- const short* aPtr = input;
- const float* bPtr = (float*)taps;
-
- __m128i m0, m1;
- __m256i f0, f1;
- __m256 g0, g1, h0, h1, h2, h3;
- __m256 a0Val, a1Val, a2Val, a3Val;
- __m256 b0Val, b1Val, b2Val, b3Val;
-
- __m256 dotProdVal0 = _mm256_setzero_ps();
- __m256 dotProdVal1 = _mm256_setzero_ps();
- __m256 dotProdVal2 = _mm256_setzero_ps();
- __m256 dotProdVal3 = _mm256_setzero_ps();
-
- for(;number < sixteenthPoints; number++){
-
- m0 = _mm_loadu_si128((__m128i const*) aPtr);
- m1 = _mm_loadu_si128((__m128i const*)(aPtr+8));
-
- f0 = _mm256_cvtepi16_epi32(m0);
- g0 = _mm256_cvtepi32_ps(f0);
- f1 = _mm256_cvtepi16_epi32(m1);
- g1 = _mm256_cvtepi32_ps(f1);
-
- h0 = _mm256_unpacklo_ps(g0, g0);
- h1 = _mm256_unpackhi_ps(g0, g0);
- h2 = _mm256_unpacklo_ps(g1, g1);
- h3 = _mm256_unpackhi_ps(g1, g1);
-
- a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
- a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
- a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
- a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
-
- b0Val = _mm256_loadu_ps(bPtr);
- b1Val = _mm256_loadu_ps(bPtr+8);
- b2Val = _mm256_loadu_ps(bPtr+16);
- b3Val = _mm256_loadu_ps(bPtr+24);
-
- dotProdVal0 = _mm256_fmadd_ps(a0Val,b0Val,dotProdVal0);
- dotProdVal1 = _mm256_fmadd_ps(a1Val,b1Val,dotProdVal1);
- dotProdVal2 = _mm256_fmadd_ps(a2Val,b2Val,dotProdVal2);
- dotProdVal3 = _mm256_fmadd_ps(a3Val,b3Val,dotProdVal3);
-
- aPtr += 16;
- bPtr += 32;
- }
-
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
-
- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
-
- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
- *realpt = dotProductVector[0];
- *imagpt = dotProductVector[1];
- *realpt += dotProductVector[2];
- *imagpt += dotProductVector[3];
- *realpt += dotProductVector[4];
- *imagpt += dotProductVector[5];
- *realpt += dotProductVector[6];
- *imagpt += dotProductVector[7];
-
- number = sixteenthPoints*16;
- for(;number < num_points; number++){
- *realpt += ((*aPtr) * (*bPtr++));
- *imagpt += ((*aPtr++) * (*bPtr++));
- }
-
- *result = *(lv_32fc_t*)(&res[0]);
+static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result,
+ const short* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const short* aPtr = input;
+ const float* bPtr = (float*)taps;
+
+ __m128i m0, m1;
+ __m256i f0, f1;
+ __m256 g0, g1, h0, h1, h2, h3;
+ __m256 a0Val, a1Val, a2Val, a3Val;
+ __m256 b0Val, b1Val, b2Val, b3Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 dotProdVal2 = _mm256_setzero_ps();
+ __m256 dotProdVal3 = _mm256_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ m0 = _mm_loadu_si128((__m128i const*)aPtr);
+ m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));
+
+ f0 = _mm256_cvtepi16_epi32(m0);
+ g0 = _mm256_cvtepi32_ps(f0);
+ f1 = _mm256_cvtepi16_epi32(m1);
+ g1 = _mm256_cvtepi32_ps(f1);
+
+ h0 = _mm256_unpacklo_ps(g0, g0);
+ h1 = _mm256_unpackhi_ps(g0, g0);
+ h2 = _mm256_unpacklo_ps(g1, g1);
+ h3 = _mm256_unpackhi_ps(g1, g1);
+
+ a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
+ a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
+ a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
+ a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
+
+ b0Val = _mm256_loadu_ps(bPtr);
+ b1Val = _mm256_loadu_ps(bPtr + 8);
+ b2Val = _mm256_loadu_ps(bPtr + 16);
+ b3Val = _mm256_loadu_ps(bPtr + 24);
+
+ dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
+ dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
+ dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
+ dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 32;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+ *realpt += dotProductVector[4];
+ *imagpt += dotProductVector[5];
+ *realpt += dotProductVector[6];
+ *imagpt += dotProductVector[7];
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *realpt += ((*aPtr) * (*bPtr++));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
}
#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
#ifdef LV_HAVE_AVX2
-static inline void volk_16i_32fc_dot_prod_32fc_u_avx2( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
-
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- float res[2];
- float *realpt = &res[0], *imagpt = &res[1];
- const short* aPtr = input;
- const float* bPtr = (float*)taps;
-
- __m128i m0, m1;
- __m256i f0, f1;
- __m256 g0, g1, h0, h1, h2, h3;
- __m256 a0Val, a1Val, a2Val, a3Val;
- __m256 b0Val, b1Val, b2Val, b3Val;
- __m256 c0Val, c1Val, c2Val, c3Val;
-
- __m256 dotProdVal0 = _mm256_setzero_ps();
- __m256 dotProdVal1 = _mm256_setzero_ps();
- __m256 dotProdVal2 = _mm256_setzero_ps();
- __m256 dotProdVal3 = _mm256_setzero_ps();
-
- for(;number < sixteenthPoints; number++){
-
- m0 = _mm_loadu_si128((__m128i const*) aPtr);
- m1 = _mm_loadu_si128((__m128i const*)(aPtr+8));
-
- f0 = _mm256_cvtepi16_epi32(m0);
- g0 = _mm256_cvtepi32_ps(f0);
- f1 = _mm256_cvtepi16_epi32(m1);
- g1 = _mm256_cvtepi32_ps(f1);
-
- h0 = _mm256_unpacklo_ps(g0, g0);
- h1 = _mm256_unpackhi_ps(g0, g0);
- h2 = _mm256_unpacklo_ps(g1, g1);
- h3 = _mm256_unpackhi_ps(g1, g1);
-
- a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
- a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
- a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
- a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
-
- b0Val = _mm256_loadu_ps(bPtr);
- b1Val = _mm256_loadu_ps(bPtr+8);
- b2Val = _mm256_loadu_ps(bPtr+16);
- b3Val = _mm256_loadu_ps(bPtr+24);
-
- c0Val = _mm256_mul_ps(a0Val, b0Val);
- c1Val = _mm256_mul_ps(a1Val, b1Val);
- c2Val = _mm256_mul_ps(a2Val, b2Val);
- c3Val = _mm256_mul_ps(a3Val, b3Val);
-
- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
- dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
- dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
-
- aPtr += 16;
- bPtr += 32;
- }
-
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
-
- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
-
- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
- *realpt = dotProductVector[0];
- *imagpt = dotProductVector[1];
- *realpt += dotProductVector[2];
- *imagpt += dotProductVector[3];
- *realpt += dotProductVector[4];
- *imagpt += dotProductVector[5];
- *realpt += dotProductVector[6];
- *imagpt += dotProductVector[7];
-
- number = sixteenthPoints*16;
- for(;number < num_points; number++){
- *realpt += ((*aPtr) * (*bPtr++));
- *imagpt += ((*aPtr++) * (*bPtr++));
- }
-
- *result = *(lv_32fc_t*)(&res[0]);
+static inline void volk_16i_32fc_dot_prod_32fc_u_avx2(lv_32fc_t* result,
+ const short* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const short* aPtr = input;
+ const float* bPtr = (float*)taps;
+
+ __m128i m0, m1;
+ __m256i f0, f1;
+ __m256 g0, g1, h0, h1, h2, h3;
+ __m256 a0Val, a1Val, a2Val, a3Val;
+ __m256 b0Val, b1Val, b2Val, b3Val;
+ __m256 c0Val, c1Val, c2Val, c3Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 dotProdVal2 = _mm256_setzero_ps();
+ __m256 dotProdVal3 = _mm256_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ m0 = _mm_loadu_si128((__m128i const*)aPtr);
+ m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));
+
+ f0 = _mm256_cvtepi16_epi32(m0);
+ g0 = _mm256_cvtepi32_ps(f0);
+ f1 = _mm256_cvtepi16_epi32(m1);
+ g1 = _mm256_cvtepi32_ps(f1);
+
+ h0 = _mm256_unpacklo_ps(g0, g0);
+ h1 = _mm256_unpackhi_ps(g0, g0);
+ h2 = _mm256_unpacklo_ps(g1, g1);
+ h3 = _mm256_unpackhi_ps(g1, g1);
+
+ a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
+ a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
+ a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
+ a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
+
+ b0Val = _mm256_loadu_ps(bPtr);
+ b1Val = _mm256_loadu_ps(bPtr + 8);
+ b2Val = _mm256_loadu_ps(bPtr + 16);
+ b3Val = _mm256_loadu_ps(bPtr + 24);
+
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
+ c2Val = _mm256_mul_ps(a2Val, b2Val);
+ c3Val = _mm256_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 32;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+ *realpt += dotProductVector[4];
+ *imagpt += dotProductVector[5];
+ *realpt += dotProductVector[6];
+ *imagpt += dotProductVector[7];
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *realpt += ((*aPtr) * (*bPtr++));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
}
#endif /*LV_HAVE_AVX2*/
#if LV_HAVE_SSE && LV_HAVE_MMX
-static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
-
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 8;
-
- float res[2];
- float *realpt = &res[0], *imagpt = &res[1];
- const short* aPtr = input;
- const float* bPtr = (float*)taps;
-
- __m64 m0, m1;
- __m128 f0, f1, f2, f3;
- __m128 a0Val, a1Val, a2Val, a3Val;
- __m128 b0Val, b1Val, b2Val, b3Val;
- __m128 c0Val, c1Val, c2Val, c3Val;
-
- __m128 dotProdVal0 = _mm_setzero_ps();
- __m128 dotProdVal1 = _mm_setzero_ps();
- __m128 dotProdVal2 = _mm_setzero_ps();
- __m128 dotProdVal3 = _mm_setzero_ps();
-
- for(;number < sixteenthPoints; number++){
-
- m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0));
- m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4));
- f0 = _mm_cvtpi16_ps(m0);
- f1 = _mm_cvtpi16_ps(m0);
- f2 = _mm_cvtpi16_ps(m1);
- f3 = _mm_cvtpi16_ps(m1);
-
- a0Val = _mm_unpacklo_ps(f0, f1);
- a1Val = _mm_unpackhi_ps(f0, f1);
- a2Val = _mm_unpacklo_ps(f2, f3);
- a3Val = _mm_unpackhi_ps(f2, f3);
-
- b0Val = _mm_load_ps(bPtr);
- b1Val = _mm_load_ps(bPtr+4);
- b2Val = _mm_load_ps(bPtr+8);
- b3Val = _mm_load_ps(bPtr+12);
-
- c0Val = _mm_mul_ps(a0Val, b0Val);
- c1Val = _mm_mul_ps(a1Val, b1Val);
- c2Val = _mm_mul_ps(a2Val, b2Val);
- c3Val = _mm_mul_ps(a3Val, b3Val);
-
- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
-
- aPtr += 8;
- bPtr += 16;
- }
-
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
-
- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
-
- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
- *realpt = dotProductVector[0];
- *imagpt = dotProductVector[1];
- *realpt += dotProductVector[2];
- *imagpt += dotProductVector[3];
-
- number = sixteenthPoints*8;
- for(;number < num_points; number++){
- *realpt += ((*aPtr) * (*bPtr++));
- *imagpt += ((*aPtr++) * (*bPtr++));
- }
-
- *result = *(lv_32fc_t*)(&res[0]);
+static inline void volk_16i_32fc_dot_prod_32fc_a_sse(lv_32fc_t* result,
+ const short* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 8;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const short* aPtr = input;
+ const float* bPtr = (float*)taps;
+
+ __m64 m0, m1;
+ __m128 f0, f1, f2, f3;
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
+ m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
+ f0 = _mm_cvtpi16_ps(m0);
+ f1 = _mm_cvtpi16_ps(m0);
+ f2 = _mm_cvtpi16_ps(m1);
+ f3 = _mm_cvtpi16_ps(m1);
+
+ a0Val = _mm_unpacklo_ps(f0, f1);
+ a1Val = _mm_unpackhi_ps(f0, f1);
+ a2Val = _mm_unpacklo_ps(f2, f3);
+ a3Val = _mm_unpackhi_ps(f2, f3);
+
+ b0Val = _mm_load_ps(bPtr);
+ b1Val = _mm_load_ps(bPtr + 4);
+ b2Val = _mm_load_ps(bPtr + 8);
+ b3Val = _mm_load_ps(bPtr + 12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 8;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+ _mm_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+
+ number = sixteenthPoints * 8;
+ for (; number < num_points; number++) {
+ *realpt += ((*aPtr) * (*bPtr++));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
}
#endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
#ifdef LV_HAVE_AVX2
-static inline void volk_16i_32fc_dot_prod_32fc_a_avx2( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
-
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- float res[2];
- float *realpt = &res[0], *imagpt = &res[1];
- const short* aPtr = input;
- const float* bPtr = (float*)taps;
-
- __m128i m0, m1;
- __m256i f0, f1;
- __m256 g0, g1, h0, h1, h2, h3;
- __m256 a0Val, a1Val, a2Val, a3Val;
- __m256 b0Val, b1Val, b2Val, b3Val;
- __m256 c0Val, c1Val, c2Val, c3Val;
-
- __m256 dotProdVal0 = _mm256_setzero_ps();
- __m256 dotProdVal1 = _mm256_setzero_ps();
- __m256 dotProdVal2 = _mm256_setzero_ps();
- __m256 dotProdVal3 = _mm256_setzero_ps();
-
- for(;number < sixteenthPoints; number++){
-
- m0 = _mm_load_si128((__m128i const*) aPtr);
- m1 = _mm_load_si128((__m128i const*)(aPtr+8));
-
- f0 = _mm256_cvtepi16_epi32(m0);
- g0 = _mm256_cvtepi32_ps(f0);
- f1 = _mm256_cvtepi16_epi32(m1);
- g1 = _mm256_cvtepi32_ps(f1);
-
- h0 = _mm256_unpacklo_ps(g0, g0);
- h1 = _mm256_unpackhi_ps(g0, g0);
- h2 = _mm256_unpacklo_ps(g1, g1);
- h3 = _mm256_unpackhi_ps(g1, g1);
-
- a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
- a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
- a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
- a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
-
- b0Val = _mm256_load_ps(bPtr);
- b1Val = _mm256_load_ps(bPtr+8);
- b2Val = _mm256_load_ps(bPtr+16);
- b3Val = _mm256_load_ps(bPtr+24);
-
- c0Val = _mm256_mul_ps(a0Val, b0Val);
- c1Val = _mm256_mul_ps(a1Val, b1Val);
- c2Val = _mm256_mul_ps(a2Val, b2Val);
- c3Val = _mm256_mul_ps(a3Val, b3Val);
-
- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
- dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
- dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
-
- aPtr += 16;
- bPtr += 32;
- }
-
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
-
- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
-
- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
- *realpt = dotProductVector[0];
- *imagpt = dotProductVector[1];
- *realpt += dotProductVector[2];
- *imagpt += dotProductVector[3];
- *realpt += dotProductVector[4];
- *imagpt += dotProductVector[5];
- *realpt += dotProductVector[6];
- *imagpt += dotProductVector[7];
-
- number = sixteenthPoints*16;
- for(;number < num_points; number++){
- *realpt += ((*aPtr) * (*bPtr++));
- *imagpt += ((*aPtr++) * (*bPtr++));
- }
-
- *result = *(lv_32fc_t*)(&res[0]);
+static inline void volk_16i_32fc_dot_prod_32fc_a_avx2(lv_32fc_t* result,
+ const short* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const short* aPtr = input;
+ const float* bPtr = (float*)taps;
+
+ __m128i m0, m1;
+ __m256i f0, f1;
+ __m256 g0, g1, h0, h1, h2, h3;
+ __m256 a0Val, a1Val, a2Val, a3Val;
+ __m256 b0Val, b1Val, b2Val, b3Val;
+ __m256 c0Val, c1Val, c2Val, c3Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 dotProdVal2 = _mm256_setzero_ps();
+ __m256 dotProdVal3 = _mm256_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ m0 = _mm_load_si128((__m128i const*)aPtr);
+ m1 = _mm_load_si128((__m128i const*)(aPtr + 8));
+
+ f0 = _mm256_cvtepi16_epi32(m0);
+ g0 = _mm256_cvtepi32_ps(f0);
+ f1 = _mm256_cvtepi16_epi32(m1);
+ g1 = _mm256_cvtepi32_ps(f1);
+
+ h0 = _mm256_unpacklo_ps(g0, g0);
+ h1 = _mm256_unpackhi_ps(g0, g0);
+ h2 = _mm256_unpacklo_ps(g1, g1);
+ h3 = _mm256_unpackhi_ps(g1, g1);
+
+ a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
+ a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
+ a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
+ a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
+
+ b0Val = _mm256_load_ps(bPtr);
+ b1Val = _mm256_load_ps(bPtr + 8);
+ b2Val = _mm256_load_ps(bPtr + 16);
+ b3Val = _mm256_load_ps(bPtr + 24);
+
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
+ c2Val = _mm256_mul_ps(a2Val, b2Val);
+ c3Val = _mm256_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 32;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+ *realpt += dotProductVector[4];
+ *imagpt += dotProductVector[5];
+ *realpt += dotProductVector[6];
+ *imagpt += dotProductVector[7];
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *realpt += ((*aPtr) * (*bPtr++));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
}
#if LV_HAVE_AVX2 && LV_HAVE_FMA
-static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
-
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- float res[2];
- float *realpt = &res[0], *imagpt = &res[1];
- const short* aPtr = input;
- const float* bPtr = (float*)taps;
-
- __m128i m0, m1;
- __m256i f0, f1;
- __m256 g0, g1, h0, h1, h2, h3;
- __m256 a0Val, a1Val, a2Val, a3Val;
- __m256 b0Val, b1Val, b2Val, b3Val;
-
- __m256 dotProdVal0 = _mm256_setzero_ps();
- __m256 dotProdVal1 = _mm256_setzero_ps();
- __m256 dotProdVal2 = _mm256_setzero_ps();
- __m256 dotProdVal3 = _mm256_setzero_ps();
-
- for(;number < sixteenthPoints; number++){
-
- m0 = _mm_load_si128((__m128i const*) aPtr);
- m1 = _mm_load_si128((__m128i const*)(aPtr+8));
-
- f0 = _mm256_cvtepi16_epi32(m0);
- g0 = _mm256_cvtepi32_ps(f0);
- f1 = _mm256_cvtepi16_epi32(m1);
- g1 = _mm256_cvtepi32_ps(f1);
-
- h0 = _mm256_unpacklo_ps(g0, g0);
- h1 = _mm256_unpackhi_ps(g0, g0);
- h2 = _mm256_unpacklo_ps(g1, g1);
- h3 = _mm256_unpackhi_ps(g1, g1);
-
- a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
- a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
- a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
- a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
-
- b0Val = _mm256_load_ps(bPtr);
- b1Val = _mm256_load_ps(bPtr+8);
- b2Val = _mm256_load_ps(bPtr+16);
- b3Val = _mm256_load_ps(bPtr+24);
-
- dotProdVal0 = _mm256_fmadd_ps(a0Val,b0Val,dotProdVal0);
- dotProdVal1 = _mm256_fmadd_ps(a1Val,b1Val,dotProdVal1);
- dotProdVal2 = _mm256_fmadd_ps(a2Val,b2Val,dotProdVal2);
- dotProdVal3 = _mm256_fmadd_ps(a3Val,b3Val,dotProdVal3);
-
- aPtr += 16;
- bPtr += 32;
- }
-
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
-
- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
-
- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
- *realpt = dotProductVector[0];
- *imagpt = dotProductVector[1];
- *realpt += dotProductVector[2];
- *imagpt += dotProductVector[3];
- *realpt += dotProductVector[4];
- *imagpt += dotProductVector[5];
- *realpt += dotProductVector[6];
- *imagpt += dotProductVector[7];
-
- number = sixteenthPoints*16;
- for(;number < num_points; number++){
- *realpt += ((*aPtr) * (*bPtr++));
- *imagpt += ((*aPtr++) * (*bPtr++));
- }
-
- *result = *(lv_32fc_t*)(&res[0]);
+static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
+ const short* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const short* aPtr = input;
+ const float* bPtr = (float*)taps;
+
+ __m128i m0, m1;
+ __m256i f0, f1;
+ __m256 g0, g1, h0, h1, h2, h3;
+ __m256 a0Val, a1Val, a2Val, a3Val;
+ __m256 b0Val, b1Val, b2Val, b3Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 dotProdVal2 = _mm256_setzero_ps();
+ __m256 dotProdVal3 = _mm256_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ m0 = _mm_load_si128((__m128i const*)aPtr);
+ m1 = _mm_load_si128((__m128i const*)(aPtr + 8));
+
+ f0 = _mm256_cvtepi16_epi32(m0);
+ g0 = _mm256_cvtepi32_ps(f0);
+ f1 = _mm256_cvtepi16_epi32(m1);
+ g1 = _mm256_cvtepi32_ps(f1);
+
+ h0 = _mm256_unpacklo_ps(g0, g0);
+ h1 = _mm256_unpackhi_ps(g0, g0);
+ h2 = _mm256_unpacklo_ps(g1, g1);
+ h3 = _mm256_unpackhi_ps(g1, g1);
+
+ a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
+ a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
+ a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
+ a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
+
+ b0Val = _mm256_load_ps(bPtr);
+ b1Val = _mm256_load_ps(bPtr + 8);
+ b2Val = _mm256_load_ps(bPtr + 16);
+ b3Val = _mm256_load_ps(bPtr + 24);
+
+ dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
+ dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
+ dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
+ dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 32;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+ *realpt += dotProductVector[4];
+ *imagpt += dotProductVector[5];
+ *realpt += dotProductVector[6];
+ *imagpt += dotProductVector[7];
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *realpt += ((*aPtr) * (*bPtr++));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
}
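
For reference, the reformatted AVX2/FMA kernel above (and its scalar tail loop) accumulates each 16-bit sample against the interleaved real/imaginary parts of the matching complex tap. A scalar sketch of that computation — a hypothetical reference function, not part of this patch, assuming the lv_32fc_t/lv_cmake helpers from volk/volk_complex.h — would be:

static inline void dot_prod_16i_32fc_reference(lv_32fc_t* result,
                                               const short* input,
                                               const lv_32fc_t* taps,
                                               unsigned int num_points)
{
    float real = 0.0f, imag = 0.0f;
    const float* t = (const float*)taps; /* interleaved re/im pairs */
    for (unsigned int n = 0; n < num_points; n++) {
        real += input[n] * t[2 * n];     /* real accumulator */
        imag += input[n] * t[2 * n + 1]; /* imaginary accumulator */
    }
    *result = lv_cmake(real, imag); /* pack into one complex value */
}
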
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_16i_branch_4_state_8(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars)
- * \endcode
+ * void volk_16i_branch_4_state_8(short* target, short* src0, char** permuters, short*
+ * cntl2, short* cntl3, short* scalars) \endcode
*
* \b Inputs
* \li src0: <FIXME>
#ifdef LV_HAVE_SSSE3
-#include <xmmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
+#include <xmmintrin.h>
-static inline void
-volk_16i_branch_4_state_8_a_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars)
+static inline void volk_16i_branch_4_state_8_a_ssse3(short* target,
+ short* src0,
+ char** permuters,
+ short* cntl2,
+ short* cntl3,
+ short* scalars)
{
- __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11;
- __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars;
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11;
+ __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars;
- p_target = (__m128i*)target;
- p_src0 = (__m128i*)src0;
- p_cntl2 = (__m128i*)cntl2;
- p_cntl3 = (__m128i*)cntl3;
- p_scalars = (__m128i*)scalars;
+ p_target = (__m128i*)target;
+ p_src0 = (__m128i*)src0;
+ p_cntl2 = (__m128i*)cntl2;
+ p_cntl3 = (__m128i*)cntl3;
+ p_scalars = (__m128i*)scalars;
- xmm0 = _mm_load_si128(p_scalars);
+ xmm0 = _mm_load_si128(p_scalars);
- xmm1 = _mm_shufflelo_epi16(xmm0, 0);
- xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
- xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
- xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
+ xmm1 = _mm_shufflelo_epi16(xmm0, 0);
+ xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
+ xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
+ xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
- xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
- xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
- xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
- xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
+ xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
+ xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
+ xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
+ xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
- xmm0 = _mm_load_si128((__m128i*)permuters[0]);
- xmm6 = _mm_load_si128((__m128i*)permuters[1]);
- xmm8 = _mm_load_si128((__m128i*)permuters[2]);
- xmm10 = _mm_load_si128((__m128i*)permuters[3]);
+ xmm0 = _mm_load_si128((__m128i*)permuters[0]);
+ xmm6 = _mm_load_si128((__m128i*)permuters[1]);
+ xmm8 = _mm_load_si128((__m128i*)permuters[2]);
+ xmm10 = _mm_load_si128((__m128i*)permuters[3]);
- xmm5 = _mm_load_si128(p_src0);
- xmm0 = _mm_shuffle_epi8(xmm5, xmm0);
- xmm6 = _mm_shuffle_epi8(xmm5, xmm6);
- xmm8 = _mm_shuffle_epi8(xmm5, xmm8);
- xmm10 = _mm_shuffle_epi8(xmm5, xmm10);
+ xmm5 = _mm_load_si128(p_src0);
+ xmm0 = _mm_shuffle_epi8(xmm5, xmm0);
+ xmm6 = _mm_shuffle_epi8(xmm5, xmm6);
+ xmm8 = _mm_shuffle_epi8(xmm5, xmm8);
+ xmm10 = _mm_shuffle_epi8(xmm5, xmm10);
- xmm5 = _mm_add_epi16(xmm1, xmm2);
+ xmm5 = _mm_add_epi16(xmm1, xmm2);
- xmm6 = _mm_add_epi16(xmm2, xmm6);
- xmm8 = _mm_add_epi16(xmm1, xmm8);
+ xmm6 = _mm_add_epi16(xmm2, xmm6);
+ xmm8 = _mm_add_epi16(xmm1, xmm8);
- xmm7 = _mm_load_si128(p_cntl2);
- xmm9 = _mm_load_si128(p_cntl3);
+ xmm7 = _mm_load_si128(p_cntl2);
+ xmm9 = _mm_load_si128(p_cntl3);
- xmm0 = _mm_add_epi16(xmm5, xmm0);
+ xmm0 = _mm_add_epi16(xmm5, xmm0);
- xmm7 = _mm_and_si128(xmm7, xmm3);
- xmm9 = _mm_and_si128(xmm9, xmm4);
+ xmm7 = _mm_and_si128(xmm7, xmm3);
+ xmm9 = _mm_and_si128(xmm9, xmm4);
- xmm5 = _mm_load_si128(&p_cntl2[1]);
- xmm11 = _mm_load_si128(&p_cntl3[1]);
+ xmm5 = _mm_load_si128(&p_cntl2[1]);
+ xmm11 = _mm_load_si128(&p_cntl3[1]);
- xmm7 = _mm_add_epi16(xmm7, xmm9);
+ xmm7 = _mm_add_epi16(xmm7, xmm9);
- xmm5 = _mm_and_si128(xmm5, xmm3);
- xmm11 = _mm_and_si128(xmm11, xmm4);
+ xmm5 = _mm_and_si128(xmm5, xmm3);
+ xmm11 = _mm_and_si128(xmm11, xmm4);
- xmm0 = _mm_add_epi16(xmm0, xmm7);
+ xmm0 = _mm_add_epi16(xmm0, xmm7);
- xmm7 = _mm_load_si128(&p_cntl2[2]);
- xmm9 = _mm_load_si128(&p_cntl3[2]);
+ xmm7 = _mm_load_si128(&p_cntl2[2]);
+ xmm9 = _mm_load_si128(&p_cntl3[2]);
- xmm5 = _mm_add_epi16(xmm5, xmm11);
+ xmm5 = _mm_add_epi16(xmm5, xmm11);
- xmm7 = _mm_and_si128(xmm7, xmm3);
- xmm9 = _mm_and_si128(xmm9, xmm4);
+ xmm7 = _mm_and_si128(xmm7, xmm3);
+ xmm9 = _mm_and_si128(xmm9, xmm4);
- xmm6 = _mm_add_epi16(xmm6, xmm5);
+ xmm6 = _mm_add_epi16(xmm6, xmm5);
- xmm5 = _mm_load_si128(&p_cntl2[3]);
- xmm11 = _mm_load_si128(&p_cntl3[3]);
+ xmm5 = _mm_load_si128(&p_cntl2[3]);
+ xmm11 = _mm_load_si128(&p_cntl3[3]);
- xmm7 = _mm_add_epi16(xmm7, xmm9);
+ xmm7 = _mm_add_epi16(xmm7, xmm9);
- xmm5 = _mm_and_si128(xmm5, xmm3);
- xmm11 = _mm_and_si128(xmm11, xmm4);
+ xmm5 = _mm_and_si128(xmm5, xmm3);
+ xmm11 = _mm_and_si128(xmm11, xmm4);
- xmm8 = _mm_add_epi16(xmm8, xmm7);
+ xmm8 = _mm_add_epi16(xmm8, xmm7);
- xmm5 = _mm_add_epi16(xmm5, xmm11);
+ xmm5 = _mm_add_epi16(xmm5, xmm11);
- _mm_store_si128(p_target, xmm0);
- _mm_store_si128(&p_target[1], xmm6);
+ _mm_store_si128(p_target, xmm0);
+ _mm_store_si128(&p_target[1], xmm6);
- xmm10 = _mm_add_epi16(xmm5, xmm10);
+ xmm10 = _mm_add_epi16(xmm5, xmm10);
- _mm_store_si128(&p_target[2], xmm8);
+ _mm_store_si128(&p_target[2], xmm8);
- _mm_store_si128(&p_target[3], xmm10);
+ _mm_store_si128(&p_target[3], xmm10);
}
#endif /*LV_HAVE_SSSE3*/
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_16i_branch_4_state_8_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars)
+static inline void volk_16i_branch_4_state_8_generic(short* target,
+ short* src0,
+ char** permuters,
+ short* cntl2,
+ short* cntl3,
+ short* scalars)
{
- int i = 0;
-
- int bound = 4;
-
- for(; i < bound; ++i) {
- target[i* 8] = src0[((char)permuters[i][0])/2]
- + ((i + 1)%2 * scalars[0])
- + (((i >> 1)^1) * scalars[1])
- + (cntl2[i * 8] & scalars[2])
- + (cntl3[i * 8] & scalars[3]);
- target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2]
- + ((i + 1)%2 * scalars[0])
- + (((i >> 1)^1) * scalars[1])
- + (cntl2[i * 8 + 1] & scalars[2])
- + (cntl3[i * 8 + 1] & scalars[3]);
- target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2]
- + ((i + 1)%2 * scalars[0])
- + (((i >> 1)^1) * scalars[1])
- + (cntl2[i * 8 + 2] & scalars[2])
- + (cntl3[i * 8 + 2] & scalars[3]);
- target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2]
- + ((i + 1)%2 * scalars[0])
- + (((i >> 1)^1) * scalars[1])
- + (cntl2[i * 8 + 3] & scalars[2])
- + (cntl3[i * 8 + 3] & scalars[3]);
- target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2]
- + ((i + 1)%2 * scalars[0])
- + (((i >> 1)^1) * scalars[1])
- + (cntl2[i * 8 + 4] & scalars[2])
- + (cntl3[i * 8 + 4] & scalars[3]);
- target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2]
- + ((i + 1)%2 * scalars[0])
- + (((i >> 1)^1) * scalars[1])
- + (cntl2[i * 8 + 5] & scalars[2])
- + (cntl3[i * 8 + 5] & scalars[3]);
- target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2]
- + ((i + 1)%2 * scalars[0])
- + (((i >> 1)^1) * scalars[1])
- + (cntl2[i * 8 + 6] & scalars[2])
- + (cntl3[i * 8 + 6] & scalars[3]);
- target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2]
- + ((i + 1)%2 * scalars[0])
- + (((i >> 1)^1) * scalars[1])
- + (cntl2[i * 8 + 7] & scalars[2])
- + (cntl3[i * 8 + 7] & scalars[3]);
- }
+ int i = 0;
+
+ int bound = 4;
+
+ for (; i < bound; ++i) {
+ target[i * 8] = src0[((char)permuters[i][0]) / 2] + ((i + 1) % 2 * scalars[0]) +
+ (((i >> 1) ^ 1) * scalars[1]) + (cntl2[i * 8] & scalars[2]) +
+ (cntl3[i * 8] & scalars[3]);
+ target[i * 8 + 1] = src0[((char)permuters[i][1 * 2]) / 2] +
+ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
+ (cntl2[i * 8 + 1] & scalars[2]) +
+ (cntl3[i * 8 + 1] & scalars[3]);
+ target[i * 8 + 2] = src0[((char)permuters[i][2 * 2]) / 2] +
+ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
+ (cntl2[i * 8 + 2] & scalars[2]) +
+ (cntl3[i * 8 + 2] & scalars[3]);
+ target[i * 8 + 3] = src0[((char)permuters[i][3 * 2]) / 2] +
+ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
+ (cntl2[i * 8 + 3] & scalars[2]) +
+ (cntl3[i * 8 + 3] & scalars[3]);
+ target[i * 8 + 4] = src0[((char)permuters[i][4 * 2]) / 2] +
+ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
+ (cntl2[i * 8 + 4] & scalars[2]) +
+ (cntl3[i * 8 + 4] & scalars[3]);
+ target[i * 8 + 5] = src0[((char)permuters[i][5 * 2]) / 2] +
+ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
+ (cntl2[i * 8 + 5] & scalars[2]) +
+ (cntl3[i * 8 + 5] & scalars[3]);
+ target[i * 8 + 6] = src0[((char)permuters[i][6 * 2]) / 2] +
+ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
+ (cntl2[i * 8 + 6] & scalars[2]) +
+ (cntl3[i * 8 + 6] & scalars[3]);
+ target[i * 8 + 7] = src0[((char)permuters[i][7 * 2]) / 2] +
+ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
+ (cntl2[i * 8 + 7] & scalars[2]) +
+ (cntl3[i * 8 + 7] & scalars[3]);
+ }
}
#endif /*LV_HAVE_GENERIC*/
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_16i_convert_8i(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
- * \endcode
+ * void volk_16i_convert_8i(int8_t* outputVector, const int16_t* inputVector, unsigned int
+ * num_points) \endcode
*
* \b Inputs
* \li inputVector: The input vector of 16-bit shorts.
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_16i_convert_8i_u_avx2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
+static inline void volk_16i_convert_8i_u_avx2(int8_t* outputVector,
+ const int16_t* inputVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int thirtysecondPoints = num_points / 32;
+ unsigned int number = 0;
+ const unsigned int thirtysecondPoints = num_points / 32;
- int8_t* outputVectorPtr = outputVector;
- int16_t* inputPtr = (int16_t*)inputVector;
- __m256i inputVal1;
- __m256i inputVal2;
- __m256i ret;
+ int8_t* outputVectorPtr = outputVector;
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m256i inputVal1;
+ __m256i inputVal2;
+ __m256i ret;
- for(;number < thirtysecondPoints; number++){
+ for (; number < thirtysecondPoints; number++) {
- // Load the 16 values
- inputVal1 = _mm256_loadu_si256((__m256i*)inputPtr); inputPtr += 16;
- inputVal2 = _mm256_loadu_si256((__m256i*)inputPtr); inputPtr += 16;
+ // Load the 16 values
+ inputVal1 = _mm256_loadu_si256((__m256i*)inputPtr);
+ inputPtr += 16;
+ inputVal2 = _mm256_loadu_si256((__m256i*)inputPtr);
+ inputPtr += 16;
- inputVal1 = _mm256_srai_epi16(inputVal1, 8);
- inputVal2 = _mm256_srai_epi16(inputVal2, 8);
+ inputVal1 = _mm256_srai_epi16(inputVal1, 8);
+ inputVal2 = _mm256_srai_epi16(inputVal2, 8);
- ret = _mm256_packs_epi16(inputVal1, inputVal2);
- ret = _mm256_permute4x64_epi64(ret, 0b11011000);
+ ret = _mm256_packs_epi16(inputVal1, inputVal2);
+ ret = _mm256_permute4x64_epi64(ret, 0b11011000);
- _mm256_storeu_si256((__m256i*)outputVectorPtr, ret);
+ _mm256_storeu_si256((__m256i*)outputVectorPtr, ret);
- outputVectorPtr += 32;
- }
+ outputVectorPtr += 32;
+ }
- number = thirtysecondPoints * 32;
- for(; number < num_points; number++){
- outputVector[number] =(int8_t)(inputVector[number] >> 8);
- }
+ number = thirtysecondPoints * 32;
+ for (; number < num_points; number++) {
+ outputVector[number] = (int8_t)(inputVector[number] >> 8);
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void
-volk_16i_convert_8i_u_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
+static inline void volk_16i_convert_8i_u_sse2(int8_t* outputVector,
+ const int16_t* inputVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
- int8_t* outputVectorPtr = outputVector;
- int16_t* inputPtr = (int16_t*)inputVector;
- __m128i inputVal1;
- __m128i inputVal2;
- __m128i ret;
+ int8_t* outputVectorPtr = outputVector;
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal1;
+ __m128i inputVal2;
+ __m128i ret;
- for(;number < sixteenthPoints; number++){
+ for (; number < sixteenthPoints; number++) {
- // Load the 16 values
- inputVal1 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8;
- inputVal2 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8;
+ // Load the 16 values
+ inputVal1 = _mm_loadu_si128((__m128i*)inputPtr);
+ inputPtr += 8;
+ inputVal2 = _mm_loadu_si128((__m128i*)inputPtr);
+ inputPtr += 8;
- inputVal1 = _mm_srai_epi16(inputVal1, 8);
- inputVal2 = _mm_srai_epi16(inputVal2, 8);
+ inputVal1 = _mm_srai_epi16(inputVal1, 8);
+ inputVal2 = _mm_srai_epi16(inputVal2, 8);
- ret = _mm_packs_epi16(inputVal1, inputVal2);
+ ret = _mm_packs_epi16(inputVal1, inputVal2);
- _mm_storeu_si128((__m128i*)outputVectorPtr, ret);
+ _mm_storeu_si128((__m128i*)outputVectorPtr, ret);
- outputVectorPtr += 16;
- }
+ outputVectorPtr += 16;
+ }
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- outputVector[number] =(int8_t)(inputVector[number] >> 8);
- }
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ outputVector[number] = (int8_t)(inputVector[number] >> 8);
+ }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_16i_convert_8i_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
+static inline void volk_16i_convert_8i_generic(int8_t* outputVector,
+ const int16_t* inputVector,
+ unsigned int num_points)
{
- int8_t* outputVectorPtr = outputVector;
- const int16_t* inputVectorPtr = inputVector;
- unsigned int number = 0;
+ int8_t* outputVectorPtr = outputVector;
+ const int16_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
- }
+ for (number = 0; number < num_points; number++) {
+ *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
+ }
}
#endif /* LV_HAVE_GENERIC */
-
-
#endif /* INCLUDED_volk_16i_convert_8i_u_H */
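
Both the unaligned paths above and the aligned paths below implement the same conversion: keep the high byte of each sample via an arithmetic shift right by 8. A minimal usage sketch of the public dispatcher, assuming the usual volk.h, volk_malloc() and volk_get_alignment() entry points:

#include <volk/volk.h>

void convert_8i_example(void)
{
    unsigned int num_points = 1024;
    size_t alignment = volk_get_alignment();
    int16_t* in = (int16_t*)volk_malloc(num_points * sizeof(int16_t), alignment);
    int8_t* out = (int8_t*)volk_malloc(num_points * sizeof(int8_t), alignment);

    for (unsigned int i = 0; i < num_points; i++)
        in[i] = (int16_t)(i << 5); /* arbitrary 16-bit samples */

    volk_16i_convert_8i(out, in, num_points); /* out[i] == (int8_t)(in[i] >> 8) */

    volk_free(in);
    volk_free(out);
}
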
#ifndef INCLUDED_volk_16i_convert_8i_a_H
#define INCLUDED_volk_16i_convert_8i_a_H
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_16i_convert_8i_a_avx2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
+static inline void volk_16i_convert_8i_a_avx2(int8_t* outputVector,
+ const int16_t* inputVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int thirtysecondPoints = num_points / 32;
+ unsigned int number = 0;
+ const unsigned int thirtysecondPoints = num_points / 32;
- int8_t* outputVectorPtr = outputVector;
- int16_t* inputPtr = (int16_t*)inputVector;
- __m256i inputVal1;
- __m256i inputVal2;
- __m256i ret;
+ int8_t* outputVectorPtr = outputVector;
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m256i inputVal1;
+ __m256i inputVal2;
+ __m256i ret;
- for(;number < thirtysecondPoints; number++){
+ for (; number < thirtysecondPoints; number++) {
- // Load the 16 values
- inputVal1 = _mm256_load_si256((__m256i*)inputPtr); inputPtr += 16;
- inputVal2 = _mm256_load_si256((__m256i*)inputPtr); inputPtr += 16;
+ // Load the 16 values
+ inputVal1 = _mm256_load_si256((__m256i*)inputPtr);
+ inputPtr += 16;
+ inputVal2 = _mm256_load_si256((__m256i*)inputPtr);
+ inputPtr += 16;
- inputVal1 = _mm256_srai_epi16(inputVal1, 8);
- inputVal2 = _mm256_srai_epi16(inputVal2, 8);
+ inputVal1 = _mm256_srai_epi16(inputVal1, 8);
+ inputVal2 = _mm256_srai_epi16(inputVal2, 8);
- ret = _mm256_packs_epi16(inputVal1, inputVal2);
- ret = _mm256_permute4x64_epi64(ret, 0b11011000);
+ ret = _mm256_packs_epi16(inputVal1, inputVal2);
+ ret = _mm256_permute4x64_epi64(ret, 0b11011000);
- _mm256_store_si256((__m256i*)outputVectorPtr, ret);
+ _mm256_store_si256((__m256i*)outputVectorPtr, ret);
- outputVectorPtr += 32;
- }
+ outputVectorPtr += 32;
+ }
- number = thirtysecondPoints * 32;
- for(; number < num_points; number++){
- outputVector[number] =(int8_t)(inputVector[number] >> 8);
- }
+ number = thirtysecondPoints * 32;
+ for (; number < num_points; number++) {
+ outputVector[number] = (int8_t)(inputVector[number] >> 8);
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void
-volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
+static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector,
+ const int16_t* inputVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
- int8_t* outputVectorPtr = outputVector;
- int16_t* inputPtr = (int16_t*)inputVector;
- __m128i inputVal1;
- __m128i inputVal2;
- __m128i ret;
+ int8_t* outputVectorPtr = outputVector;
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal1;
+ __m128i inputVal2;
+ __m128i ret;
- for(;number < sixteenthPoints; number++){
+ for (; number < sixteenthPoints; number++) {
- // Load the 16 values
- inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
- inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
+ // Load the 16 values
+ inputVal1 = _mm_load_si128((__m128i*)inputPtr);
+ inputPtr += 8;
+ inputVal2 = _mm_load_si128((__m128i*)inputPtr);
+ inputPtr += 8;
- inputVal1 = _mm_srai_epi16(inputVal1, 8);
- inputVal2 = _mm_srai_epi16(inputVal2, 8);
+ inputVal1 = _mm_srai_epi16(inputVal1, 8);
+ inputVal2 = _mm_srai_epi16(inputVal2, 8);
- ret = _mm_packs_epi16(inputVal1, inputVal2);
+ ret = _mm_packs_epi16(inputVal1, inputVal2);
- _mm_store_si128((__m128i*)outputVectorPtr, ret);
+ _mm_store_si128((__m128i*)outputVectorPtr, ret);
- outputVectorPtr += 16;
- }
+ outputVectorPtr += 16;
+ }
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- outputVector[number] =(int8_t)(inputVector[number] >> 8);
- }
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ outputVector[number] = (int8_t)(inputVector[number] >> 8);
+ }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_16i_convert_8i_neon(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
+static inline void volk_16i_convert_8i_neon(int8_t* outputVector,
+ const int16_t* inputVector,
+ unsigned int num_points)
{
- int8_t* outputVectorPtr = outputVector;
- const int16_t* inputVectorPtr = inputVector;
- unsigned int number = 0;
- unsigned int sixteenth_points = num_points / 16;
-
- int16x8_t inputVal0;
- int16x8_t inputVal1;
- int8x8_t outputVal0;
- int8x8_t outputVal1;
- int8x16_t outputVal;
-
- for(number = 0; number < sixteenth_points; number++){
- // load two input vectors
- inputVal0 = vld1q_s16(inputVectorPtr);
- inputVal1 = vld1q_s16(inputVectorPtr+8);
- // shift right
- outputVal0 = vshrn_n_s16(inputVal0, 8);
- outputVal1 = vshrn_n_s16(inputVal1, 8);
- // squash two vectors and write output
- outputVal = vcombine_s8(outputVal0, outputVal1);
- vst1q_s8(outputVectorPtr, outputVal);
- inputVectorPtr += 16;
- outputVectorPtr += 16;
- }
-
- for(number = sixteenth_points * 16; number < num_points; number++){
- *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
- }
+ int8_t* outputVectorPtr = outputVector;
+ const int16_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ unsigned int sixteenth_points = num_points / 16;
+
+ int16x8_t inputVal0;
+ int16x8_t inputVal1;
+ int8x8_t outputVal0;
+ int8x8_t outputVal1;
+ int8x16_t outputVal;
+
+ for (number = 0; number < sixteenth_points; number++) {
+ // load two input vectors
+ inputVal0 = vld1q_s16(inputVectorPtr);
+ inputVal1 = vld1q_s16(inputVectorPtr + 8);
+ // shift right
+ outputVal0 = vshrn_n_s16(inputVal0, 8);
+ outputVal1 = vshrn_n_s16(inputVal1, 8);
+ // squash two vectors and write output
+ outputVal = vcombine_s8(outputVal0, outputVal1);
+ vst1q_s8(outputVectorPtr, outputVal);
+ inputVectorPtr += 16;
+ outputVectorPtr += 16;
+ }
+
+ for (number = sixteenth_points * 16; number < num_points; number++) {
+ *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_16i_convert_8i_a_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
+static inline void volk_16i_convert_8i_a_generic(int8_t* outputVector,
+ const int16_t* inputVector,
+ unsigned int num_points)
{
- int8_t* outputVectorPtr = outputVector;
- const int16_t* inputVectorPtr = inputVector;
- unsigned int number = 0;
+ int8_t* outputVectorPtr = outputVector;
+ const int16_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
- }
+ for (number = 0; number < num_points; number++) {
+ *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifndef INCLUDED_volk_16i_max_star_16i_a_H
#define INCLUDED_volk_16i_max_star_16i_a_H
-#include<inttypes.h>
-#include<stdio.h>
+#include <inttypes.h>
+#include <stdio.h>
#ifdef LV_HAVE_SSSE3
-#include<xmmintrin.h>
-#include<emmintrin.h>
-#include<tmmintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <xmmintrin.h>
static inline void
volk_16i_max_star_16i_a_ssse3(short* target, short* src0, unsigned int num_points)
{
- const unsigned int num_bytes = num_points*2;
+ const unsigned int num_bytes = num_points * 2;
- short candidate = src0[0];
- short cands[8];
- __m128i xmm0, xmm1, xmm3, xmm4, xmm5, xmm6;
+ short candidate = src0[0];
+ short cands[8];
+ __m128i xmm0, xmm1, xmm3, xmm4, xmm5, xmm6;
- __m128i *p_src0;
+ __m128i* p_src0;
- p_src0 = (__m128i*)src0;
+ p_src0 = (__m128i*)src0;
- int bound = num_bytes >> 4;
- int leftovers = (num_bytes >> 1) & 7;
+ int bound = num_bytes >> 4;
+ int leftovers = (num_bytes >> 1) & 7;
- int i = 0;
+ int i = 0;
- xmm1 = _mm_setzero_si128();
- xmm0 = _mm_setzero_si128();
- //_mm_insert_epi16(xmm0, candidate, 0);
+ xmm1 = _mm_setzero_si128();
+ xmm0 = _mm_setzero_si128();
+ //_mm_insert_epi16(xmm0, candidate, 0);
- xmm0 = _mm_shuffle_epi8(xmm0, xmm1);
+ xmm0 = _mm_shuffle_epi8(xmm0, xmm1);
- for(i = 0; i < bound; ++i) {
- xmm1 = _mm_load_si128(p_src0);
- p_src0 += 1;
- //xmm2 = _mm_sub_epi16(xmm1, xmm0);
+ for (i = 0; i < bound; ++i) {
+ xmm1 = _mm_load_si128(p_src0);
+ p_src0 += 1;
+ // xmm2 = _mm_sub_epi16(xmm1, xmm0);
- xmm3 = _mm_cmpgt_epi16(xmm0, xmm1);
- xmm4 = _mm_cmpeq_epi16(xmm0, xmm1);
- xmm5 = _mm_cmpgt_epi16(xmm1, xmm0);
+ xmm3 = _mm_cmpgt_epi16(xmm0, xmm1);
+ xmm4 = _mm_cmpeq_epi16(xmm0, xmm1);
+ xmm5 = _mm_cmpgt_epi16(xmm1, xmm0);
- xmm6 = _mm_xor_si128(xmm4, xmm5);
+ xmm6 = _mm_xor_si128(xmm4, xmm5);
- xmm3 = _mm_and_si128(xmm3, xmm0);
- xmm4 = _mm_and_si128(xmm6, xmm1);
+ xmm3 = _mm_and_si128(xmm3, xmm0);
+ xmm4 = _mm_and_si128(xmm6, xmm1);
- xmm0 = _mm_add_epi16(xmm3, xmm4);
- }
+ xmm0 = _mm_add_epi16(xmm3, xmm4);
+ }
- _mm_store_si128((__m128i*)cands, xmm0);
+ _mm_store_si128((__m128i*)cands, xmm0);
- for(i = 0; i < 8; ++i) {
- candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i];
- }
+ for (i = 0; i < 8; ++i) {
+ candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i];
+ }
- for(i = 0; i < leftovers; ++i) {
- candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i];
- }
+ for (i = 0; i < leftovers; ++i) {
+ candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0)
+ ? candidate
+ : src0[(bound << 3) + i];
+ }
- target[0] = candidate;
+ target[0] = candidate;
}
#endif /*LV_HAVE_SSSE3*/
static inline void
volk_16i_max_star_16i_neon(short* target, short* src0, unsigned int num_points)
{
- const unsigned int eighth_points = num_points / 8;
- unsigned number;
- int16x8_t input_vec;
- int16x8_t diff, zeros;
- uint16x8_t comp1, comp2;
- zeros = vdupq_n_s16(0);
-
- int16x8x2_t tmpvec;
-
- int16x8_t candidate_vec = vld1q_dup_s16(src0 );
- short candidate;
- ++src0;
-
- for(number=0; number < eighth_points; ++number) {
- input_vec = vld1q_s16(src0);
- __VOLK_PREFETCH(src0+16);
- diff = vsubq_s16(candidate_vec, input_vec);
- comp1 = vcgeq_s16(diff, zeros);
- comp2 = vcltq_s16(diff, zeros);
-
- tmpvec.val[0] = vandq_s16(candidate_vec, (int16x8_t)comp1);
- tmpvec.val[1] = vandq_s16(input_vec, (int16x8_t)comp2);
-
- candidate_vec = vaddq_s16(tmpvec.val[0], tmpvec.val[1]);
- src0 += 8;
- }
- vst1q_s16(&candidate, candidate_vec);
-
- for(number=0; number < num_points%8; number++) {
- candidate = ((int16_t)(candidate - src0[number]) > 0) ? candidate : src0[number];
- }
- target[0] = candidate;
+ const unsigned int eighth_points = num_points / 8;
+ unsigned number;
+ int16x8_t input_vec;
+ int16x8_t diff, zeros;
+ uint16x8_t comp1, comp2;
+ zeros = vdupq_n_s16(0);
+
+ int16x8x2_t tmpvec;
+
+ int16x8_t candidate_vec = vld1q_dup_s16(src0);
+ short candidate;
+ ++src0;
+
+ for (number = 0; number < eighth_points; ++number) {
+ input_vec = vld1q_s16(src0);
+ __VOLK_PREFETCH(src0 + 16);
+ diff = vsubq_s16(candidate_vec, input_vec);
+ comp1 = vcgeq_s16(diff, zeros);
+ comp2 = vcltq_s16(diff, zeros);
+
+ tmpvec.val[0] = vandq_s16(candidate_vec, (int16x8_t)comp1);
+ tmpvec.val[1] = vandq_s16(input_vec, (int16x8_t)comp2);
+
+ candidate_vec = vaddq_s16(tmpvec.val[0], tmpvec.val[1]);
+ src0 += 8;
+ }
+ vst1q_s16(&candidate, candidate_vec);
+
+ for (number = 0; number < num_points % 8; number++) {
+ candidate = ((int16_t)(candidate - src0[number]) > 0) ? candidate : src0[number];
+ }
+ target[0] = candidate;
}
#endif /*LV_HAVE_NEON*/
static inline void
volk_16i_max_star_16i_generic(short* target, short* src0, unsigned int num_points)
{
- const unsigned int num_bytes = num_points*2;
+ const unsigned int num_bytes = num_points * 2;
- int i = 0;
+ int i = 0;
- int bound = num_bytes >> 1;
+ int bound = num_bytes >> 1;
- short candidate = src0[0];
- for(i = 1; i < bound; ++i) {
- candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i];
- }
- target[0] = candidate;
+ short candidate = src0[0];
+ for (i = 1; i < bound; ++i) {
+ candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i];
+ }
+ target[0] = candidate;
}
#endif /*LV_HAVE_GENERIC*/
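
All three implementations above reduce src0 to its single largest element and write it to target[0]. A hedged usage sketch of the dispatcher, with the same assumed allocation helpers as the earlier example:

#include <volk/volk.h>

void max_star_example(void)
{
    unsigned int num_points = 64;
    short* vec = (short*)volk_malloc(num_points * sizeof(short), volk_get_alignment());
    short result;

    for (unsigned int i = 0; i < num_points; i++)
        vec[i] = (short)(3 * i - 50); /* arbitrary values; the last one is largest */

    volk_16i_max_star_16i(&result, vec, num_points); /* result == 3 * 63 - 50 == 139 */

    volk_free(vec);
}
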
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_16i_max_star_horizontal_16i(short* target, short* src0, unsigned int num_points);
- * \endcode
+ * void volk_16i_max_star_horizontal_16i(short* target, short* src0, unsigned int
+ * num_points); \endcode
*
* \b Inputs
* \li src0: The input vector.
#include <volk/volk_common.h>
-#include<inttypes.h>
-#include<stdio.h>
+#include <inttypes.h>
+#include <stdio.h>
#ifdef LV_HAVE_SSSE3
-#include<xmmintrin.h>
-#include<emmintrin.h>
-#include<tmmintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <xmmintrin.h>
-static inline void
-volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, int16_t* src0, unsigned int num_points)
+static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target,
+ int16_t* src0,
+ unsigned int num_points)
{
- const unsigned int num_bytes = num_points*2;
+ const unsigned int num_bytes = num_points * 2;
- static const uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
- static const uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00,
- 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d};
- static const uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
- static const uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02,
- 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02};
+ static const uint8_t shufmask0[16] = {
+ 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ };
+ static const uint8_t shufmask1[16] = {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d
+ };
+ static const uint8_t andmask0[16] = {
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ };
+ static const uint8_t andmask1[16] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02
+ };
- __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
- __m128i xmm5, xmm6, xmm7, xmm8;
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
+ __m128i xmm5, xmm6, xmm7, xmm8;
- xmm4 = _mm_load_si128((__m128i*)shufmask0);
- xmm5 = _mm_load_si128((__m128i*)shufmask1);
- xmm6 = _mm_load_si128((__m128i*)andmask0);
- xmm7 = _mm_load_si128((__m128i*)andmask1);
+ xmm4 = _mm_load_si128((__m128i*)shufmask0);
+ xmm5 = _mm_load_si128((__m128i*)shufmask1);
+ xmm6 = _mm_load_si128((__m128i*)andmask0);
+ xmm7 = _mm_load_si128((__m128i*)andmask1);
- __m128i *p_target, *p_src0;
+ __m128i *p_target, *p_src0;
- p_target = (__m128i*)target;
- p_src0 = (__m128i*)src0;
+ p_target = (__m128i*)target;
+ p_src0 = (__m128i*)src0;
- int bound = num_bytes >> 5;
- int intermediate = (num_bytes >> 4) & 1;
- int leftovers = (num_bytes >> 1) & 7;
+ int bound = num_bytes >> 5;
+ int intermediate = (num_bytes >> 4) & 1;
+ int leftovers = (num_bytes >> 1) & 7;
- int i = 0;
+ int i = 0;
- for(i = 0; i < bound; ++i) {
- xmm0 = _mm_load_si128(p_src0);
- xmm1 = _mm_load_si128(&p_src0[1]);
+ for (i = 0; i < bound; ++i) {
+ xmm0 = _mm_load_si128(p_src0);
+ xmm1 = _mm_load_si128(&p_src0[1]);
- xmm2 = _mm_xor_si128(xmm2, xmm2);
- p_src0 += 2;
+ xmm2 = _mm_xor_si128(xmm2, xmm2);
+ p_src0 += 2;
- xmm3 = _mm_hsub_epi16(xmm0, xmm1);
+ xmm3 = _mm_hsub_epi16(xmm0, xmm1);
- xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
+ xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
- xmm8 = _mm_and_si128(xmm2, xmm6);
- xmm3 = _mm_and_si128(xmm2, xmm7);
+ xmm8 = _mm_and_si128(xmm2, xmm6);
+ xmm3 = _mm_and_si128(xmm2, xmm7);
- xmm8 = _mm_add_epi8(xmm8, xmm4);
- xmm3 = _mm_add_epi8(xmm3, xmm5);
+ xmm8 = _mm_add_epi8(xmm8, xmm4);
+ xmm3 = _mm_add_epi8(xmm3, xmm5);
- xmm0 = _mm_shuffle_epi8(xmm0, xmm8);
- xmm1 = _mm_shuffle_epi8(xmm1, xmm3);
+ xmm0 = _mm_shuffle_epi8(xmm0, xmm8);
+ xmm1 = _mm_shuffle_epi8(xmm1, xmm3);
- xmm3 = _mm_add_epi16(xmm0, xmm1);
+ xmm3 = _mm_add_epi16(xmm0, xmm1);
- _mm_store_si128(p_target, xmm3);
+ _mm_store_si128(p_target, xmm3);
- p_target += 1;
- }
+ p_target += 1;
+ }
- if (intermediate) {
- xmm0 = _mm_load_si128(p_src0);
+ if (intermediate) {
+ xmm0 = _mm_load_si128(p_src0);
- xmm2 = _mm_xor_si128(xmm2, xmm2);
- p_src0 += 1;
+ xmm2 = _mm_xor_si128(xmm2, xmm2);
+ p_src0 += 1;
- xmm3 = _mm_hsub_epi16(xmm0, xmm1);
- xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
+ xmm3 = _mm_hsub_epi16(xmm0, xmm1);
+ xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
- xmm8 = _mm_and_si128(xmm2, xmm6);
+ xmm8 = _mm_and_si128(xmm2, xmm6);
- xmm3 = _mm_add_epi8(xmm8, xmm4);
+ xmm3 = _mm_add_epi8(xmm8, xmm4);
- xmm0 = _mm_shuffle_epi8(xmm0, xmm3);
+ xmm0 = _mm_shuffle_epi8(xmm0, xmm3);
- _mm_storel_pd((double*)p_target, bit128_p(&xmm0)->double_vec);
+ _mm_storel_pd((double*)p_target, bit128_p(&xmm0)->double_vec);
- p_target = (__m128i*)((int8_t*)p_target + 8);
- }
+ p_target = (__m128i*)((int8_t*)p_target + 8);
+ }
- for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) {
- target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
- }
+ for (i = (bound << 4) + (intermediate << 3);
+ i < (bound << 4) + (intermediate << 3) + leftovers;
+ i += 2) {
+ target[i >> 1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
+ }
}
#endif /*LV_HAVE_SSSE3*/
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_16i_max_star_horizontal_16i_neon(int16_t* target, int16_t* src0, unsigned int num_points)
+static inline void volk_16i_max_star_horizontal_16i_neon(int16_t* target,
+ int16_t* src0,
+ unsigned int num_points)
{
- const unsigned int eighth_points = num_points / 16;
- unsigned number;
- int16x8x2_t input_vec;
- int16x8_t diff, max_vec, zeros;
- uint16x8_t comp1, comp2;
- zeros = vdupq_n_s16(0);
- for(number=0; number < eighth_points; ++number) {
- input_vec = vld2q_s16(src0);
- //__VOLK_PREFETCH(src0+16);
- diff = vsubq_s16(input_vec.val[0], input_vec.val[1]);
- comp1 = vcgeq_s16(diff, zeros);
- comp2 = vcltq_s16(diff, zeros);
-
- input_vec.val[0] = vandq_s16(input_vec.val[0], (int16x8_t)comp1);
- input_vec.val[1] = vandq_s16(input_vec.val[1], (int16x8_t)comp2);
-
- max_vec = vaddq_s16(input_vec.val[0], input_vec.val[1]);
- vst1q_s16(target, max_vec);
- src0 += 16;
- target += 8;
- }
- for(number=0; number < num_points%16; number+=2) {
- target[number >> 1] = ((int16_t)(src0[number] - src0[number + 1]) > 0) ? src0[number] : src0[number+1];
- }
-
+ const unsigned int eighth_points = num_points / 16;
+ unsigned number;
+ int16x8x2_t input_vec;
+ int16x8_t diff, max_vec, zeros;
+ uint16x8_t comp1, comp2;
+ zeros = vdupq_n_s16(0);
+ for (number = 0; number < eighth_points; ++number) {
+ input_vec = vld2q_s16(src0);
+ //__VOLK_PREFETCH(src0+16);
+ diff = vsubq_s16(input_vec.val[0], input_vec.val[1]);
+ comp1 = vcgeq_s16(diff, zeros);
+ comp2 = vcltq_s16(diff, zeros);
+
+ input_vec.val[0] = vandq_s16(input_vec.val[0], (int16x8_t)comp1);
+ input_vec.val[1] = vandq_s16(input_vec.val[1], (int16x8_t)comp2);
+
+ max_vec = vaddq_s16(input_vec.val[0], input_vec.val[1]);
+ vst1q_s16(target, max_vec);
+ src0 += 16;
+ target += 8;
+ }
+ for (number = 0; number < num_points % 16; number += 2) {
+ target[number >> 1] = ((int16_t)(src0[number] - src0[number + 1]) > 0)
+ ? src0[number]
+ : src0[number + 1];
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_NEONV7
-extern void volk_16i_max_star_horizontal_16i_a_neonasm(int16_t* target, int16_t* src0, unsigned int num_points);
+extern void volk_16i_max_star_horizontal_16i_a_neonasm(int16_t* target,
+ int16_t* src0,
+ unsigned int num_points);
#endif /* LV_HAVE_NEONV7 */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_16i_max_star_horizontal_16i_generic(int16_t* target, int16_t* src0, unsigned int num_points)
+static inline void volk_16i_max_star_horizontal_16i_generic(int16_t* target,
+ int16_t* src0,
+ unsigned int num_points)
{
- const unsigned int num_bytes = num_points*2;
+ const unsigned int num_bytes = num_points * 2;
- int i = 0;
+ int i = 0;
- int bound = num_bytes >> 1;
+ int bound = num_bytes >> 1;
- for(i = 0; i < bound; i += 2) {
- target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1];
- }
+ for (i = 0; i < bound; i += 2) {
+ target[i >> 1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
+ }
}
#endif /*LV_HAVE_GENERIC*/
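
The horizontal variant keeps the larger of every adjacent pair, so target only needs room for num_points / 2 outputs. A minimal sketch of calling the dispatcher (aligned buffers assumed, as elsewhere):

#include <volk/volk.h>

void max_star_horizontal_example(void)
{
    unsigned int num_points = 128; /* processed pairwise, so use an even count */
    size_t alignment = volk_get_alignment();
    int16_t* in = (int16_t*)volk_malloc(num_points * sizeof(int16_t), alignment);
    int16_t* out = (int16_t*)volk_malloc((num_points / 2) * sizeof(int16_t), alignment);

    for (unsigned int i = 0; i < num_points; i++)
        in[i] = (int16_t)((i % 7) - 3);

    volk_16i_max_star_horizontal_16i(out, in, num_points);
    /* out[k] == max(in[2k], in[2k+1]) for k < num_points / 2 */

    volk_free(in);
    volk_free(out);
}
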
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_16i_permute_and_scalar_add(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_points)
- * \endcode
+ * void volk_16i_permute_and_scalar_add(short* target, short* src0, short*
+ * permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short*
+ * scalars, unsigned int num_points) \endcode
*
* \b Inputs
* \li src0: The input vector.
#ifndef INCLUDED_volk_16i_permute_and_scalar_add_a_H
#define INCLUDED_volk_16i_permute_and_scalar_add_a_H
-#include<inttypes.h>
-#include<stdio.h>
+#include <inttypes.h>
+#include <stdio.h>
#ifdef LV_HAVE_SSE2
-#include<xmmintrin.h>
-#include<emmintrin.h>
-
-static inline void
-volk_16i_permute_and_scalar_add_a_sse2(short* target, short* src0, short* permute_indexes,
- short* cntl0, short* cntl1, short* cntl2, short* cntl3,
- short* scalars, unsigned int num_points)
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target,
+ short* src0,
+ short* permute_indexes,
+ short* cntl0,
+ short* cntl1,
+ short* cntl2,
+ short* cntl3,
+ short* scalars,
+ unsigned int num_points)
{
- const unsigned int num_bytes = num_points*2;
+ const unsigned int num_bytes = num_points * 2;
- __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
- __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars;
+ __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars;
- short* p_permute_indexes = permute_indexes;
+ short* p_permute_indexes = permute_indexes;
- p_target = (__m128i*)target;
- p_cntl0 = (__m128i*)cntl0;
- p_cntl1 = (__m128i*)cntl1;
- p_cntl2 = (__m128i*)cntl2;
- p_cntl3 = (__m128i*)cntl3;
- p_scalars = (__m128i*)scalars;
+ p_target = (__m128i*)target;
+ p_cntl0 = (__m128i*)cntl0;
+ p_cntl1 = (__m128i*)cntl1;
+ p_cntl2 = (__m128i*)cntl2;
+ p_cntl3 = (__m128i*)cntl3;
+ p_scalars = (__m128i*)scalars;
- int i = 0;
+ int i = 0;
- int bound = (num_bytes >> 4);
- int leftovers = (num_bytes >> 1) & 7;
+ int bound = (num_bytes >> 4);
+ int leftovers = (num_bytes >> 1) & 7;
- xmm0 = _mm_load_si128(p_scalars);
+ xmm0 = _mm_load_si128(p_scalars);
- xmm1 = _mm_shufflelo_epi16(xmm0, 0);
- xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
- xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
- xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
+ xmm1 = _mm_shufflelo_epi16(xmm0, 0);
+ xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
+ xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
+ xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
- xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
- xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
- xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
- xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
+ xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
+ xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
+ xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
+ xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
- for(; i < bound; ++i) {
- xmm0 = _mm_setzero_si128();
- xmm5 = _mm_setzero_si128();
- xmm6 = _mm_setzero_si128();
- xmm7 = _mm_setzero_si128();
+ for (; i < bound; ++i) {
+ xmm0 = _mm_setzero_si128();
+ xmm5 = _mm_setzero_si128();
+ xmm6 = _mm_setzero_si128();
+ xmm7 = _mm_setzero_si128();
- xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0);
- xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1);
- xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2);
- xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3);
- xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4);
- xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5);
- xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6);
- xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7);
+ xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0);
+ xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1);
+ xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2);
+ xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3);
+ xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4);
+ xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5);
+ xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6);
+ xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7);
- xmm0 = _mm_add_epi16(xmm0, xmm5);
- xmm6 = _mm_add_epi16(xmm6, xmm7);
+ xmm0 = _mm_add_epi16(xmm0, xmm5);
+ xmm6 = _mm_add_epi16(xmm6, xmm7);
- p_permute_indexes += 8;
+ p_permute_indexes += 8;
- xmm0 = _mm_add_epi16(xmm0, xmm6);
+ xmm0 = _mm_add_epi16(xmm0, xmm6);
- xmm5 = _mm_load_si128(p_cntl0);
- xmm6 = _mm_load_si128(p_cntl1);
- xmm7 = _mm_load_si128(p_cntl2);
+ xmm5 = _mm_load_si128(p_cntl0);
+ xmm6 = _mm_load_si128(p_cntl1);
+ xmm7 = _mm_load_si128(p_cntl2);
- xmm5 = _mm_and_si128(xmm5, xmm1);
- xmm6 = _mm_and_si128(xmm6, xmm2);
- xmm7 = _mm_and_si128(xmm7, xmm3);
+ xmm5 = _mm_and_si128(xmm5, xmm1);
+ xmm6 = _mm_and_si128(xmm6, xmm2);
+ xmm7 = _mm_and_si128(xmm7, xmm3);
- xmm0 = _mm_add_epi16(xmm0, xmm5);
+ xmm0 = _mm_add_epi16(xmm0, xmm5);
- xmm5 = _mm_load_si128(p_cntl3);
+ xmm5 = _mm_load_si128(p_cntl3);
- xmm6 = _mm_add_epi16(xmm6, xmm7);
+ xmm6 = _mm_add_epi16(xmm6, xmm7);
- p_cntl0 += 1;
+ p_cntl0 += 1;
- xmm5 = _mm_and_si128(xmm5, xmm4);
+ xmm5 = _mm_and_si128(xmm5, xmm4);
- xmm0 = _mm_add_epi16(xmm0, xmm6);
+ xmm0 = _mm_add_epi16(xmm0, xmm6);
- p_cntl1 += 1;
- p_cntl2 += 1;
+ p_cntl1 += 1;
+ p_cntl2 += 1;
- xmm0 = _mm_add_epi16(xmm0, xmm5);
+ xmm0 = _mm_add_epi16(xmm0, xmm5);
- p_cntl3 += 1;
+ p_cntl3 += 1;
- _mm_store_si128(p_target, xmm0);
+ _mm_store_si128(p_target, xmm0);
- p_target += 1;
- }
+ p_target += 1;
+ }
- for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
- target[i] = src0[permute_indexes[i]]
- + (cntl0[i] & scalars[0])
- + (cntl1[i] & scalars[1])
- + (cntl2[i] & scalars[2])
- + (cntl3[i] & scalars[3]);
- }
+ for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+ target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0]) +
+ (cntl1[i] & scalars[1]) + (cntl2[i] & scalars[2]) +
+ (cntl3[i] & scalars[3]);
+ }
}
#endif /*LV_HAVE_SSE2*/
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_16i_permute_and_scalar_add_generic(short* target, short* src0, short* permute_indexes,
- short* cntl0, short* cntl1, short* cntl2, short* cntl3,
- short* scalars, unsigned int num_points)
+static inline void volk_16i_permute_and_scalar_add_generic(short* target,
+ short* src0,
+ short* permute_indexes,
+ short* cntl0,
+ short* cntl1,
+ short* cntl2,
+ short* cntl3,
+ short* scalars,
+ unsigned int num_points)
{
- const unsigned int num_bytes = num_points*2;
+ const unsigned int num_bytes = num_points * 2;
- int i = 0;
+ int i = 0;
- int bound = num_bytes >> 1;
+ int bound = num_bytes >> 1;
- for(i = 0; i < bound; ++i) {
- target[i] = src0[permute_indexes[i]]
- + (cntl0[i] & scalars[0])
- + (cntl1[i] & scalars[1])
- + (cntl2[i] & scalars[2])
- + (cntl3[i] & scalars[3]);
- }
+ for (i = 0; i < bound; ++i) {
+ target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0]) +
+ (cntl1[i] & scalars[1]) + (cntl2[i] & scalars[2]) +
+ (cntl3[i] & scalars[3]);
+ }
}
#endif /*LV_HAVE_GENERIC*/
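
Each output element is one permuted input plus up to four masked scalars, exactly as the generic loop spells out. A toy invocation of the dispatcher — hypothetical values; aligned buffers and an 8-short scalars block are used because the SSE2 path loads 128 bits of scalars:

#include <string.h>
#include <volk/volk.h>

void permute_and_scalar_add_example(void)
{
    const unsigned int num_points = 16;
    const size_t al = volk_get_alignment();
    short* target = (short*)volk_malloc(num_points * sizeof(short), al);
    short* src0 = (short*)volk_malloc(num_points * sizeof(short), al);
    short* idx = (short*)volk_malloc(num_points * sizeof(short), al);
    short* cntl = (short*)volk_malloc(4 * num_points * sizeof(short), al);
    short* scalars = (short*)volk_malloc(8 * sizeof(short), al);

    for (unsigned int i = 0; i < num_points; i++) {
        src0[i] = (short)(10 * i);
        idx[i] = (short)(num_points - 1 - i); /* reverse permutation */
    }
    memset(cntl, 0xff, 2 * num_points * sizeof(short));               /* cntl0, cntl1: all bits set */
    memset(cntl + 2 * num_points, 0, 2 * num_points * sizeof(short)); /* cntl2, cntl3: zero */
    memset(scalars, 0, 8 * sizeof(short));
    scalars[0] = 1;
    scalars[1] = 2;
    scalars[2] = 4;
    scalars[3] = 8;

    /* target[i] = src0[idx[i]] + (cntl0[i] & 1) + (cntl1[i] & 2) + 0 + 0 */
    volk_16i_permute_and_scalar_add(target,
                                    src0,
                                    idx,
                                    cntl,
                                    cntl + num_points,
                                    cntl + 2 * num_points,
                                    cntl + 3 * num_points,
                                    scalars,
                                    num_points);

    volk_free(target);
    volk_free(src0);
    volk_free(idx);
    volk_free(cntl);
    volk_free(scalars);
}
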
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_16i_s32f_convert_32f(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points);
- * \endcode
+ * void volk_16i_s32f_convert_32f(float* outputVector, const int16_t* inputVector, const
+ * float scalar, unsigned int num_points); \endcode
*
* \b Inputs
* \li inputVector: The input vector of 16-bit shorts.
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_16i_s32f_convert_32f_u_avx2(float* outputVector, const int16_t* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_16i_s32f_convert_32f_u_avx2(float* outputVector,
+ const int16_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- float* outputVectorPtr = outputVector;
- __m256 invScalar = _mm256_set1_ps(1.0/scalar);
- int16_t* inputPtr = (int16_t*)inputVector;
- __m128i inputVal;
- __m256i inputVal2;
- __m256 ret;
+ float* outputVectorPtr = outputVector;
+ __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal;
+ __m256i inputVal2;
+ __m256 ret;
- for(;number < eighthPoints; number++){
+ for (; number < eighthPoints; number++) {
- // Load the 8 values
- inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+ // Load the 8 values
+ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
- // Convert
- inputVal2 = _mm256_cvtepi16_epi32(inputVal);
+ // Convert
+ inputVal2 = _mm256_cvtepi16_epi32(inputVal);
- ret = _mm256_cvtepi32_ps(inputVal2);
- ret = _mm256_mul_ps(ret, invScalar);
+ ret = _mm256_cvtepi32_ps(inputVal2);
+ ret = _mm256_mul_ps(ret, invScalar);
- _mm256_storeu_ps(outputVectorPtr, ret);
+ _mm256_storeu_ps(outputVectorPtr, ret);
- outputVectorPtr += 8;
+ outputVectorPtr += 8;
- inputPtr += 8;
- }
+ inputPtr += 8;
+ }
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- outputVector[number] =((float)(inputVector[number])) / scalar;
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ outputVector[number] = ((float)(inputVector[number])) / scalar;
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_16i_s32f_convert_32f_u_avx(float* outputVector, const int16_t* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_16i_s32f_convert_32f_u_avx(float* outputVector,
+ const int16_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- float* outputVectorPtr = outputVector;
- __m128 invScalar = _mm_set_ps1(1.0/scalar);
- int16_t* inputPtr = (int16_t*)inputVector;
- __m128i inputVal, inputVal2;
- __m128 ret;
- __m256 output;
- __m256 dummy = _mm256_setzero_ps();
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0 / scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal, inputVal2;
+ __m128 ret;
+ __m256 output;
+ __m256 dummy = _mm256_setzero_ps();
- for(;number < eighthPoints; number++){
+ for (; number < eighthPoints; number++) {
- // Load the 8 values
- //inputVal = _mm_loadu_si128((__m128i*)inputPtr);
- inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+ // Load the 8 values
+ // inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
- // Shift the input data to the right by 64 bits ( 8 bytes )
- inputVal2 = _mm_srli_si128(inputVal, 8);
+ // Shift the input data to the right by 64 bits ( 8 bytes )
+ inputVal2 = _mm_srli_si128(inputVal, 8);
- // Convert the lower 4 values into 32 bit words
- inputVal = _mm_cvtepi16_epi32(inputVal);
- inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+ // Convert the lower 4 values into 32 bit words
+ inputVal = _mm_cvtepi16_epi32(inputVal);
+ inputVal2 = _mm_cvtepi16_epi32(inputVal2);
- ret = _mm_cvtepi32_ps(inputVal);
- ret = _mm_mul_ps(ret, invScalar);
- output = _mm256_insertf128_ps(dummy, ret, 0);
+ ret = _mm_cvtepi32_ps(inputVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ output = _mm256_insertf128_ps(dummy, ret, 0);
- ret = _mm_cvtepi32_ps(inputVal2);
- ret = _mm_mul_ps(ret, invScalar);
- output = _mm256_insertf128_ps(output, ret, 1);
+ ret = _mm_cvtepi32_ps(inputVal2);
+ ret = _mm_mul_ps(ret, invScalar);
+ output = _mm256_insertf128_ps(output, ret, 1);
- _mm256_storeu_ps(outputVectorPtr, output);
+ _mm256_storeu_ps(outputVectorPtr, output);
- outputVectorPtr += 8;
+ outputVectorPtr += 8;
- inputPtr += 8;
- }
+ inputPtr += 8;
+ }
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- outputVector[number] =((float)(inputVector[number])) / scalar;
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ outputVector[number] = ((float)(inputVector[number])) / scalar;
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
-static inline void
-volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const int16_t* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector,
+ const int16_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- float* outputVectorPtr = outputVector;
- __m128 invScalar = _mm_set_ps1(1.0/scalar);
- int16_t* inputPtr = (int16_t*)inputVector;
- __m128i inputVal;
- __m128i inputVal2;
- __m128 ret;
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0 / scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal;
+ __m128i inputVal2;
+ __m128 ret;
- for(;number < eighthPoints; number++){
+ for (; number < eighthPoints; number++) {
- // Load the 8 values
- inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+ // Load the 8 values
+ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
- // Shift the input data to the right by 64 bits ( 8 bytes )
- inputVal2 = _mm_srli_si128(inputVal, 8);
+ // Shift the input data to the right by 64 bits ( 8 bytes )
+ inputVal2 = _mm_srli_si128(inputVal, 8);
- // Convert the lower 4 values into 32 bit words
- inputVal = _mm_cvtepi16_epi32(inputVal);
- inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+ // Convert the lower 4 values into 32 bit words
+ inputVal = _mm_cvtepi16_epi32(inputVal);
+ inputVal2 = _mm_cvtepi16_epi32(inputVal2);
- ret = _mm_cvtepi32_ps(inputVal);
- ret = _mm_mul_ps(ret, invScalar);
- _mm_storeu_ps(outputVectorPtr, ret);
- outputVectorPtr += 4;
+ ret = _mm_cvtepi32_ps(inputVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
- ret = _mm_cvtepi32_ps(inputVal2);
- ret = _mm_mul_ps(ret, invScalar);
- _mm_storeu_ps(outputVectorPtr, ret);
+ ret = _mm_cvtepi32_ps(inputVal2);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
- outputVectorPtr += 4;
+ outputVectorPtr += 4;
- inputPtr += 8;
- }
+ inputPtr += 8;
+ }
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- outputVector[number] =((float)(inputVector[number])) / scalar;
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ outputVector[number] = ((float)(inputVector[number])) / scalar;
+ }
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_16i_s32f_convert_32f_u_sse(float* outputVector, const int16_t* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector,
+ const int16_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- float* outputVectorPtr = outputVector;
- __m128 invScalar = _mm_set_ps1(1.0/scalar);
- int16_t* inputPtr = (int16_t*)inputVector;
- __m128 ret;
-
- for(;number < quarterPoints; number++){
- ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));
-
- ret = _mm_mul_ps(ret, invScalar);
- _mm_storeu_ps(outputVectorPtr, ret);
-
- inputPtr += 4;
- outputVectorPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- outputVector[number] = (float)(inputVector[number]) / scalar;
- }
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0 / scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128 ret;
+
+ for (; number < quarterPoints; number++) {
+ ret = _mm_set_ps((float)(inputPtr[3]),
+ (float)(inputPtr[2]),
+ (float)(inputPtr[1]),
+ (float)(inputPtr[0]));
+
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+
+ inputPtr += 4;
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ outputVector[number] = (float)(inputVector[number]) / scalar;
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_16i_s32f_convert_32f_generic(float* outputVector, const int16_t* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_16i_s32f_convert_32f_generic(float* outputVector,
+ const int16_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- float* outputVectorPtr = outputVector;
- const int16_t* inputVectorPtr = inputVector;
- unsigned int number = 0;
+ float* outputVectorPtr = outputVector;
+ const int16_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
- }
+ for (number = 0; number < num_points; number++) {
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
+ }
}
#endif /* LV_HAVE_GENERIC */
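
Every variant of this kernel divides each 16-bit sample by the same scalar, so passing 32768.0f maps full-scale shorts into roughly [-1.0, 1.0). A brief usage sketch of the dispatcher, with the same assumed helpers as the earlier examples:

#include <volk/volk.h>

void s32f_convert_example(void)
{
    unsigned int num_points = 512;
    size_t alignment = volk_get_alignment();
    int16_t* in = (int16_t*)volk_malloc(num_points * sizeof(int16_t), alignment);
    float* out = (float*)volk_malloc(num_points * sizeof(float), alignment);

    for (unsigned int i = 0; i < num_points; i++)
        in[i] = (int16_t)(64 * i - 16384);

    volk_16i_s32f_convert_32f(out, in, 32768.0f, num_points);
    /* out[i] == (float)in[i] / 32768.0f */

    volk_free(in);
    volk_free(out);
}
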
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_16i_s32f_convert_32f_neon(float* outputVector, const int16_t* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_16i_s32f_convert_32f_neon(float* outputVector,
+ const int16_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- float* outputPtr = outputVector;
- const int16_t* inputPtr = inputVector;
- unsigned int number = 0;
- unsigned int eighth_points = num_points / 8;
-
- int16x4x2_t input16;
- int32x4_t input32_0, input32_1;
- float32x4_t input_float_0, input_float_1;
- float32x4x2_t output_float;
- float32x4_t inv_scale;
-
- inv_scale = vdupq_n_f32(1.0/scalar);
-
- // the generic disassembles to a 128-bit load
- // and duplicates every instruction to operate on 64-bits
- // at a time. This is only possible with lanes, which is faster
- // than just doing a vld1_s16, but still slower.
- for(number = 0; number < eighth_points; number++){
- input16 = vld2_s16(inputPtr);
- // widen 16-bit int to 32-bit int
- input32_0 = vmovl_s16(input16.val[0]);
- input32_1 = vmovl_s16(input16.val[1]);
- // convert 32-bit int to float with scale
- input_float_0 = vcvtq_f32_s32(input32_0);
- input_float_1 = vcvtq_f32_s32(input32_1);
- output_float.val[0] = vmulq_f32(input_float_0, inv_scale);
- output_float.val[1] = vmulq_f32(input_float_1, inv_scale);
- vst2q_f32(outputPtr, output_float);
- inputPtr += 8;
- outputPtr += 8;
- }
-
- for(number = eighth_points*8; number < num_points; number++){
- *outputPtr++ = ((float)(*inputPtr++)) / scalar;
- }
+ float* outputPtr = outputVector;
+ const int16_t* inputPtr = inputVector;
+ unsigned int number = 0;
+ unsigned int eighth_points = num_points / 8;
+
+ int16x4x2_t input16;
+ int32x4_t input32_0, input32_1;
+ float32x4_t input_float_0, input_float_1;
+ float32x4x2_t output_float;
+ float32x4_t inv_scale;
+
+ inv_scale = vdupq_n_f32(1.0 / scalar);
+
+ // the generic disassembles to a 128-bit load
+ // and duplicates every instruction to operate on 64-bits
+ // at a time. This is only possible with lanes, which is faster
+ // than just doing a vld1_s16, but still slower.
+ for (number = 0; number < eighth_points; number++) {
+ input16 = vld2_s16(inputPtr);
+ // widen 16-bit int to 32-bit int
+ input32_0 = vmovl_s16(input16.val[0]);
+ input32_1 = vmovl_s16(input16.val[1]);
+ // convert 32-bit int to float with scale
+ input_float_0 = vcvtq_f32_s32(input32_0);
+ input_float_1 = vcvtq_f32_s32(input32_1);
+ output_float.val[0] = vmulq_f32(input_float_0, inv_scale);
+ output_float.val[1] = vmulq_f32(input_float_1, inv_scale);
+ vst2q_f32(outputPtr, output_float);
+ inputPtr += 8;
+ outputPtr += 8;
+ }
+
+ for (number = eighth_points * 8; number < num_points; number++) {
+ *outputPtr++ = ((float)(*inputPtr++)) / scalar;
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_16i_s32f_convert_32f_a_avx2(float* outputVector, const int16_t* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_16i_s32f_convert_32f_a_avx2(float* outputVector,
+ const int16_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- float* outputVectorPtr = outputVector;
- __m256 invScalar = _mm256_set1_ps(1.0/scalar);
- int16_t* inputPtr = (int16_t*)inputVector;
- __m128i inputVal;
- __m256i inputVal2;
- __m256 ret;
+ float* outputVectorPtr = outputVector;
+ __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal;
+ __m256i inputVal2;
+ __m256 ret;
- for(;number < eighthPoints; number++){
+ for (; number < eighthPoints; number++) {
- // Load the 8 values
- inputVal = _mm_load_si128((__m128i*)inputPtr);
+ // Load the 8 values
+ inputVal = _mm_load_si128((__m128i*)inputPtr);
- // Convert
- inputVal2 = _mm256_cvtepi16_epi32(inputVal);
+ // Convert
+ inputVal2 = _mm256_cvtepi16_epi32(inputVal);
- ret = _mm256_cvtepi32_ps(inputVal2);
- ret = _mm256_mul_ps(ret, invScalar);
+ ret = _mm256_cvtepi32_ps(inputVal2);
+ ret = _mm256_mul_ps(ret, invScalar);
- _mm256_store_ps(outputVectorPtr, ret);
+ _mm256_store_ps(outputVectorPtr, ret);
- outputVectorPtr += 8;
+ outputVectorPtr += 8;
- inputPtr += 8;
- }
+ inputPtr += 8;
+ }
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- outputVector[number] =((float)(inputVector[number])) / scalar;
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ outputVector[number] = ((float)(inputVector[number])) / scalar;
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_16i_s32f_convert_32f_a_avx(float* outputVector, const int16_t* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_16i_s32f_convert_32f_a_avx(float* outputVector,
+ const int16_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- float* outputVectorPtr = outputVector;
- __m128 invScalar = _mm_set_ps1(1.0/scalar);
- int16_t* inputPtr = (int16_t*)inputVector;
- __m128i inputVal, inputVal2;
- __m128 ret;
- __m256 output;
- __m256 dummy = _mm256_setzero_ps();
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0 / scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal, inputVal2;
+ __m128 ret;
+ __m256 output;
+ __m256 dummy = _mm256_setzero_ps();
- for(;number < eighthPoints; number++){
+ for (; number < eighthPoints; number++) {
- // Load the 8 values
- //inputVal = _mm_loadu_si128((__m128i*)inputPtr);
- inputVal = _mm_load_si128((__m128i*)inputPtr);
+ // Load the 8 values
+ // inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+ inputVal = _mm_load_si128((__m128i*)inputPtr);
- // Shift the input data to the right by 64 bits ( 8 bytes )
- inputVal2 = _mm_srli_si128(inputVal, 8);
+ // Shift the input data to the right by 64 bits ( 8 bytes )
+ inputVal2 = _mm_srli_si128(inputVal, 8);
- // Convert the lower 4 values into 32 bit words
- inputVal = _mm_cvtepi16_epi32(inputVal);
- inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+ // Convert the lower 4 values into 32 bit words
+ inputVal = _mm_cvtepi16_epi32(inputVal);
+ inputVal2 = _mm_cvtepi16_epi32(inputVal2);
- ret = _mm_cvtepi32_ps(inputVal);
- ret = _mm_mul_ps(ret, invScalar);
- output = _mm256_insertf128_ps(dummy, ret, 0);
+ ret = _mm_cvtepi32_ps(inputVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ output = _mm256_insertf128_ps(dummy, ret, 0);
- ret = _mm_cvtepi32_ps(inputVal2);
- ret = _mm_mul_ps(ret, invScalar);
- output = _mm256_insertf128_ps(output, ret, 1);
+ ret = _mm_cvtepi32_ps(inputVal2);
+ ret = _mm_mul_ps(ret, invScalar);
+ output = _mm256_insertf128_ps(output, ret, 1);
- _mm256_store_ps(outputVectorPtr, output);
+ _mm256_store_ps(outputVectorPtr, output);
- outputVectorPtr += 8;
+ outputVectorPtr += 8;
- inputPtr += 8;
- }
+ inputPtr += 8;
+ }
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- outputVector[number] =((float)(inputVector[number])) / scalar;
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ outputVector[number] = ((float)(inputVector[number])) / scalar;
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
-static inline void
-volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector,
+ const int16_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- float* outputVectorPtr = outputVector;
- __m128 invScalar = _mm_set_ps1(1.0/scalar);
- int16_t* inputPtr = (int16_t*)inputVector;
- __m128i inputVal;
- __m128i inputVal2;
- __m128 ret;
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0 / scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal;
+ __m128i inputVal2;
+ __m128 ret;
- for(;number < eighthPoints; number++){
+ for (; number < eighthPoints; number++) {
- // Load the 8 values
- inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+ // Load the 8 values
+ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
- // Shift the input data to the right by 64 bits ( 8 bytes )
- inputVal2 = _mm_srli_si128(inputVal, 8);
+ // Shift the input data to the right by 64 bits ( 8 bytes )
+ inputVal2 = _mm_srli_si128(inputVal, 8);
- // Convert the lower 4 values into 32 bit words
- inputVal = _mm_cvtepi16_epi32(inputVal);
- inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+ // Convert the lower 4 values into 32 bit words
+ inputVal = _mm_cvtepi16_epi32(inputVal);
+ inputVal2 = _mm_cvtepi16_epi32(inputVal2);
- ret = _mm_cvtepi32_ps(inputVal);
- ret = _mm_mul_ps(ret, invScalar);
- _mm_storeu_ps(outputVectorPtr, ret);
- outputVectorPtr += 4;
+ ret = _mm_cvtepi32_ps(inputVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
- ret = _mm_cvtepi32_ps(inputVal2);
- ret = _mm_mul_ps(ret, invScalar);
- _mm_storeu_ps(outputVectorPtr, ret);
+ ret = _mm_cvtepi32_ps(inputVal2);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
- outputVectorPtr += 4;
+ outputVectorPtr += 4;
- inputPtr += 8;
- }
+ inputPtr += 8;
+ }
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- outputVector[number] =((float)(inputVector[number])) / scalar;
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ outputVector[number] = ((float)(inputVector[number])) / scalar;
+ }
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector,
+ const int16_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- float* outputVectorPtr = outputVector;
- __m128 invScalar = _mm_set_ps1(1.0/scalar);
- int16_t* inputPtr = (int16_t*)inputVector;
- __m128 ret;
-
- for(;number < quarterPoints; number++){
- ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));
-
- ret = _mm_mul_ps(ret, invScalar);
- _mm_storeu_ps(outputVectorPtr, ret);
-
- inputPtr += 4;
- outputVectorPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- outputVector[number] = (float)(inputVector[number]) / scalar;
- }
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0 / scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128 ret;
+
+ for (; number < quarterPoints; number++) {
+ ret = _mm_set_ps((float)(inputPtr[3]),
+ (float)(inputPtr[2]),
+ (float)(inputPtr[1]),
+ (float)(inputPtr[0]));
+
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+
+ inputPtr += 4;
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ outputVector[number] = (float)(inputVector[number]) / scalar;
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_16i_s32f_convert_32f_a_generic(float* outputVector,
+ const int16_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- float* outputVectorPtr = outputVector;
- const int16_t* inputVectorPtr = inputVector;
- unsigned int number = 0;
+ float* outputVectorPtr = outputVector;
+ const int16_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
- }
+ for (number = 0; number < num_points; number++) {
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
+ }
}
#endif /* LV_HAVE_GENERIC */
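/* Usage sketch (editorial illustration, not part of the original kernel sources):
 * the aligned protokernels above all compute outputVector[i] = (float)inputVector[i] / scalar,
 * so a caller only needs an aligned int16_t input buffer and a float output buffer of
 * num_points elements. The volk_get_alignment()/volk_malloc()/volk_free() helpers and the
 * fill values below are assumptions used only for this example.
 *
 *   unsigned int num_points = 1024;
 *   size_t alignment = volk_get_alignment();
 *   int16_t* in = (int16_t*)volk_malloc(sizeof(int16_t) * num_points, alignment);
 *   float* out = (float*)volk_malloc(sizeof(float) * num_points, alignment);
 *   for (unsigned int i = 0; i < num_points; i++)
 *       in[i] = (int16_t)(i - 512);                       // arbitrary test data
 *   volk_16i_s32f_convert_32f(out, in, 32768.f, num_points); // scale roughly into [-1, 1)
 *   volk_free(in);
 *   volk_free(out);
 */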
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_16i_x4_quad_max_star_16i(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points)
- * \endcode
+ * void volk_16i_x4_quad_max_star_16i(short* target, short* src0, short* src1, short*
+ * src2, short* src3, unsigned int num_points) \endcode
*
* \b Inputs
* \li src0: The input vector 0.
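 *
 * \b Example (editorial sketch, not from the original documentation; the volk_malloc
 * helpers and fill values are assumptions). Per the generic protokernel below, each
 * target[i] ends up holding the largest of the four inputs, selected through pairwise
 * differences:
 * \code
 *   unsigned int num_points = 64;
 *   size_t alignment = volk_get_alignment();
 *   short* src0 = (short*)volk_malloc(sizeof(short) * num_points, alignment);
 *   short* src1 = (short*)volk_malloc(sizeof(short) * num_points, alignment);
 *   short* src2 = (short*)volk_malloc(sizeof(short) * num_points, alignment);
 *   short* src3 = (short*)volk_malloc(sizeof(short) * num_points, alignment);
 *   short* target = (short*)volk_malloc(sizeof(short) * num_points, alignment);
 *   for (unsigned int i = 0; i < num_points; i++) {
 *       src0[i] = (short)i;
 *       src1[i] = (short)(num_points - i);
 *       src2[i] = (short)(i / 2);
 *       src3[i] = (short)(-(short)i);
 *   }
 *   volk_16i_x4_quad_max_star_16i(target, src0, src1, src2, src3, num_points);
 *   volk_free(src0); volk_free(src1); volk_free(src2); volk_free(src3);
 *   volk_free(target);
 * \endcode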
#ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
#define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
-#include<inttypes.h>
-#include<stdio.h>
+#include <inttypes.h>
+#include <stdio.h>
#ifdef LV_HAVE_SSE2
-#include<emmintrin.h>
+#include <emmintrin.h>
-static inline void
-volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* src0, short* src1,
- short* src2, short* src3, unsigned int num_points)
+static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target,
+ short* src0,
+ short* src1,
+ short* src2,
+ short* src3,
+ unsigned int num_points)
{
- const unsigned int num_bytes = num_points*2;
-
- int i = 0;
+ const unsigned int num_bytes = num_points * 2;
- int bound = (num_bytes >> 4);
- int bound_copy = bound;
- int leftovers = (num_bytes >> 1) & 7;
+ int i = 0;
- __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3;
- p_target = (__m128i*) target;
- p_src0 = (__m128i*)src0;
- p_src1 = (__m128i*)src1;
- p_src2 = (__m128i*)src2;
- p_src3 = (__m128i*)src3;
+ int bound = (num_bytes >> 4);
+ int bound_copy = bound;
+ int leftovers = (num_bytes >> 1) & 7;
- __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
+ __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3;
+ p_target = (__m128i*)target;
+ p_src0 = (__m128i*)src0;
+ p_src1 = (__m128i*)src1;
+ p_src2 = (__m128i*)src2;
+ p_src3 = (__m128i*)src3;
- while(bound_copy > 0) {
- xmm1 = _mm_load_si128(p_src0);
- xmm2 = _mm_load_si128(p_src1);
- xmm3 = _mm_load_si128(p_src2);
- xmm4 = _mm_load_si128(p_src3);
+ __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
- xmm5 = _mm_setzero_si128();
- xmm6 = _mm_setzero_si128();
- xmm7 = xmm1;
- xmm8 = xmm3;
+ while (bound_copy > 0) {
+ xmm1 = _mm_load_si128(p_src0);
+ xmm2 = _mm_load_si128(p_src1);
+ xmm3 = _mm_load_si128(p_src2);
+ xmm4 = _mm_load_si128(p_src3);
- xmm1 = _mm_sub_epi16(xmm2, xmm1);
+ xmm5 = _mm_setzero_si128();
+ xmm6 = _mm_setzero_si128();
+ xmm7 = xmm1;
+ xmm8 = xmm3;
- xmm3 = _mm_sub_epi16(xmm4, xmm3);
+ xmm1 = _mm_sub_epi16(xmm2, xmm1);
- xmm5 = _mm_cmpgt_epi16(xmm1, xmm5);
- xmm6 = _mm_cmpgt_epi16(xmm3, xmm6);
+ xmm3 = _mm_sub_epi16(xmm4, xmm3);
- xmm2 = _mm_and_si128(xmm5, xmm2);
- xmm4 = _mm_and_si128(xmm6, xmm4);
- xmm5 = _mm_andnot_si128(xmm5, xmm7);
- xmm6 = _mm_andnot_si128(xmm6, xmm8);
+ xmm5 = _mm_cmpgt_epi16(xmm1, xmm5);
+ xmm6 = _mm_cmpgt_epi16(xmm3, xmm6);
- xmm5 = _mm_add_epi16(xmm2, xmm5);
- xmm6 = _mm_add_epi16(xmm4, xmm6);
+ xmm2 = _mm_and_si128(xmm5, xmm2);
+ xmm4 = _mm_and_si128(xmm6, xmm4);
+ xmm5 = _mm_andnot_si128(xmm5, xmm7);
+ xmm6 = _mm_andnot_si128(xmm6, xmm8);
- xmm1 = _mm_xor_si128(xmm1, xmm1);
- xmm2 = xmm5;
- xmm5 = _mm_sub_epi16(xmm6, xmm5);
- p_src0 += 1;
- bound_copy -= 1;
+ xmm5 = _mm_add_epi16(xmm2, xmm5);
+ xmm6 = _mm_add_epi16(xmm4, xmm6);
- xmm1 = _mm_cmpgt_epi16(xmm5, xmm1);
- p_src1 += 1;
+ xmm1 = _mm_xor_si128(xmm1, xmm1);
+ xmm2 = xmm5;
+ xmm5 = _mm_sub_epi16(xmm6, xmm5);
+ p_src0 += 1;
+ bound_copy -= 1;
- xmm6 = _mm_and_si128(xmm1, xmm6);
+ xmm1 = _mm_cmpgt_epi16(xmm5, xmm1);
+ p_src1 += 1;
- xmm1 = _mm_andnot_si128(xmm1, xmm2);
- p_src2 += 1;
+ xmm6 = _mm_and_si128(xmm1, xmm6);
- xmm1 = _mm_add_epi16(xmm6, xmm1);
- p_src3 += 1;
+ xmm1 = _mm_andnot_si128(xmm1, xmm2);
+ p_src2 += 1;
- _mm_store_si128(p_target, xmm1);
- p_target += 1;
+ xmm1 = _mm_add_epi16(xmm6, xmm1);
+ p_src3 += 1;
- }
+ _mm_store_si128(p_target, xmm1);
+ p_target += 1;
+ }
- /*__VOLK_ASM __VOLK_VOLATILE
- (
- "volk_16i_x4_quad_max_star_16i_a_sse2_L1:\n\t"
- "cmp $0, %[bound]\n\t"
- "je volk_16i_x4_quad_max_star_16i_a_sse2_END\n\t"
+ /*__VOLK_ASM __VOLK_VOLATILE
+ (
+ "volk_16i_x4_quad_max_star_16i_a_sse2_L1:\n\t"
+ "cmp $0, %[bound]\n\t"
+ "je volk_16i_x4_quad_max_star_16i_a_sse2_END\n\t"
- "movaps (%[src0]), %%xmm1\n\t"
- "movaps (%[src1]), %%xmm2\n\t"
- "movaps (%[src2]), %%xmm3\n\t"
- "movaps (%[src3]), %%xmm4\n\t"
+ "movaps (%[src0]), %%xmm1\n\t"
+ "movaps (%[src1]), %%xmm2\n\t"
+ "movaps (%[src2]), %%xmm3\n\t"
+ "movaps (%[src3]), %%xmm4\n\t"
- "pxor %%xmm5, %%xmm5\n\t"
- "pxor %%xmm6, %%xmm6\n\t"
- "movaps %%xmm1, %%xmm7\n\t"
- "movaps %%xmm3, %%xmm8\n\t"
- "psubw %%xmm2, %%xmm1\n\t"
- "psubw %%xmm4, %%xmm3\n\t"
+ "pxor %%xmm5, %%xmm5\n\t"
+ "pxor %%xmm6, %%xmm6\n\t"
+ "movaps %%xmm1, %%xmm7\n\t"
+ "movaps %%xmm3, %%xmm8\n\t"
+ "psubw %%xmm2, %%xmm1\n\t"
+ "psubw %%xmm4, %%xmm3\n\t"
- "pcmpgtw %%xmm1, %%xmm5\n\t"
- "pcmpgtw %%xmm3, %%xmm6\n\t"
+ "pcmpgtw %%xmm1, %%xmm5\n\t"
+ "pcmpgtw %%xmm3, %%xmm6\n\t"
- "pand %%xmm5, %%xmm2\n\t"
- "pand %%xmm6, %%xmm4\n\t"
- "pandn %%xmm7, %%xmm5\n\t"
- "pandn %%xmm8, %%xmm6\n\t"
+ "pand %%xmm5, %%xmm2\n\t"
+ "pand %%xmm6, %%xmm4\n\t"
+ "pandn %%xmm7, %%xmm5\n\t"
+ "pandn %%xmm8, %%xmm6\n\t"
- "paddw %%xmm2, %%xmm5\n\t"
- "paddw %%xmm4, %%xmm6\n\t"
+ "paddw %%xmm2, %%xmm5\n\t"
+ "paddw %%xmm4, %%xmm6\n\t"
- "pxor %%xmm1, %%xmm1\n\t"
- "movaps %%xmm5, %%xmm2\n\t"
+ "pxor %%xmm1, %%xmm1\n\t"
+ "movaps %%xmm5, %%xmm2\n\t"
- "psubw %%xmm6, %%xmm5\n\t"
- "add $16, %[src0]\n\t"
- "add $-1, %[bound]\n\t"
+ "psubw %%xmm6, %%xmm5\n\t"
+ "add $16, %[src0]\n\t"
+ "add $-1, %[bound]\n\t"
- "pcmpgtw %%xmm5, %%xmm1\n\t"
- "add $16, %[src1]\n\t"
+ "pcmpgtw %%xmm5, %%xmm1\n\t"
+ "add $16, %[src1]\n\t"
- "pand %%xmm1, %%xmm6\n\t"
+ "pand %%xmm1, %%xmm6\n\t"
- "pandn %%xmm2, %%xmm1\n\t"
- "add $16, %[src2]\n\t"
+ "pandn %%xmm2, %%xmm1\n\t"
+ "add $16, %[src2]\n\t"
- "paddw %%xmm6, %%xmm1\n\t"
- "add $16, %[src3]\n\t"
+ "paddw %%xmm6, %%xmm1\n\t"
+ "add $16, %[src3]\n\t"
- "movaps %%xmm1, (%[target])\n\t"
- "addw $16, %[target]\n\t"
- "jmp volk_16i_x4_quad_max_star_16i_a_sse2_L1\n\t"
+ "movaps %%xmm1, (%[target])\n\t"
+ "addw $16, %[target]\n\t"
+ "jmp volk_16i_x4_quad_max_star_16i_a_sse2_L1\n\t"
- "volk_16i_x4_quad_max_star_16i_a_sse2_END:\n\t"
- :
- :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target)
- :
- );
- */
+ "volk_16i_x4_quad_max_star_16i_a_sse2_END:\n\t"
+ :
+ :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2),
+ [src3]"r"(src3), [target]"r"(target)
+ :
+ );
+ */
- short temp0 = 0;
- short temp1 = 0;
- for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
- temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
- temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i];
- target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
- }
- return;
+ short temp0 = 0;
+ short temp1 = 0;
+ for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+ temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
+ temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
+ target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
+ }
+ return;
}
#endif /*LV_HAVE_SSE2*/
#include <arm_neon.h>
-static inline void
-volk_16i_x4_quad_max_star_16i_neon(short* target, short* src0, short* src1,
- short* src2, short* src3, unsigned int num_points)
+static inline void volk_16i_x4_quad_max_star_16i_neon(short* target,
+ short* src0,
+ short* src1,
+ short* src2,
+ short* src3,
+ unsigned int num_points)
{
- const unsigned int eighth_points = num_points / 8;
- unsigned i;
-
- int16x8_t src0_vec, src1_vec, src2_vec, src3_vec;
- int16x8_t diff12, diff34;
- int16x8_t comp0, comp1, comp2, comp3;
- int16x8_t result1_vec, result2_vec;
- int16x8_t zeros;
- zeros = vdupq_n_s16(0);
- for(i=0; i < eighth_points; ++i) {
- src0_vec = vld1q_s16(src0);
- src1_vec = vld1q_s16(src1);
- src2_vec = vld1q_s16(src2);
- src3_vec = vld1q_s16(src3);
- diff12 = vsubq_s16(src0_vec, src1_vec);
- diff34 = vsubq_s16(src2_vec, src3_vec);
- comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
- comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
- comp2 = (int16x8_t)vcgeq_s16(diff34, zeros);
- comp3 = (int16x8_t)vcltq_s16(diff34, zeros);
- comp0 = vandq_s16(src0_vec, comp0);
- comp1 = vandq_s16(src1_vec, comp1);
- comp2 = vandq_s16(src2_vec, comp2);
- comp3 = vandq_s16(src3_vec, comp3);
-
- result1_vec = vaddq_s16(comp0, comp1);
- result2_vec = vaddq_s16(comp2, comp3);
-
- diff12 = vsubq_s16(result1_vec, result2_vec);
- comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
- comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
- comp0 = vandq_s16(result1_vec, comp0);
- comp1 = vandq_s16(result2_vec, comp1);
- result1_vec = vaddq_s16(comp0, comp1);
- vst1q_s16(target, result1_vec);
- src0 += 8;
- src1 += 8;
- src2 += 8;
- src3 += 8;
- target += 8;
+ const unsigned int eighth_points = num_points / 8;
+ unsigned i;
+
+ int16x8_t src0_vec, src1_vec, src2_vec, src3_vec;
+ int16x8_t diff12, diff34;
+ int16x8_t comp0, comp1, comp2, comp3;
+ int16x8_t result1_vec, result2_vec;
+ int16x8_t zeros;
+ zeros = vdupq_n_s16(0);
+ for (i = 0; i < eighth_points; ++i) {
+ src0_vec = vld1q_s16(src0);
+ src1_vec = vld1q_s16(src1);
+ src2_vec = vld1q_s16(src2);
+ src3_vec = vld1q_s16(src3);
+ diff12 = vsubq_s16(src0_vec, src1_vec);
+ diff34 = vsubq_s16(src2_vec, src3_vec);
+ comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
+ comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
+ comp2 = (int16x8_t)vcgeq_s16(diff34, zeros);
+ comp3 = (int16x8_t)vcltq_s16(diff34, zeros);
+ comp0 = vandq_s16(src0_vec, comp0);
+ comp1 = vandq_s16(src1_vec, comp1);
+ comp2 = vandq_s16(src2_vec, comp2);
+ comp3 = vandq_s16(src3_vec, comp3);
+
+ result1_vec = vaddq_s16(comp0, comp1);
+ result2_vec = vaddq_s16(comp2, comp3);
+
+ diff12 = vsubq_s16(result1_vec, result2_vec);
+ comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
+ comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
+ comp0 = vandq_s16(result1_vec, comp0);
+ comp1 = vandq_s16(result2_vec, comp1);
+ result1_vec = vaddq_s16(comp0, comp1);
+ vst1q_s16(target, result1_vec);
+ src0 += 8;
+ src1 += 8;
+ src2 += 8;
+ src3 += 8;
+ target += 8;
}
- short temp0 = 0;
- short temp1 = 0;
- for(i=eighth_points*8; i < num_points; ++i) {
- temp0 = ((short)(*src0 - *src1) > 0) ? *src0 : *src1;
- temp1 = ((short)(*src2 - *src3) > 0) ? *src2 : *src3;
- *target++ = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
- src0++;
- src1++;
- src2++;
- src3++;
- }
+ short temp0 = 0;
+ short temp1 = 0;
+ for (i = eighth_points * 8; i < num_points; ++i) {
+ temp0 = ((short)(*src0 - *src1) > 0) ? *src0 : *src1;
+ temp1 = ((short)(*src2 - *src3) > 0) ? *src2 : *src3;
+ *target++ = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
+ src0++;
+ src1++;
+ src2++;
+ src3++;
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_16i_x4_quad_max_star_16i_generic(short* target, short* src0, short* src1,
- short* src2, short* src3, unsigned int num_points)
+static inline void volk_16i_x4_quad_max_star_16i_generic(short* target,
+ short* src0,
+ short* src1,
+ short* src2,
+ short* src3,
+ unsigned int num_points)
{
- const unsigned int num_bytes = num_points*2;
+ const unsigned int num_bytes = num_points * 2;
- int i = 0;
+ int i = 0;
- int bound = num_bytes >> 1;
+ int bound = num_bytes >> 1;
- short temp0 = 0;
- short temp1 = 0;
- for(i = 0; i < bound; ++i) {
- temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
- temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i];
- target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
- }
+ short temp0 = 0;
+ short temp1 = 0;
+ for (i = 0; i < bound; ++i) {
+ temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
+ temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
+ target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
+ }
}
#endif /*LV_HAVE_GENERIC*/
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_16i_x5_add_quad_16i_x4(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_points);
- * \endcode
+ * void volk_16i_x5_add_quad_16i_x4(short* target0, short* target1, short* target2, short*
+ * target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int
+ * num_points); \endcode
*
* \b Inputs
* \li src0: The input vector 0.
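 *
 * \b Example (editorial sketch, not from the original documentation; the volk_malloc
 * helpers and fill values are assumptions). The generic protokernel below adds the
 * common vector src0 to each of src1..src4, producing the four sums in one call:
 * \code
 *   unsigned int num_points = 128;
 *   size_t alignment = volk_get_alignment();
 *   short *src[5], *target[4];
 *   for (int k = 0; k < 5; k++)
 *       src[k] = (short*)volk_malloc(sizeof(short) * num_points, alignment);
 *   for (int k = 0; k < 4; k++)
 *       target[k] = (short*)volk_malloc(sizeof(short) * num_points, alignment);
 *   for (unsigned int i = 0; i < num_points; i++)
 *       for (int k = 0; k < 5; k++)
 *           src[k][i] = (short)(k * 100 + i);              // arbitrary test data
 *   volk_16i_x5_add_quad_16i_x4(target[0], target[1], target[2], target[3],
 *                               src[0], src[1], src[2], src[3], src[4],
 *                               num_points);
 *   // target[k][i] == src[0][i] + src[k + 1][i] for k = 0..3
 *   for (int k = 0; k < 5; k++) volk_free(src[k]);
 *   for (int k = 0; k < 4; k++) volk_free(target[k]);
 * \endcode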
#ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
#define INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
-#include<inttypes.h>
-#include<stdio.h>
+#include <inttypes.h>
+#include <stdio.h>
#ifdef LV_HAVE_SSE2
-#include<xmmintrin.h>
-#include<emmintrin.h>
+#include <emmintrin.h>
+#include <xmmintrin.h>
-static inline void
-volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* target1, short* target2, short* target3,
- short* src0, short* src1, short* src2, short* src3, short* src4,
- unsigned int num_points)
+static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0,
+ short* target1,
+ short* target2,
+ short* target3,
+ short* src0,
+ short* src1,
+ short* src2,
+ short* src3,
+ short* src4,
+ unsigned int num_points)
{
- const unsigned int num_bytes = num_points*2;
-
- __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
- __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, *p_src3, *p_src4;
- p_target0 = (__m128i*)target0;
- p_target1 = (__m128i*)target1;
- p_target2 = (__m128i*)target2;
- p_target3 = (__m128i*)target3;
-
- p_src0 = (__m128i*)src0;
- p_src1 = (__m128i*)src1;
- p_src2 = (__m128i*)src2;
- p_src3 = (__m128i*)src3;
- p_src4 = (__m128i*)src4;
-
- int i = 0;
-
- int bound = (num_bytes >> 4);
- int leftovers = (num_bytes >> 1) & 7;
-
- for(; i < bound; ++i) {
- xmm0 = _mm_load_si128(p_src0);
- xmm1 = _mm_load_si128(p_src1);
- xmm2 = _mm_load_si128(p_src2);
- xmm3 = _mm_load_si128(p_src3);
- xmm4 = _mm_load_si128(p_src4);
-
- p_src0 += 1;
- p_src1 += 1;
-
- xmm1 = _mm_add_epi16(xmm0, xmm1);
- xmm2 = _mm_add_epi16(xmm0, xmm2);
- xmm3 = _mm_add_epi16(xmm0, xmm3);
- xmm4 = _mm_add_epi16(xmm0, xmm4);
-
-
- p_src2 += 1;
- p_src3 += 1;
- p_src4 += 1;
-
- _mm_store_si128(p_target0, xmm1);
- _mm_store_si128(p_target1, xmm2);
- _mm_store_si128(p_target2, xmm3);
- _mm_store_si128(p_target3, xmm4);
-
- p_target0 += 1;
- p_target1 += 1;
- p_target2 += 1;
- p_target3 += 1;
- }
- /*__VOLK_ASM __VOLK_VOLATILE
- (
- ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1:\n\t"
- "cmp $0, %[bound]\n\t"
- "je .%=volk_16i_x5_add_quad_16i_x4_a_sse2_END\n\t"
- "movaps (%[src0]), %%xmm1\n\t"
- "movaps (%[src1]), %%xmm2\n\t"
- "movaps (%[src2]), %%xmm3\n\t"
- "movaps (%[src3]), %%xmm4\n\t"
- "movaps (%[src4]), %%xmm5\n\t"
- "add $16, %[src0]\n\t"
- "add $16, %[src1]\n\t"
- "add $16, %[src2]\n\t"
- "add $16, %[src3]\n\t"
- "add $16, %[src4]\n\t"
- "paddw %%xmm1, %%xmm2\n\t"
- "paddw %%xmm1, %%xmm3\n\t"
- "paddw %%xmm1, %%xmm4\n\t"
- "paddw %%xmm1, %%xmm5\n\t"
- "add $-1, %[bound]\n\t"
- "movaps %%xmm2, (%[target0])\n\t"
- "movaps %%xmm3, (%[target1])\n\t"
- "movaps %%xmm4, (%[target2])\n\t"
- "movaps %%xmm5, (%[target3])\n\t"
- "add $16, %[target0]\n\t"
- "add $16, %[target1]\n\t"
- "add $16, %[target2]\n\t"
- "add $16, %[target3]\n\t"
- "jmp .%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1\n\t"
- ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_END:\n\t"
- :
- :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3)
- :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
- */
-
- for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
- target0[i] = src0[i] + src1[i];
- target1[i] = src0[i] + src2[i];
- target2[i] = src0[i] + src3[i];
- target3[i] = src0[i] + src4[i];
- }
+ const unsigned int num_bytes = num_points * 2;
+
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
+ __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2,
+ *p_src3, *p_src4;
+ p_target0 = (__m128i*)target0;
+ p_target1 = (__m128i*)target1;
+ p_target2 = (__m128i*)target2;
+ p_target3 = (__m128i*)target3;
+
+ p_src0 = (__m128i*)src0;
+ p_src1 = (__m128i*)src1;
+ p_src2 = (__m128i*)src2;
+ p_src3 = (__m128i*)src3;
+ p_src4 = (__m128i*)src4;
+
+ int i = 0;
+
+ int bound = (num_bytes >> 4);
+ int leftovers = (num_bytes >> 1) & 7;
+
+ for (; i < bound; ++i) {
+ xmm0 = _mm_load_si128(p_src0);
+ xmm1 = _mm_load_si128(p_src1);
+ xmm2 = _mm_load_si128(p_src2);
+ xmm3 = _mm_load_si128(p_src3);
+ xmm4 = _mm_load_si128(p_src4);
+
+ p_src0 += 1;
+ p_src1 += 1;
+
+ xmm1 = _mm_add_epi16(xmm0, xmm1);
+ xmm2 = _mm_add_epi16(xmm0, xmm2);
+ xmm3 = _mm_add_epi16(xmm0, xmm3);
+ xmm4 = _mm_add_epi16(xmm0, xmm4);
+
+
+ p_src2 += 1;
+ p_src3 += 1;
+ p_src4 += 1;
+
+ _mm_store_si128(p_target0, xmm1);
+ _mm_store_si128(p_target1, xmm2);
+ _mm_store_si128(p_target2, xmm3);
+ _mm_store_si128(p_target3, xmm4);
+
+ p_target0 += 1;
+ p_target1 += 1;
+ p_target2 += 1;
+ p_target3 += 1;
+ }
+ /*__VOLK_ASM __VOLK_VOLATILE
+ (
+ ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1:\n\t"
+ "cmp $0, %[bound]\n\t"
+ "je .%=volk_16i_x5_add_quad_16i_x4_a_sse2_END\n\t"
+ "movaps (%[src0]), %%xmm1\n\t"
+ "movaps (%[src1]), %%xmm2\n\t"
+ "movaps (%[src2]), %%xmm3\n\t"
+ "movaps (%[src3]), %%xmm4\n\t"
+ "movaps (%[src4]), %%xmm5\n\t"
+ "add $16, %[src0]\n\t"
+ "add $16, %[src1]\n\t"
+ "add $16, %[src2]\n\t"
+ "add $16, %[src3]\n\t"
+ "add $16, %[src4]\n\t"
+ "paddw %%xmm1, %%xmm2\n\t"
+ "paddw %%xmm1, %%xmm3\n\t"
+ "paddw %%xmm1, %%xmm4\n\t"
+ "paddw %%xmm1, %%xmm5\n\t"
+ "add $-1, %[bound]\n\t"
+ "movaps %%xmm2, (%[target0])\n\t"
+ "movaps %%xmm3, (%[target1])\n\t"
+ "movaps %%xmm4, (%[target2])\n\t"
+ "movaps %%xmm5, (%[target3])\n\t"
+ "add $16, %[target0]\n\t"
+ "add $16, %[target1]\n\t"
+ "add $16, %[target2]\n\t"
+ "add $16, %[target3]\n\t"
+ "jmp .%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1\n\t"
+ ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_END:\n\t"
+ :
+ :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2),
+ [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1),
+ [target2]"r"(target2), [target3]"r"(target3)
+ :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ */
+
+ for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+ target0[i] = src0[i] + src1[i];
+ target1[i] = src0[i] + src2[i];
+ target2[i] = src0[i] + src3[i];
+ target3[i] = src0[i] + src4[i];
+ }
}
#endif /*LV_HAVE_SSE2*/
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_16i_x5_add_quad_16i_x4_neon(short* target0, short* target1, short* target2, short* target3,
- short* src0, short* src1, short* src2, short* src3, short* src4,
- unsigned int num_points)
+static inline void volk_16i_x5_add_quad_16i_x4_neon(short* target0,
+ short* target1,
+ short* target2,
+ short* target3,
+ short* src0,
+ short* src1,
+ short* src2,
+ short* src3,
+ short* src4,
+ unsigned int num_points)
{
- const unsigned int eighth_points = num_points / 8;
- unsigned int number = 0;
-
- int16x8_t src0_vec, src1_vec, src2_vec, src3_vec, src4_vec;
- int16x8_t target0_vec, target1_vec, target2_vec, target3_vec;
- for(number = 0; number < eighth_points; ++number) {
- src0_vec = vld1q_s16(src0);
- src1_vec = vld1q_s16(src1);
- src2_vec = vld1q_s16(src2);
- src3_vec = vld1q_s16(src3);
- src4_vec = vld1q_s16(src4);
-
- target0_vec = vaddq_s16(src0_vec , src1_vec);
- target1_vec = vaddq_s16(src0_vec , src2_vec);
- target2_vec = vaddq_s16(src0_vec , src3_vec);
- target3_vec = vaddq_s16(src0_vec , src4_vec);
-
- vst1q_s16(target0, target0_vec);
- vst1q_s16(target1, target1_vec);
- vst1q_s16(target2, target2_vec);
- vst1q_s16(target3, target3_vec);
- src0 += 8;
- src1 += 8;
- src2 += 8;
- src3 += 8;
- src4 += 8;
- target0 += 8;
- target1 += 8;
- target2 += 8;
- target3 += 8;
- }
-
- for(number = eighth_points * 8; number < num_points; ++number) {
- *target0++ = *src0 + *src1++;
- *target1++ = *src0 + *src2++;
- *target2++ = *src0 + *src3++;
- *target3++ = *src0++ + *src4++;
- }
+ const unsigned int eighth_points = num_points / 8;
+ unsigned int number = 0;
+
+ int16x8_t src0_vec, src1_vec, src2_vec, src3_vec, src4_vec;
+ int16x8_t target0_vec, target1_vec, target2_vec, target3_vec;
+ for (number = 0; number < eighth_points; ++number) {
+ src0_vec = vld1q_s16(src0);
+ src1_vec = vld1q_s16(src1);
+ src2_vec = vld1q_s16(src2);
+ src3_vec = vld1q_s16(src3);
+ src4_vec = vld1q_s16(src4);
+
+ target0_vec = vaddq_s16(src0_vec, src1_vec);
+ target1_vec = vaddq_s16(src0_vec, src2_vec);
+ target2_vec = vaddq_s16(src0_vec, src3_vec);
+ target3_vec = vaddq_s16(src0_vec, src4_vec);
+
+ vst1q_s16(target0, target0_vec);
+ vst1q_s16(target1, target1_vec);
+ vst1q_s16(target2, target2_vec);
+ vst1q_s16(target3, target3_vec);
+ src0 += 8;
+ src1 += 8;
+ src2 += 8;
+ src3 += 8;
+ src4 += 8;
+ target0 += 8;
+ target1 += 8;
+ target2 += 8;
+ target3 += 8;
+ }
+
+ for (number = eighth_points * 8; number < num_points; ++number) {
+ *target0++ = *src0 + *src1++;
+ *target1++ = *src0 + *src2++;
+ *target2++ = *src0 + *src3++;
+ *target3++ = *src0++ + *src4++;
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_16i_x5_add_quad_16i_x4_generic(short* target0, short* target1, short* target2, short* target3,
- short* src0, short* src1, short* src2, short* src3, short* src4,
- unsigned int num_points)
+static inline void volk_16i_x5_add_quad_16i_x4_generic(short* target0,
+ short* target1,
+ short* target2,
+ short* target3,
+ short* src0,
+ short* src1,
+ short* src2,
+ short* src3,
+ short* src4,
+ unsigned int num_points)
{
- const unsigned int num_bytes = num_points*2;
+ const unsigned int num_bytes = num_points * 2;
- int i = 0;
+ int i = 0;
- int bound = num_bytes >> 1;
+ int bound = num_bytes >> 1;
- for(i = 0; i < bound; ++i) {
- target0[i] = src0[i] + src1[i];
- target1[i] = src0[i] + src2[i];
- target2[i] = src0[i] + src3[i];
- target3[i] = src0[i] + src4[i];
- }
+ for (i = 0; i < bound; ++i) {
+ target0[i] = src0[i] + src1[i];
+ target1[i] = src0[i] + src2[i];
+ target2[i] = src0[i] + src3[i];
+ target3[i] = src0[i] + src4[i];
+ }
}
#endif /* LV_HAVE_GENERIC */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_16ic_convert_32fc(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
- * \endcode
+ * void volk_16ic_convert_32fc(lv_32fc_t* outputVector, const lv_16sc_t* inputVector,
+ * unsigned int num_points) \endcode
*
* \b Inputs
* \li inputVector: The complex 16-bit integer input data buffer.
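 *
 * \b Example (editorial sketch, not from the original documentation; the volk_malloc
 * helpers and fill values are assumptions). Each lv_16sc_t sample is widened to an
 * lv_32fc_t without scaling, as in the generic protokernel below:
 * \code
 *   unsigned int num_points = 256;
 *   size_t alignment = volk_get_alignment();
 *   lv_16sc_t* in = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t) * num_points, alignment);
 *   lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * num_points, alignment);
 *   int16_t* in_iq = (int16_t*)in;              // interleaved I/Q view of the input
 *   for (unsigned int i = 0; i < num_points; i++) {
 *       in_iq[2 * i] = (int16_t)i;              // I
 *       in_iq[2 * i + 1] = (int16_t)(-(int)i);  // Q
 *   }
 *   volk_16ic_convert_32fc(out, in, num_points);
 *   volk_free(in);
 *   volk_free(out);
 * \endcode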
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void volk_16ic_convert_32fc_a_avx2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
+static inline void volk_16ic_convert_32fc_a_avx2(lv_32fc_t* outputVector,
+ const lv_16sc_t* inputVector,
+ unsigned int num_points)
{
const unsigned int avx_iters = num_points / 8;
unsigned int number = 0;
__m256i outValInt;
__m128i cplxValue;
- for(number = 0; number < avx_iters; number++)
- {
- cplxValue = _mm_load_si128((__m128i*)complexVectorPtr);
- complexVectorPtr += 8;
-
- outValInt = _mm256_cvtepi16_epi32(cplxValue);
- outVal = _mm256_cvtepi32_ps(outValInt);
- _mm256_store_ps((float*)outputVectorPtr, outVal);
+ for (number = 0; number < avx_iters; number++) {
+ cplxValue = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 8;
- outputVectorPtr += 8;
- }
+ outValInt = _mm256_cvtepi16_epi32(cplxValue);
+ outVal = _mm256_cvtepi32_ps(outValInt);
+ _mm256_store_ps((float*)outputVectorPtr, outVal);
+
+ outputVectorPtr += 8;
+ }
number = avx_iters * 8;
- for(; number < num_points*2; number++)
- {
- *outputVectorPtr++ = (float)*complexVectorPtr++;
- }
+ for (; number < num_points * 2; number++) {
+ *outputVectorPtr++ = (float)*complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_GENERIC
-static inline void volk_16ic_convert_32fc_generic(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
+static inline void volk_16ic_convert_32fc_generic(lv_32fc_t* outputVector,
+ const lv_16sc_t* inputVector,
+ unsigned int num_points)
{
unsigned int i;
- for(i = 0; i < num_points; i++)
- {
- outputVector[i] = lv_cmake((float)lv_creal(inputVector[i]), (float)lv_cimag(inputVector[i]));
- }
+ for (i = 0; i < num_points; i++) {
+ outputVector[i] =
+ lv_cmake((float)lv_creal(inputVector[i]), (float)lv_cimag(inputVector[i]));
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void volk_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
+static inline void volk_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector,
+ const lv_16sc_t* inputVector,
+ unsigned int num_points)
{
const unsigned int sse_iters = num_points / 2;
__m128 a;
unsigned int number;
- for(number = 0; number < sse_iters; number++)
- {
- a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
- _mm_store_ps((float*)_out, a);
- _in += 2;
- _out += 2;
- }
- if (num_points & 1)
- {
- *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
- _in++;
- }
+ for (number = 0; number < sse_iters; number++) {
+ a = _mm_set_ps(
+ (float)(lv_cimag(_in[1])),
+ (float)(lv_creal(_in[1])),
+ (float)(lv_cimag(_in[0])),
+ (float)(lv_creal(
+ _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+ _mm_store_ps((float*)_out, a);
+ _in += 2;
+ _out += 2;
+ }
+ if (num_points & 1) {
+ *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
+ _in++;
+ }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void volk_16ic_convert_32fc_a_avx(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
+static inline void volk_16ic_convert_32fc_a_avx(lv_32fc_t* outputVector,
+ const lv_16sc_t* inputVector,
+ unsigned int num_points)
{
const unsigned int sse_iters = num_points / 4;
__m256 a;
unsigned int i, number;
- for(number = 0; number < sse_iters; number++)
- {
- a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
- _mm256_store_ps((float*)_out, a);
- _in += 4;
- _out += 4;
- }
+ for (number = 0; number < sse_iters; number++) {
+ a = _mm256_set_ps(
+ (float)(lv_cimag(_in[3])),
+ (float)(lv_creal(_in[3])),
+ (float)(lv_cimag(_in[2])),
+ (float)(lv_creal(_in[2])),
+ (float)(lv_cimag(_in[1])),
+ (float)(lv_creal(_in[1])),
+ (float)(lv_cimag(_in[0])),
+ (float)(lv_creal(
+ _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+ _mm256_store_ps((float*)_out, a);
+ _in += 4;
+ _out += 4;
+ }
_mm256_zeroupper();
- for (i = 0; i < (num_points % 4); ++i)
- {
- *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
- _in++;
- }
+ for (i = 0; i < (num_points % 4); ++i) {
+ *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
+ _in++;
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
+static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector,
+ const lv_16sc_t* inputVector,
+ unsigned int num_points)
{
const unsigned int sse_iters = num_points / 2;
float32x4_t f32x4;
unsigned int i, number;
- for(number = 0; number < sse_iters; number++)
- {
- a16x4 = vld1_s16((const int16_t*)_in);
- __VOLK_PREFETCH(_in + 4);
- a32x4 = vmovl_s16(a16x4);
- f32x4 = vcvtq_f32_s32(a32x4);
- vst1q_f32((float32_t*)_out, f32x4);
- _in += 2;
- _out += 2;
- }
- for (i = 0; i < (num_points % 2); ++i)
- {
- *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
- _in++;
- }
+ for (number = 0; number < sse_iters; number++) {
+ a16x4 = vld1_s16((const int16_t*)_in);
+ __VOLK_PREFETCH(_in + 4);
+ a32x4 = vmovl_s16(a16x4);
+ f32x4 = vcvtq_f32_s32(a32x4);
+ vst1q_f32((float32_t*)_out, f32x4);
+ _in += 2;
+ _out += 2;
+ }
+ for (i = 0; i < (num_points % 2); ++i) {
+ *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
+ _in++;
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void volk_16ic_convert_32fc_u_avx2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
+static inline void volk_16ic_convert_32fc_u_avx2(lv_32fc_t* outputVector,
+ const lv_16sc_t* inputVector,
+ unsigned int num_points)
{
const unsigned int avx_iters = num_points / 8;
unsigned int number = 0;
__m256i outValInt;
__m128i cplxValue;
- for(number = 0; number < avx_iters; number++)
- {
- cplxValue = _mm_loadu_si128((__m128i*)complexVectorPtr);
- complexVectorPtr += 8;
-
- outValInt = _mm256_cvtepi16_epi32(cplxValue);
- outVal = _mm256_cvtepi32_ps(outValInt);
- _mm256_storeu_ps((float*)outputVectorPtr, outVal);
+ for (number = 0; number < avx_iters; number++) {
+ cplxValue = _mm_loadu_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 8;
+
+ outValInt = _mm256_cvtepi16_epi32(cplxValue);
+ outVal = _mm256_cvtepi32_ps(outValInt);
+ _mm256_storeu_ps((float*)outputVectorPtr, outVal);
- outputVectorPtr += 8;
- }
+ outputVectorPtr += 8;
+ }
number = avx_iters * 8;
- for(; number < num_points*2; number++)
- {
- *outputVectorPtr++ = (float)*complexVectorPtr++;
- }
+ for (; number < num_points * 2; number++) {
+ *outputVectorPtr++ = (float)*complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void volk_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
+static inline void volk_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector,
+ const lv_16sc_t* inputVector,
+ unsigned int num_points)
{
const unsigned int sse_iters = num_points / 2;
__m128 a;
unsigned int number;
- for(number = 0; number < sse_iters; number++)
- {
- a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
- _mm_storeu_ps((float*)_out, a);
- _in += 2;
- _out += 2;
- }
- if (num_points & 1)
- {
- *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
- _in++;
- }
+ for (number = 0; number < sse_iters; number++) {
+ a = _mm_set_ps(
+ (float)(lv_cimag(_in[1])),
+ (float)(lv_creal(_in[1])),
+ (float)(lv_cimag(_in[0])),
+ (float)(lv_creal(
+ _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+ _mm_storeu_ps((float*)_out, a);
+ _in += 2;
+ _out += 2;
+ }
+ if (num_points & 1) {
+ *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
+ _in++;
+ }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void volk_16ic_convert_32fc_u_avx(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
+static inline void volk_16ic_convert_32fc_u_avx(lv_32fc_t* outputVector,
+ const lv_16sc_t* inputVector,
+ unsigned int num_points)
{
const unsigned int sse_iters = num_points / 4;
__m256 a;
unsigned int i, number;
- for(number = 0; number < sse_iters; number++)
- {
- a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
- _mm256_storeu_ps((float*)_out, a);
- _in += 4;
- _out += 4;
- }
+ for (number = 0; number < sse_iters; number++) {
+ a = _mm256_set_ps(
+ (float)(lv_cimag(_in[3])),
+ (float)(lv_creal(_in[3])),
+ (float)(lv_cimag(_in[2])),
+ (float)(lv_creal(_in[2])),
+ (float)(lv_cimag(_in[1])),
+ (float)(lv_creal(_in[1])),
+ (float)(lv_cimag(_in[0])),
+ (float)(lv_creal(
+ _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+ _mm256_storeu_ps((float*)_out, a);
+ _in += 4;
+ _out += 4;
+ }
_mm256_zeroupper();
- for (i = 0; i < (num_points % 4); ++i)
- {
- *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
- _in++;
- }
+ for (i = 0; i < (num_points % 4); ++i) {
+ *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
+ _in++;
+ }
}
#endif /* LV_HAVE_AVX */
#endif /* INCLUDED_volk_32fc_convert_16ic_u_H */
-
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_16ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
- * \endcode
+ * void volk_16ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t*
+ * complexVector, unsigned int num_points) \endcode
*
* \b Inputs
* \li complexVector: The complex input vector.
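 *
 * \b Example (editorial sketch, not from the original documentation; the volk_malloc
 * helpers and fill values are assumptions). The interleaved 16-bit I/Q stream is split
 * into separate I and Q buffers, as the generic protokernel shows:
 * \code
 *   unsigned int num_points = 512;
 *   size_t alignment = volk_get_alignment();
 *   lv_16sc_t* cplx = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t) * num_points, alignment);
 *   int16_t* i_out = (int16_t*)volk_malloc(sizeof(int16_t) * num_points, alignment);
 *   int16_t* q_out = (int16_t*)volk_malloc(sizeof(int16_t) * num_points, alignment);
 *   int16_t* iq = (int16_t*)cplx;               // interleaved I/Q view
 *   for (unsigned int i = 0; i < num_points; i++) {
 *       iq[2 * i] = (int16_t)i;                 // I
 *       iq[2 * i + 1] = (int16_t)(i + 1);       // Q
 *   }
 *   volk_16ic_deinterleave_16i_x2(i_out, q_out, cplx, num_points);
 *   // i_out[i] == iq[2 * i], q_out[i] == iq[2 * i + 1]
 *   volk_free(cplx); volk_free(i_out); volk_free(q_out);
 * \endcode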
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_16ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer,
+ int16_t* qBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const int8_t* complexVectorPtr = (int8_t*)complexVector;
- int16_t* iBufferPtr = iBuffer;
- int16_t* qBufferPtr = qBuffer;
-
- __m256i MoveMask = _mm256_set_epi8(15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0, 15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0);
-
- __m256i iMove2, iMove1;
- __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;
-
- unsigned int sixteenthPoints = num_points / 16;
-
- for(number = 0; number < sixteenthPoints; number++){
- complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
- complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
-
- iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
- iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);
-
- iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1,0x08),_mm256_permute4x64_epi64(iMove2,0x80),0x30);
- qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1,0x0d),_mm256_permute4x64_epi64(iMove2,0xd0),0x30);
-
- _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
- _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
-
- iBufferPtr += 16;
- qBufferPtr += 16;
- }
-
- number = sixteenthPoints * 16;
- int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
- for(; number < num_points; number++){
- *iBufferPtr++ = *int16ComplexVectorPtr++;
- *qBufferPtr++ = *int16ComplexVectorPtr++;
- }
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+
+ __m256i MoveMask = _mm256_set_epi8(15,
+ 14,
+ 11,
+ 10,
+ 7,
+ 6,
+ 3,
+ 2,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 15,
+ 14,
+ 11,
+ 10,
+ 7,
+ 6,
+ 3,
+ 2,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0);
+
+ __m256i iMove2, iMove1;
+ __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;
+
+ unsigned int sixteenthPoints = num_points / 16;
+
+ for (number = 0; number < sixteenthPoints; number++) {
+ complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+
+ iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
+ iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);
+
+ iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08),
+ _mm256_permute4x64_epi64(iMove2, 0x80),
+ 0x30);
+ qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d),
+ _mm256_permute4x64_epi64(iMove2, 0xd0),
+ 0x30);
+
+ _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
+ _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
+
+ iBufferPtr += 16;
+ qBufferPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *int16ComplexVectorPtr++;
+ *qBufferPtr++ = *int16ComplexVectorPtr++;
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSSE3
#include <tmmintrin.h>
-static inline void
-volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t* iBuffer,
+ int16_t* qBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const int8_t* complexVectorPtr = (int8_t*)complexVector;
- int16_t* iBufferPtr = iBuffer;
- int16_t* qBufferPtr = qBuffer;
-
- __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
- __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-
- __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2);
- __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-
- __m128i complexVal1, complexVal2, iOutputVal, qOutputVal;
-
- unsigned int eighthPoints = num_points / 8;
-
- for(number = 0; number < eighthPoints; number++){
- complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
- complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
-
- iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2));
- qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2));
-
- _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
- _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
-
- iBufferPtr += 8;
- qBufferPtr += 8;
- }
-
- number = eighthPoints * 8;
- int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
- for(; number < num_points; number++){
- *iBufferPtr++ = *int16ComplexVectorPtr++;
- *qBufferPtr++ = *int16ComplexVectorPtr++;
- }
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+
+ __m128i iMoveMask1 = _mm_set_epi8(
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+ __m128i iMoveMask2 = _mm_set_epi8(
+ 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+ __m128i qMoveMask1 = _mm_set_epi8(
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2);
+ __m128i qMoveMask2 = _mm_set_epi8(
+ 15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+ __m128i complexVal1, complexVal2, iOutputVal, qOutputVal;
+
+ unsigned int eighthPoints = num_points / 8;
+
+ for (number = 0; number < eighthPoints; number++) {
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 16;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 16;
+
+ iOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, iMoveMask1),
+ _mm_shuffle_epi8(complexVal2, iMoveMask2));
+ qOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, qMoveMask1),
+ _mm_shuffle_epi8(complexVal2, qMoveMask2));
+
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+ _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *int16ComplexVectorPtr++;
+ *qBufferPtr++ = *int16ComplexVectorPtr++;
+ }
}
#endif /* LV_HAVE_SSSE3 */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void
-volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer,
+ int16_t* qBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const int16_t* complexVectorPtr = (int16_t*)complexVector;
- int16_t* iBufferPtr = iBuffer;
- int16_t* qBufferPtr = qBuffer;
- __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal;
- __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
- __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+ __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1,
+ qComplexVal2, iOutputVal, qOutputVal;
+ __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
+ __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
- unsigned int eighthPoints = num_points / 8;
+ unsigned int eighthPoints = num_points / 8;
- for(number = 0; number < eighthPoints; number++){
- complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
- complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+ for (number = 0; number < eighthPoints; number++) {
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 8;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 8;
- iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+ iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
- iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
+ iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
- iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
+ iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
- iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+ iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
- iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0));
+ iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3, 1, 2, 0));
- iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1));
+ iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
- iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask));
+ iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask),
+ _mm_and_si128(iComplexVal2, highMask));
- _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
- qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1));
+ qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2, 0, 3, 1));
- qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1));
+ qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2, 0, 3, 1));
- qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0));
+ qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
- qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1));
+ qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2, 0, 3, 1));
- qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
+ qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
- qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
+ qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
- qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask));
+ qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask),
+ _mm_and_si128(qComplexVal2, highMask));
- _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+ _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
- iBufferPtr += 8;
- qBufferPtr += 8;
- }
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- *iBufferPtr++ = *complexVectorPtr++;
- *qBufferPtr++ = *complexVectorPtr++;
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_16ic_deinterleave_16i_x2_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_16i_x2_generic(int16_t* iBuffer,
+ int16_t* qBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
{
- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
- int16_t* iBufferPtr = iBuffer;
- int16_t* qBufferPtr = qBuffer;
- unsigned int number;
- for(number = 0; number < num_points; number++){
- *iBufferPtr++ = *complexVectorPtr++;
- *qBufferPtr++ = *complexVectorPtr++;
- }
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+ unsigned int number;
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_ORC
-extern void
-volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
-static inline void
-volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+extern void volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer,
+ int16_t* qBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points);
+static inline void volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer,
+ int16_t* qBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
{
- volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, qBuffer, complexVector, num_points);
+ volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, qBuffer, complexVector, num_points);
}
#endif /* LV_HAVE_ORC */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_16ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer,
+ int16_t* qBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const int8_t* complexVectorPtr = (int8_t*)complexVector;
- int16_t* iBufferPtr = iBuffer;
- int16_t* qBufferPtr = qBuffer;
-
- __m256i MoveMask = _mm256_set_epi8(15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0, 15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0);
-
- __m256i iMove2, iMove1;
- __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;
-
- unsigned int sixteenthPoints = num_points / 16;
-
- for(number = 0; number < sixteenthPoints; number++){
- complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
- complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
-
- iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
- iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);
-
- iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1,0x08),_mm256_permute4x64_epi64(iMove2,0x80),0x30);
- qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1,0x0d),_mm256_permute4x64_epi64(iMove2,0xd0),0x30);
-
- _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
- _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal);
-
- iBufferPtr += 16;
- qBufferPtr += 16;
- }
-
- number = sixteenthPoints * 16;
- int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
- for(; number < num_points; number++){
- *iBufferPtr++ = *int16ComplexVectorPtr++;
- *qBufferPtr++ = *int16ComplexVectorPtr++;
- }
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+
+ __m256i MoveMask = _mm256_set_epi8(15,
+ 14,
+ 11,
+ 10,
+ 7,
+ 6,
+ 3,
+ 2,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 15,
+ 14,
+ 11,
+ 10,
+ 7,
+ 6,
+ 3,
+ 2,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0);
+
+ __m256i iMove2, iMove1;
+ __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;
+
+ unsigned int sixteenthPoints = num_points / 16;
+
+ for (number = 0; number < sixteenthPoints; number++) {
+ complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+
+ iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
+ iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);
+
+ iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08),
+ _mm256_permute4x64_epi64(iMove2, 0x80),
+ 0x30);
+ qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d),
+ _mm256_permute4x64_epi64(iMove2, 0xd0),
+ 0x30);
+
+ _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
+ _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal);
+
+ iBufferPtr += 16;
+ qBufferPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *int16ComplexVectorPtr++;
+ *qBufferPtr++ = *int16ComplexVectorPtr++;
+ }
}
#endif /* LV_HAVE_AVX2 */
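/*
 * Editorial sketch, not part of this patch: a minimal, hypothetical example of
 * driving the deinterleaver above through its public dispatcher with
 * VOLK-aligned buffers. The buffer size and fill pattern are arbitrary;
 * volk_malloc, volk_free, volk_get_alignment and volk_16ic_deinterleave_16i_x2
 * are the entry points assumed to be provided by the VOLK headers.
 */
#include <volk/volk.h>
#include <volk/volk_malloc.h>

static void example_16ic_deinterleave_16i_x2(void)
{
    const unsigned int num_points = 1024;
    const size_t alignment = volk_get_alignment();

    lv_16sc_t* in = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
    int16_t* i_out = (int16_t*)volk_malloc(num_points * sizeof(int16_t), alignment);
    int16_t* q_out = (int16_t*)volk_malloc(num_points * sizeof(int16_t), alignment);

    // Fill the interleaved I/Q input with a simple ramp.
    int16_t* raw = (int16_t*)in;
    for (unsigned int n = 0; n < num_points; n++) {
        raw[2 * n] = (int16_t)n;             // I (real) component
        raw[2 * n + 1] = (int16_t)(-(int)n); // Q (imaginary) component
    }

    // The dispatcher picks the fastest implementation available at runtime.
    volk_16ic_deinterleave_16i_x2(i_out, q_out, in, num_points);

    volk_free(in);
    volk_free(i_out);
    volk_free(q_out);
}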
*
* \b Overview
*
- * Deinterleaves the complex 16 bit vector and returns the real (inphase) part of the signal.
+ * Deinterleaves the complex 16-bit vector and returns the real (inphase) part of the
+ * signal.
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_16ic_deinterleave_real_16i(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
- * \endcode
+ * void volk_16ic_deinterleave_real_16i(int16_t* iBuffer, const lv_16sc_t* complexVector,
+ *                                      unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li complexVector: The complex input vector.
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_16ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const int16_t* complexVectorPtr = (int16_t*)complexVector;
- int16_t* iBufferPtr = iBuffer;
-
- __m256i iMoveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
- __m256i iMoveMask2 = _mm256_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-
- __m256i complexVal1, complexVal2, iOutputVal;
-
- unsigned int sixteenthPoints = num_points / 16;
-
- for(number = 0; number < sixteenthPoints; number++){
- complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 16;
- complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 16;
-
- complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
- complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
-
- iOutputVal = _mm256_or_si256(complexVal1, complexVal2);
- iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
-
- _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
-
- iBufferPtr += 16;
- }
-
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- *iBufferPtr++ = *complexVectorPtr++;
- complexVectorPtr++;
- }
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+
+ __m256i iMoveMask1 = _mm256_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0);
+ __m256i iMoveMask2 = _mm256_set_epi8(13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80);
+
+ __m256i complexVal1, complexVal2, iOutputVal;
+
+ unsigned int sixteenthPoints = num_points / 16;
+
+ for (number = 0; number < sixteenthPoints; number++) {
+ complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 16;
+ complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 16;
+
+ complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
+ complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
+
+ iOutputVal = _mm256_or_si256(complexVal1, complexVal2);
+ iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
+
+ _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
+
+ iBufferPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSSE3
#include <tmmintrin.h>
-static inline void
-volk_16ic_deinterleave_real_16i_a_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_real_16i_a_ssse3(int16_t* iBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const int16_t* complexVectorPtr = (int16_t*)complexVector;
- int16_t* iBufferPtr = iBuffer;
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
- __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
- __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+ __m128i iMoveMask1 = _mm_set_epi8(
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+ __m128i iMoveMask2 = _mm_set_epi8(
+ 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
- __m128i complexVal1, complexVal2, iOutputVal;
+ __m128i complexVal1, complexVal2, iOutputVal;
- unsigned int eighthPoints = num_points / 8;
+ unsigned int eighthPoints = num_points / 8;
- for(number = 0; number < eighthPoints; number++){
- complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
- complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+ for (number = 0; number < eighthPoints; number++) {
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 8;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 8;
- complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
- complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
+ complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
+ complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
- iOutputVal = _mm_or_si128(complexVal1, complexVal2);
+ iOutputVal = _mm_or_si128(complexVal1, complexVal2);
- _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
- iBufferPtr += 8;
- }
+ iBufferPtr += 8;
+ }
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- *iBufferPtr++ = *complexVectorPtr++;
- complexVectorPtr++;
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_SSSE3 */
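/*
 * Editorial note on the SSSE3 path above, not part of this patch: each
 * lv_16sc_t occupies four bytes with the real part in the first two, so the
 * first byte-shuffle (pshufb) mask gathers source bytes 0,1,4,5,8,9,12,13
 * (the four real int16 values of one 128-bit load) into the low half of the
 * register and zeroes the high half via the 0x80 entries. The second mask
 * mirrors this into the high half for the next load, so a single OR leaves
 * eight consecutive real samples ready to store. The scalar equivalent of one
 * vector iteration is simply:
 *
 *     for (int k = 0; k < 8; k++)
 *         iBufferPtr[k] = complexVectorPtr[2 * k]; // keep I, skip Q
 *     iBufferPtr += 8;
 *     complexVectorPtr += 16;
 */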
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void
-volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const int16_t* complexVectorPtr = (int16_t*)complexVector;
- int16_t* iBufferPtr = iBuffer;
- __m128i complexVal1, complexVal2, iOutputVal;
- __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
- __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ __m128i complexVal1, complexVal2, iOutputVal;
+ __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
+ __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
- unsigned int eighthPoints = num_points / 8;
+ unsigned int eighthPoints = num_points / 8;
- for(number = 0; number < eighthPoints; number++){
- complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
- complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+ for (number = 0; number < eighthPoints; number++) {
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 8;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 8;
- complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+ complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
- complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+ complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
- complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0));
+ complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
- complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+ complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
- complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+ complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
- complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1));
+ complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2, 0, 3, 1));
- iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask));
+ iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask),
+ _mm_and_si128(complexVal2, highMask));
- _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
- iBufferPtr += 8;
- }
+ iBufferPtr += 8;
+ }
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- *iBufferPtr++ = *complexVectorPtr++;
- complexVectorPtr++;
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_16ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_real_16i_generic(int16_t* iBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const int16_t* complexVectorPtr = (int16_t*)complexVector;
- int16_t* iBufferPtr = iBuffer;
- for(number = 0; number < num_points; number++){
- *iBufferPtr++ = *complexVectorPtr++;
- complexVectorPtr++;
- }
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_16ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const int16_t* complexVectorPtr = (int16_t*)complexVector;
- int16_t* iBufferPtr = iBuffer;
-
- __m256i iMoveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
- __m256i iMoveMask2 = _mm256_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-
- __m256i complexVal1, complexVal2, iOutputVal;
-
- unsigned int sixteenthPoints = num_points / 16;
-
- for(number = 0; number < sixteenthPoints; number++){
- complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 16;
- complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 16;
-
- complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
- complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
-
- iOutputVal = _mm256_or_si256(complexVal1, complexVal2);
- iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
-
- _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
-
- iBufferPtr += 16;
- }
-
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- *iBufferPtr++ = *complexVectorPtr++;
- complexVectorPtr++;
- }
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+
+ __m256i iMoveMask1 = _mm256_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0);
+ __m256i iMoveMask2 = _mm256_set_epi8(13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80);
+
+ __m256i complexVal1, complexVal2, iOutputVal;
+
+ unsigned int sixteenthPoints = num_points / 16;
+
+ for (number = 0; number < sixteenthPoints; number++) {
+ complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 16;
+ complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 16;
+
+ complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
+ complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
+
+ iOutputVal = _mm256_or_si256(complexVal1, complexVal2);
+ iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
+
+ _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
+
+ iBufferPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_AVX2 */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_16ic_deinterleave_real_8i(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
- * \endcode
+ * void volk_16ic_deinterleave_real_8i(int8_t* iBuffer, const lv_16sc_t* complexVector,
+ *                                     unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li complexVector: The complex input vector.
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_16ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const int8_t* complexVectorPtr = (int8_t*)complexVector;
- int8_t* iBufferPtr = iBuffer;
- __m256i iMoveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
- __m256i iMoveMask2 = _mm256_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
- __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
-
- unsigned int thirtysecondPoints = num_points / 32;
-
- for(number = 0; number < thirtysecondPoints; number++){
- complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
- complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
-
- complexVal3 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
- complexVal4 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
-
- complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
- complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
-
- complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
- complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
-
- complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
- complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
-
- complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
- complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
-
- complexVal1 = _mm256_srai_epi16(complexVal1, 8);
- complexVal3 = _mm256_srai_epi16(complexVal3, 8);
-
- iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
- iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
-
- _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
-
- iBufferPtr += 32;
- }
-
- number = thirtysecondPoints * 32;
- int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
- for(; number < num_points; number++){
- *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
- int16ComplexVectorPtr++;
- }
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ __m256i iMoveMask1 = _mm256_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0);
+ __m256i iMoveMask2 = _mm256_set_epi8(13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80);
+ __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
+
+ unsigned int thirtysecondPoints = num_points / 32;
+
+ for (number = 0; number < thirtysecondPoints; number++) {
+ complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+
+ complexVal3 = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ complexVal4 = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+
+ complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
+ complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
+
+ complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
+ complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
+
+ complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
+ complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
+
+ complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
+ complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
+
+ complexVal1 = _mm256_srai_epi16(complexVal1, 8);
+ complexVal3 = _mm256_srai_epi16(complexVal3, 8);
+
+ iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
+ iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
+
+ _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
+
+ iBufferPtr += 32;
+ }
+
+ number = thirtysecondPoints * 32;
+ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
+ int16ComplexVectorPtr++;
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSSE3
#include <tmmintrin.h>
-static inline void
-volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const int8_t* complexVectorPtr = (int8_t*)complexVector;
- int8_t* iBufferPtr = iBuffer;
- __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
- __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
- __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ __m128i iMoveMask1 = _mm_set_epi8(
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+ __m128i iMoveMask2 = _mm_set_epi8(
+ 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+ __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
- unsigned int sixteenthPoints = num_points / 16;
+ unsigned int sixteenthPoints = num_points / 16;
- for(number = 0; number < sixteenthPoints; number++){
- complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
- complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+ for (number = 0; number < sixteenthPoints; number++) {
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 16;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 16;
- complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
- complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+ complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 16;
+ complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 16;
- complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
- complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
+ complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
+ complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
- complexVal1 = _mm_or_si128(complexVal1, complexVal2);
+ complexVal1 = _mm_or_si128(complexVal1, complexVal2);
- complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1);
- complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2);
+ complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1);
+ complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2);
- complexVal3 = _mm_or_si128(complexVal3, complexVal4);
+ complexVal3 = _mm_or_si128(complexVal3, complexVal4);
- complexVal1 = _mm_srai_epi16(complexVal1, 8);
- complexVal3 = _mm_srai_epi16(complexVal3, 8);
+ complexVal1 = _mm_srai_epi16(complexVal1, 8);
+ complexVal3 = _mm_srai_epi16(complexVal3, 8);
- iOutputVal = _mm_packs_epi16(complexVal1, complexVal3);
+ iOutputVal = _mm_packs_epi16(complexVal1, complexVal3);
- _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
- iBufferPtr += 16;
- }
+ iBufferPtr += 16;
+ }
- number = sixteenthPoints * 16;
- int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
- for(; number < num_points; number++){
- *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
- int16ComplexVectorPtr++;
- }
+ number = sixteenthPoints * 16;
+ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
+ int16ComplexVectorPtr++;
+ }
}
#endif /* LV_HAVE_SSSE3 */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_16ic_deinterleave_real_8i_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_real_8i_generic(int8_t* iBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- int16_t* complexVectorPtr = (int16_t*)complexVector;
- int8_t* iBufferPtr = iBuffer;
- for(number = 0; number < num_points; number++){
- *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
- complexVectorPtr++;
- }
+ unsigned int number = 0;
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_GENERIC */
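/*
 * Editorial worked example, not part of this patch: the generic kernel keeps
 * only the most significant byte of each real sample via the arithmetic
 * shift, i.e. it scales by 1/256 and rounds toward negative infinity on the
 * usual two's-complement targets. A real sample of 4660 (0x1234) becomes
 * 18 (0x12), and -4660 becomes -19. The NEON path below achieves the same
 * result with vshrn_n_s16(..., 8), a shift-right-and-narrow by 8 bits.
 */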
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_16ic_deinterleave_real_8i_neon(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_real_8i_neon(int8_t* iBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
{
- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
- int8_t* iBufferPtr = iBuffer;
- unsigned int eighth_points = num_points / 8;
- unsigned int number;
-
- int16x8x2_t complexInput;
- int8x8_t realOutput;
- for(number = 0; number < eighth_points; number++){
- complexInput = vld2q_s16(complexVectorPtr);
- realOutput = vshrn_n_s16(complexInput.val[0], 8);
- vst1_s8(iBufferPtr, realOutput);
- complexVectorPtr += 16;
- iBufferPtr += 8;
- }
-
- for(number = eighth_points*8; number < num_points; number++){
- *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
- complexVectorPtr++;
- }
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ unsigned int eighth_points = num_points / 8;
+ unsigned int number;
+
+ int16x8x2_t complexInput;
+ int8x8_t realOutput;
+ for (number = 0; number < eighth_points; number++) {
+ complexInput = vld2q_s16(complexVectorPtr);
+ realOutput = vshrn_n_s16(complexInput.val[0], 8);
+ vst1_s8(iBufferPtr, realOutput);
+ complexVectorPtr += 16;
+ iBufferPtr += 8;
+ }
+
+ for (number = eighth_points * 8; number < num_points; number++) {
+ *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
+ complexVectorPtr++;
+ }
}
#endif
#ifdef LV_HAVE_ORC
-extern void
-volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
+extern void volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points);
-static inline void
-volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
{
volk_16ic_deinterleave_real_8i_a_orc_impl(iBuffer, complexVector, num_points);
}
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_16ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const int8_t* complexVectorPtr = (int8_t*)complexVector;
- int8_t* iBufferPtr = iBuffer;
- __m256i iMoveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
- __m256i iMoveMask2 = _mm256_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
- __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
-
- unsigned int thirtysecondPoints = num_points / 32;
-
- for(number = 0; number < thirtysecondPoints; number++){
- complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
- complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
-
- complexVal3 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
- complexVal4 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
-
- complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
- complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
-
- complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
- complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
-
- complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
- complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
-
- complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
- complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
-
- complexVal1 = _mm256_srai_epi16(complexVal1, 8);
- complexVal3 = _mm256_srai_epi16(complexVal3, 8);
-
- iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
- iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
-
- _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
-
- iBufferPtr += 32;
- }
-
- number = thirtysecondPoints * 32;
- int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
- for(; number < num_points; number++){
- *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
- int16ComplexVectorPtr++;
- }
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ __m256i iMoveMask1 = _mm256_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0);
+ __m256i iMoveMask2 = _mm256_set_epi8(13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80);
+ __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
+
+ unsigned int thirtysecondPoints = num_points / 32;
+
+ for (number = 0; number < thirtysecondPoints; number++) {
+ complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+
+ complexVal3 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ complexVal4 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+
+ complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
+ complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
+
+ complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
+ complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
+
+ complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
+ complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
+
+ complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
+ complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
+
+ complexVal1 = _mm256_srai_epi16(complexVal1, 8);
+ complexVal3 = _mm256_srai_epi16(complexVal3, 8);
+
+ iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
+ iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
+
+ _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
+
+ iBufferPtr += 32;
+ }
+
+ number = thirtysecondPoints * 32;
+ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
+ int16ComplexVectorPtr++;
+ }
}
#endif /* LV_HAVE_AVX2 */
#endif /* INCLUDED_volk_16ic_deinterleave_real_8i_u_H */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_16ic_magnitude_16i(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
- * \endcode
+ * void volk_16ic_magnitude_16i(int16_t* magnitudeVector, const lv_16sc_t* complexVector,
+ *                              unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li complexVector: The complex input vector.
#ifndef INCLUDED_volk_16ic_magnitude_16i_a_H
#define INCLUDED_volk_16ic_magnitude_16i_a_H
-#include <volk/volk_common.h>
#include <inttypes.h>
-#include <stdio.h>
-#include <math.h>
#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_16ic_magnitude_16i_a_avx2(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_magnitude_16i_a_avx2(int16_t* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
- int16_t* magnitudeVectorPtr = magnitudeVector;
-
- __m256 vScalar = _mm256_set1_ps(SHRT_MAX);
- __m256 invScalar = _mm256_set1_ps(1.0f/SHRT_MAX);
- __m256i int1, int2;
- __m128i short1, short2;
- __m256 cplxValue1, cplxValue2, result;
- __m256i idx = _mm256_set_epi32(0,0,0,0,5,1,4,0);
-
- for(;number < eighthPoints; number++){
-
- int1 = _mm256_load_si256((__m256i*)complexVectorPtr);
- complexVectorPtr += 16;
- short1 = _mm256_extracti128_si256(int1,0);
- short2 = _mm256_extracti128_si256(int1,1);
-
- int1 = _mm256_cvtepi16_epi32(short1);
- int2 = _mm256_cvtepi16_epi32(short2);
- cplxValue1 = _mm256_cvtepi32_ps(int1);
- cplxValue2 = _mm256_cvtepi32_ps(int2);
-
- cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
- cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
-
- cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
- cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
-
- result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
-
- result = _mm256_sqrt_ps(result); // Square root the values
-
- result = _mm256_mul_ps(result, vScalar); // Scale the results
-
- int1 = _mm256_cvtps_epi32(result);
- int1 = _mm256_packs_epi32(int1, int1);
- int1 = _mm256_permutevar8x32_epi32(int1, idx); //permute to compensate for shuffling in hadd and packs
- short1 = _mm256_extracti128_si256(int1, 0);
- _mm_store_si128((__m128i*)magnitudeVectorPtr,short1);
- magnitudeVectorPtr += 8;
- }
-
- number = eighthPoints * 8;
- magnitudeVectorPtr = &magnitudeVector[number];
- complexVectorPtr = (const int16_t*)&complexVector[number];
- for(; number < num_points; number++){
- const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
- const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
- const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
- *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
- }
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
+
+ __m256 vScalar = _mm256_set1_ps(SHRT_MAX);
+ __m256 invScalar = _mm256_set1_ps(1.0f / SHRT_MAX);
+ __m256i int1, int2;
+ __m128i short1, short2;
+ __m256 cplxValue1, cplxValue2, result;
+ __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
+
+ for (; number < eighthPoints; number++) {
+
+ int1 = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 16;
+ short1 = _mm256_extracti128_si256(int1, 0);
+ short2 = _mm256_extracti128_si256(int1, 1);
+
+ int1 = _mm256_cvtepi16_epi32(short1);
+ int2 = _mm256_cvtepi16_epi32(short2);
+ cplxValue1 = _mm256_cvtepi32_ps(int1);
+ cplxValue2 = _mm256_cvtepi32_ps(int2);
+
+ cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
+
+ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+ result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+ result = _mm256_sqrt_ps(result); // Square root the values
+
+ result = _mm256_mul_ps(result, vScalar); // Scale the results
+
+ int1 = _mm256_cvtps_epi32(result);
+ int1 = _mm256_packs_epi32(int1, int1);
+ int1 = _mm256_permutevar8x32_epi32(
+ int1, idx); // permute to compensate for shuffling in hadd and packs
+ short1 = _mm256_extracti128_si256(int1, 0);
+ _mm_store_si128((__m128i*)magnitudeVectorPtr, short1);
+ magnitudeVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ magnitudeVectorPtr = &magnitudeVector[number];
+ complexVectorPtr = (const int16_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
+ const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
+ const float val1Result =
+ sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
+ *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
+ }
}
#endif /* LV_HAVE_AVX2 */
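/*
 * Editorial note on the AVX2 path above, not part of this patch: because
 * _mm256_hadd_ps and _mm256_packs_epi32 both work within 128-bit lanes, the
 * eight magnitudes sit in the register as m0,m1,m4,m5 in the low lane and
 * m2,m3,m6,m7 in the high lane, and the pack duplicates each lane's four
 * 16-bit values. The final _mm256_permutevar8x32_epi32 with indices 0,4,1,5
 * in its low elements re-gathers the 32-bit pairs (m0,m1),(m2,m3),(m4,m5),
 * (m6,m7), so the low 128 bits hold m0..m7 in sample order before the store.
 */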
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
-static inline void
-volk_16ic_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_magnitude_16i_a_sse3(int16_t* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
- int16_t* magnitudeVectorPtr = magnitudeVector;
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
- __m128 vScalar = _mm_set_ps1(SHRT_MAX);
- __m128 invScalar = _mm_set_ps1(1.0f/SHRT_MAX);
+ __m128 vScalar = _mm_set_ps1(SHRT_MAX);
+ __m128 invScalar = _mm_set_ps1(1.0f / SHRT_MAX);
- __m128 cplxValue1, cplxValue2, result;
+ __m128 cplxValue1, cplxValue2, result;
- __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
+ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
- for(;number < quarterPoints; number++){
+ for (; number < quarterPoints; number++) {
- inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
- inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
- inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
- inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
- inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
- inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
- inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
- inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+ inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+ inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+ inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+ inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
- cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
- cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+ cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+ cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
- complexVectorPtr += 8;
+ complexVectorPtr += 8;
- cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
- cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
- cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
- cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
- result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
- result = _mm_sqrt_ps(result); // Square root the values
+ result = _mm_sqrt_ps(result); // Square root the values
- result = _mm_mul_ps(result, vScalar); // Scale the results
+ result = _mm_mul_ps(result, vScalar); // Scale the results
- _mm_store_ps(outputFloatBuffer, result);
- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
- }
+ _mm_store_ps(outputFloatBuffer, result);
+ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
+ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
+ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
+ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
+ }
- number = quarterPoints * 4;
- magnitudeVectorPtr = &magnitudeVector[number];
- complexVectorPtr = (const int16_t*)&complexVector[number];
- for(; number < num_points; number++){
- const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
- const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
- const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
- *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
- }
+ number = quarterPoints * 4;
+ magnitudeVectorPtr = &magnitudeVector[number];
+ complexVectorPtr = (const int16_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
+ const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
+ const float val1Result =
+ sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
+ *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
+ }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_16ic_magnitude_16i_a_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_magnitude_16i_a_sse(int16_t* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
- int16_t* magnitudeVectorPtr = magnitudeVector;
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
- __m128 vScalar = _mm_set_ps1(SHRT_MAX);
- __m128 invScalar = _mm_set_ps1(1.0f/SHRT_MAX);
+ __m128 vScalar = _mm_set_ps1(SHRT_MAX);
+ __m128 invScalar = _mm_set_ps1(1.0f / SHRT_MAX);
- __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+ __m128 cplxValue1, cplxValue2, iValue, qValue, result;
- __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[4];
- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
- for(;number < quarterPoints; number++){
+ for (; number < quarterPoints; number++) {
- inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
- inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
- inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
- inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
- cplxValue1 = _mm_load_ps(inputFloatBuffer);
- complexVectorPtr += 4;
+ cplxValue1 = _mm_load_ps(inputFloatBuffer);
+ complexVectorPtr += 4;
- inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
- inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
- inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
- inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
- cplxValue2 = _mm_load_ps(inputFloatBuffer);
- complexVectorPtr += 4;
+ cplxValue2 = _mm_load_ps(inputFloatBuffer);
+ complexVectorPtr += 4;
- cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
- cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
- // Arrange in i1i2i3i4 format
- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
- // Arrange in q1q2q3q4 format
- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+ // Arrange in q1q2q3q4 format
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
- iValue = _mm_mul_ps(iValue, iValue); // Square the I values
- qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+ iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+ qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
- result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+ result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
- result = _mm_sqrt_ps(result); // Square root the values
+ result = _mm_sqrt_ps(result); // Square root the values
- result = _mm_mul_ps(result, vScalar); // Scale the results
+ result = _mm_mul_ps(result, vScalar); // Scale the results
- _mm_store_ps(outputFloatBuffer, result);
- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
- }
+ _mm_store_ps(outputFloatBuffer, result);
+ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
+ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
+ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
+ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
+ }
- number = quarterPoints * 4;
- magnitudeVectorPtr = &magnitudeVector[number];
- complexVectorPtr = (const int16_t*)&complexVector[number];
- for(; number < num_points; number++){
- const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
- const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
- const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
- *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
- }
+ number = quarterPoints * 4;
+ magnitudeVectorPtr = &magnitudeVector[number];
+ complexVectorPtr = (const int16_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
+ const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
+ const float val1Result =
+ sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
+ *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_16ic_magnitude_16i_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_magnitude_16i_generic(int16_t* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
{
- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
- int16_t* magnitudeVectorPtr = magnitudeVector;
- unsigned int number = 0;
- const float scalar = SHRT_MAX;
- for(number = 0; number < num_points; number++){
- float real = ((float)(*complexVectorPtr++)) / scalar;
- float imag = ((float)(*complexVectorPtr++)) / scalar;
- *magnitudeVectorPtr++ = (int16_t)rintf(sqrtf((real*real) + (imag*imag)) * scalar);
- }
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
+ unsigned int number = 0;
+ const float scalar = SHRT_MAX;
+ for (number = 0; number < num_points; number++) {
+ float real = ((float)(*complexVectorPtr++)) / scalar;
+ float imag = ((float)(*complexVectorPtr++)) / scalar;
+ *magnitudeVectorPtr++ =
+ (int16_t)rintf(sqrtf((real * real) + (imag * imag)) * scalar);
+ }
}
#endif /* LV_HAVE_GENERIC */
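/*
 * Editorial worked example, not part of this patch: the division by SHRT_MAX
 * and the later multiplication by SHRT_MAX cancel mathematically; they only
 * keep the intermediate floats in a well-scaled range, so the stored value is
 * effectively rintf(sqrtf((float)I * I + (float)Q * Q)). For (I, Q) =
 * (16384, 16384): 16384 * sqrt(2) ~= 23170.48, giving an output of 23170.
 */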
#ifdef LV_HAVE_ORC_DISABLED
-extern void
-volk_16ic_magnitude_16i_a_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points);
-
-static inline void
-volk_16ic_magnitude_16i_u_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
+extern void volk_16ic_magnitude_16i_a_orc_impl(int16_t* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ float scalar,
+ unsigned int num_points);
+
+static inline void volk_16ic_magnitude_16i_u_orc(int16_t* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
{
- volk_16ic_magnitude_16i_a_orc_impl(magnitudeVector, complexVector, SHRT_MAX, num_points);
+ volk_16ic_magnitude_16i_a_orc_impl(
+ magnitudeVector, complexVector, SHRT_MAX, num_points);
}
#endif /* LV_HAVE_ORC */
#ifndef INCLUDED_volk_16ic_magnitude_16i_u_H
#define INCLUDED_volk_16ic_magnitude_16i_u_H
-#include <volk/volk_common.h>
#include <inttypes.h>
-#include <stdio.h>
#include <math.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_16ic_magnitude_16i_u_avx2(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_magnitude_16i_u_avx2(int16_t* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
- int16_t* magnitudeVectorPtr = magnitudeVector;
-
- __m256 vScalar = _mm256_set1_ps(SHRT_MAX);
- __m256 invScalar = _mm256_set1_ps(1.0f/SHRT_MAX);
- __m256i int1, int2;
- __m128i short1, short2;
- __m256 cplxValue1, cplxValue2, result;
- __m256i idx = _mm256_set_epi32(0,0,0,0,5,1,4,0);
-
- for(;number < eighthPoints; number++){
-
- int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
- complexVectorPtr += 16;
- short1 = _mm256_extracti128_si256(int1,0);
- short2 = _mm256_extracti128_si256(int1,1);
-
- int1 = _mm256_cvtepi16_epi32(short1);
- int2 = _mm256_cvtepi16_epi32(short2);
- cplxValue1 = _mm256_cvtepi32_ps(int1);
- cplxValue2 = _mm256_cvtepi32_ps(int2);
-
- cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
- cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
-
- cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
- cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
-
- result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
-
- result = _mm256_sqrt_ps(result); // Square root the values
-
- result = _mm256_mul_ps(result, vScalar); // Scale the results
-
- int1 = _mm256_cvtps_epi32(result);
- int1 = _mm256_packs_epi32(int1, int1);
- int1 = _mm256_permutevar8x32_epi32(int1, idx); //permute to compensate for shuffling in hadd and packs
- short1 = _mm256_extracti128_si256(int1, 0);
- _mm_storeu_si128((__m128i*)magnitudeVectorPtr,short1);
- magnitudeVectorPtr += 8;
- }
-
- number = eighthPoints * 8;
- magnitudeVectorPtr = &magnitudeVector[number];
- complexVectorPtr = (const int16_t*)&complexVector[number];
- for(; number < num_points; number++){
- const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
- const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
- const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
- *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
- }
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
+
+ __m256 vScalar = _mm256_set1_ps(SHRT_MAX);
+ __m256 invScalar = _mm256_set1_ps(1.0f / SHRT_MAX);
+ __m256i int1, int2;
+ __m128i short1, short2;
+ __m256 cplxValue1, cplxValue2, result;
+ __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
+
+ for (; number < eighthPoints; number++) {
+
+ int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 16;
+ short1 = _mm256_extracti128_si256(int1, 0);
+ short2 = _mm256_extracti128_si256(int1, 1);
+
+ int1 = _mm256_cvtepi16_epi32(short1);
+ int2 = _mm256_cvtepi16_epi32(short2);
+ cplxValue1 = _mm256_cvtepi32_ps(int1);
+ cplxValue2 = _mm256_cvtepi32_ps(int2);
+
+ cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
+
+ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+ result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+ result = _mm256_sqrt_ps(result); // Square root the values
+
+ result = _mm256_mul_ps(result, vScalar); // Scale the results
+
+ int1 = _mm256_cvtps_epi32(result);
+ int1 = _mm256_packs_epi32(int1, int1);
+ int1 = _mm256_permutevar8x32_epi32(
+ int1, idx); // permute to compensate for shuffling in hadd and packs
+ short1 = _mm256_extracti128_si256(int1, 0);
+ _mm_storeu_si128((__m128i*)magnitudeVectorPtr, short1);
+ magnitudeVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ magnitudeVectorPtr = &magnitudeVector[number];
+ complexVectorPtr = (const int16_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
+ const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
+ const float val1Result =
+ sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
+ *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
+ }
}
#endif /* LV_HAVE_AVX2 */
#include <arm_neon.h>
#include <volk/volk_neon_intrinsics.h>
-static inline void
-volk_16ic_magnitude_16i_neonv7(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
+static inline void volk_16ic_magnitude_16i_neonv7(int16_t* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
{
unsigned int number = 0;
unsigned int quarter_points = num_points / 4;
-
+
const float scalar = SHRT_MAX;
const float inv_scalar = 1.0f / scalar;
-
+
int16_t* magnitudeVectorPtr = magnitudeVector;
const lv_16sc_t* complexVectorPtr = complexVector;
-
+
float32x4_t mag_vec;
float32x4x2_t c_vec;
-
- for(number = 0; number < quarter_points; number++) {
+
+ for (number = 0; number < quarter_points; number++) {
const int16x4x2_t c16_vec = vld2_s16((int16_t*)complexVectorPtr);
- __VOLK_PREFETCH(complexVectorPtr+4);
+ __VOLK_PREFETCH(complexVectorPtr + 4);
c_vec.val[0] = vcvtq_f32_s32(vmovl_s16(c16_vec.val[0]));
c_vec.val[1] = vcvtq_f32_s32(vmovl_s16(c16_vec.val[1]));
// Scale to close to 0-1
const int16x4_t mag16_vec = vmovn_s32(vcvtq_s32_f32(mag_vec));
vst1_s16(magnitudeVectorPtr, mag16_vec);
// Advance pointers
- magnitudeVectorPtr+=4;
- complexVectorPtr+=4;
+ magnitudeVectorPtr += 4;
+ complexVectorPtr += 4;
}
-
+
// Deal with the rest
- for(number = quarter_points * 4; number < num_points; number++) {
+ for (number = quarter_points * 4; number < num_points; number++) {
const float real = lv_creal(*complexVectorPtr) * inv_scalar;
const float imag = lv_cimag(*complexVectorPtr) * inv_scalar;
- *magnitudeVectorPtr = (int16_t)rintf(sqrtf((real*real) + (imag*imag)) * scalar);
+ *magnitudeVectorPtr =
+ (int16_t)rintf(sqrtf((real * real) + (imag * imag)) * scalar);
complexVectorPtr++;
magnitudeVectorPtr++;
}
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_16ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
- * \endcode
+ * void volk_16ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer,
+ *                                         const lv_16sc_t* complexVector,
+ *                                         const float scalar, unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li complexVector: The complex input vector of 16-bit shorts.
#ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H
#define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H
-#include <volk/volk_common.h>
#include <inttypes.h>
#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline
-void volk_16ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
- const float scalar, unsigned int num_points)
+static inline void
+volk_16ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer,
+ float* qBuffer,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- float* iBufferPtr = iBuffer;
- float* qBufferPtr = qBuffer;
-
- uint64_t number = 0;
- const uint64_t eighthPoints = num_points / 8;
- __m256 cplxValue1, cplxValue2, iValue, qValue;
- __m256i cplxValueA, cplxValueB;
- __m128i cplxValue128;
-
- __m256 invScalar = _mm256_set1_ps(1.0/scalar);
- int16_t* complexVectorPtr = (int16_t*)complexVector;
- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
-
- for(;number < eighthPoints; number++){
-
- cplxValueA = _mm256_load_si256((__m256i*) complexVectorPtr);
- complexVectorPtr += 16;
-
- //cvt
- cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0);
- cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
- cplxValue1 = _mm256_cvtepi32_ps(cplxValueB);
- cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1);
- cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
- cplxValue2 = _mm256_cvtepi32_ps(cplxValueB);
-
- cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
- cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
-
- // Arrange in i1i2i3i4 format
- iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
- iValue = _mm256_permutevar8x32_ps(iValue,idx);
- // Arrange in q1q2q3q4 format
- qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
- qValue = _mm256_permutevar8x32_ps(qValue,idx);
-
- _mm256_store_ps(iBufferPtr, iValue);
- _mm256_store_ps(qBufferPtr, qValue);
-
- iBufferPtr += 8;
- qBufferPtr += 8;
- }
-
- number = eighthPoints * 8;
- complexVectorPtr = (int16_t*)&complexVector[number];
- for(; number < num_points; number++){
- *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
- *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
- }
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+
+ uint64_t number = 0;
+ const uint64_t eighthPoints = num_points / 8;
+ __m256 cplxValue1, cplxValue2, iValue, qValue;
+ __m256i cplxValueA, cplxValueB;
+ __m128i cplxValue128;
+
+ __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
+ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ for (; number < eighthPoints; number++) {
+
+ cplxValueA = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 16;
+
+        // cvt: widen int16 -> int32, then convert to float
+ cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0);
+ cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
+ cplxValue1 = _mm256_cvtepi32_ps(cplxValueB);
+ cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1);
+ cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
+ cplxValue2 = _mm256_cvtepi32_ps(cplxValueB);
+
+ cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+ iValue = _mm256_permutevar8x32_ps(iValue, idx);
+ // Arrange in q1q2q3q4 format
+ qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
+ qValue = _mm256_permutevar8x32_ps(qValue, idx);
+
+ _mm256_store_ps(iBufferPtr, iValue);
+ _mm256_store_ps(qBufferPtr, qValue);
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ complexVectorPtr = (int16_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline
-void volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
- const float scalar, unsigned int num_points)
+static inline void
+volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer,
+ float* qBuffer,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- float* iBufferPtr = iBuffer;
- float* qBufferPtr = qBuffer;
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
- uint64_t number = 0;
- const uint64_t quarterPoints = num_points / 4;
- __m128 cplxValue1, cplxValue2, iValue, qValue;
+ uint64_t number = 0;
+ const uint64_t quarterPoints = num_points / 4;
+ __m128 cplxValue1, cplxValue2, iValue, qValue;
- __m128 invScalar = _mm_set_ps1(1.0/scalar);
- int16_t* complexVectorPtr = (int16_t*)complexVector;
+ __m128 invScalar = _mm_set_ps1(1.0 / scalar);
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
- __VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
+ __VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
- for(;number < quarterPoints; number++){
+ for (; number < quarterPoints; number++) {
- floatBuffer[0] = (float)(complexVectorPtr[0]);
- floatBuffer[1] = (float)(complexVectorPtr[1]);
- floatBuffer[2] = (float)(complexVectorPtr[2]);
- floatBuffer[3] = (float)(complexVectorPtr[3]);
+ floatBuffer[0] = (float)(complexVectorPtr[0]);
+ floatBuffer[1] = (float)(complexVectorPtr[1]);
+ floatBuffer[2] = (float)(complexVectorPtr[2]);
+ floatBuffer[3] = (float)(complexVectorPtr[3]);
- floatBuffer[4] = (float)(complexVectorPtr[4]);
- floatBuffer[5] = (float)(complexVectorPtr[5]);
- floatBuffer[6] = (float)(complexVectorPtr[6]);
- floatBuffer[7] = (float)(complexVectorPtr[7]);
+ floatBuffer[4] = (float)(complexVectorPtr[4]);
+ floatBuffer[5] = (float)(complexVectorPtr[5]);
+ floatBuffer[6] = (float)(complexVectorPtr[6]);
+ floatBuffer[7] = (float)(complexVectorPtr[7]);
- cplxValue1 = _mm_load_ps(&floatBuffer[0]);
- cplxValue2 = _mm_load_ps(&floatBuffer[4]);
+ cplxValue1 = _mm_load_ps(&floatBuffer[0]);
+ cplxValue2 = _mm_load_ps(&floatBuffer[4]);
- complexVectorPtr += 8;
+ complexVectorPtr += 8;
- cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
- cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
- // Arrange in i1i2i3i4 format
- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
- // Arrange in q1q2q3q4 format
- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+ // Arrange in q1q2q3q4 format
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
- _mm_store_ps(iBufferPtr, iValue);
- _mm_store_ps(qBufferPtr, qValue);
+ _mm_store_ps(iBufferPtr, iValue);
+ _mm_store_ps(qBufferPtr, qValue);
- iBufferPtr += 4;
- qBufferPtr += 4;
- }
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
- number = quarterPoints * 4;
- complexVectorPtr = (int16_t*)&complexVector[number];
- for(; number < num_points; number++){
- *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
- *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
- }
+ number = quarterPoints * 4;
+ complexVectorPtr = (int16_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
static inline void
-volk_16ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
- const float scalar, unsigned int num_points)
+volk_16ic_s32f_deinterleave_32f_x2_generic(float* iBuffer,
+ float* qBuffer,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
- float* iBufferPtr = iBuffer;
- float* qBufferPtr = qBuffer;
- unsigned int number;
- for(number = 0; number < num_points; number++){
- *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
- *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
- }
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+ unsigned int number;
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ }
}
#endif /* LV_HAVE_GENERIC */
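For orientation, here is a minimal sketch of calling the dispatcher documented above. The buffer size, the 32768.0f full-scale assumption, and the use of volk_malloc/volk_get_alignment for aligned buffers are illustrative choices, not part of this patch.

#include <volk/volk.h>

static void example_deinterleave_x2(void)
{
    const unsigned int num_points = 1024;
    /* Aligned allocations so the _a (aligned) protokernels can be selected. */
    lv_16sc_t* in =
        (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), volk_get_alignment());
    float* i_out =
        (float*)volk_malloc(num_points * sizeof(float), volk_get_alignment());
    float* q_out =
        (float*)volk_malloc(num_points * sizeof(float), volk_get_alignment());

    /* ... fill `in` with interleaved 16-bit I/Q samples ... */

    /* Scale shorts to floats, assuming 32768.0f represents full scale. */
    volk_16ic_s32f_deinterleave_32f_x2(i_out, q_out, in, 32768.f, num_points);

    volk_free(q_out);
    volk_free(i_out);
    volk_free(in);
}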
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_16ic_s32f_deinterleave_32f_x2_neon(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
- const float scalar, unsigned int num_points)
+static inline void volk_16ic_s32f_deinterleave_32f_x2_neon(float* iBuffer,
+ float* qBuffer,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
- float* iBufferPtr = iBuffer;
- float* qBufferPtr = qBuffer;
- unsigned int eighth_points = num_points / 4;
- unsigned int number;
- float iScalar = 1.f/scalar;
- float32x4_t invScalar;
- invScalar = vld1q_dup_f32(&iScalar);
-
- int16x4x2_t complexInput_s16;
- int32x4x2_t complexInput_s32;
- float32x4x2_t complexFloat;
-
- for(number = 0; number < eighth_points; number++){
- complexInput_s16 = vld2_s16(complexVectorPtr);
- complexInput_s32.val[0] = vmovl_s16(complexInput_s16.val[0]);
- complexInput_s32.val[1] = vmovl_s16(complexInput_s16.val[1]);
- complexFloat.val[0] = vcvtq_f32_s32(complexInput_s32.val[0]);
- complexFloat.val[1] = vcvtq_f32_s32(complexInput_s32.val[1]);
- complexFloat.val[0] = vmulq_f32(complexFloat.val[0], invScalar);
- complexFloat.val[1] = vmulq_f32(complexFloat.val[1], invScalar);
- vst1q_f32(iBufferPtr, complexFloat.val[0]);
- vst1q_f32(qBufferPtr, complexFloat.val[1]);
- complexVectorPtr += 8;
- iBufferPtr += 4;
- qBufferPtr += 4;
- }
-
- for(number = eighth_points*4; number < num_points; number++){
- *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
- *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
- }
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+ unsigned int eighth_points = num_points / 4;
+ unsigned int number;
+ float iScalar = 1.f / scalar;
+ float32x4_t invScalar;
+ invScalar = vld1q_dup_f32(&iScalar);
+
+ int16x4x2_t complexInput_s16;
+ int32x4x2_t complexInput_s32;
+ float32x4x2_t complexFloat;
+
+ for (number = 0; number < eighth_points; number++) {
+ complexInput_s16 = vld2_s16(complexVectorPtr);
+ complexInput_s32.val[0] = vmovl_s16(complexInput_s16.val[0]);
+ complexInput_s32.val[1] = vmovl_s16(complexInput_s16.val[1]);
+ complexFloat.val[0] = vcvtq_f32_s32(complexInput_s32.val[0]);
+ complexFloat.val[1] = vcvtq_f32_s32(complexInput_s32.val[1]);
+ complexFloat.val[0] = vmulq_f32(complexFloat.val[0], invScalar);
+ complexFloat.val[1] = vmulq_f32(complexFloat.val[1], invScalar);
+ vst1q_f32(iBufferPtr, complexFloat.val[0]);
+ vst1q_f32(qBufferPtr, complexFloat.val[1]);
+ complexVectorPtr += 8;
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ for (number = eighth_points * 4; number < num_points; number++) {
+ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_ORC
-extern void
-volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
- const float scalar, unsigned int num_points);
+extern void volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(float* iBuffer,
+ float* qBuffer,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points);
static inline void
-volk_16ic_s32f_deinterleave_32f_x2_u_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
- const float scalar, unsigned int num_points)
+volk_16ic_s32f_deinterleave_32f_x2_u_orc(float* iBuffer,
+ float* qBuffer,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points);
+ volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(
+ iBuffer, qBuffer, complexVector, scalar, num_points);
}
#endif /* LV_HAVE_ORC */
#ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H
#define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H
-#include <volk/volk_common.h>
#include <inttypes.h>
#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline
-void volk_16ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
- const float scalar, unsigned int num_points)
+static inline void
+volk_16ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer,
+ float* qBuffer,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- float* iBufferPtr = iBuffer;
- float* qBufferPtr = qBuffer;
-
- uint64_t number = 0;
- const uint64_t eighthPoints = num_points / 8;
- __m256 cplxValue1, cplxValue2, iValue, qValue;
- __m256i cplxValueA, cplxValueB;
- __m128i cplxValue128;
-
- __m256 invScalar = _mm256_set1_ps(1.0/scalar);
- int16_t* complexVectorPtr = (int16_t*)complexVector;
- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
-
- for(;number < eighthPoints; number++){
-
- cplxValueA = _mm256_loadu_si256((__m256i*) complexVectorPtr);
- complexVectorPtr += 16;
-
- //cvt
- cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0);
- cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
- cplxValue1 = _mm256_cvtepi32_ps(cplxValueB);
- cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1);
- cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
- cplxValue2 = _mm256_cvtepi32_ps(cplxValueB);
-
- cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
- cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
-
- // Arrange in i1i2i3i4 format
- iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
- iValue = _mm256_permutevar8x32_ps(iValue,idx);
- // Arrange in q1q2q3q4 format
- qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
- qValue = _mm256_permutevar8x32_ps(qValue,idx);
-
- _mm256_storeu_ps(iBufferPtr, iValue);
- _mm256_storeu_ps(qBufferPtr, qValue);
-
- iBufferPtr += 8;
- qBufferPtr += 8;
- }
-
- number = eighthPoints * 8;
- complexVectorPtr = (int16_t*)&complexVector[number];
- for(; number < num_points; number++){
- *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
- *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
- }
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+
+ uint64_t number = 0;
+ const uint64_t eighthPoints = num_points / 8;
+ __m256 cplxValue1, cplxValue2, iValue, qValue;
+ __m256i cplxValueA, cplxValueB;
+ __m128i cplxValue128;
+
+ __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
+ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ for (; number < eighthPoints; number++) {
+
+ cplxValueA = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 16;
+
+ // cvt
+ cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0);
+ cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
+ cplxValue1 = _mm256_cvtepi32_ps(cplxValueB);
+ cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1);
+ cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
+ cplxValue2 = _mm256_cvtepi32_ps(cplxValueB);
+
+ cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+ iValue = _mm256_permutevar8x32_ps(iValue, idx);
+ // Arrange in q1q2q3q4 format
+ qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
+ qValue = _mm256_permutevar8x32_ps(qValue, idx);
+
+ _mm256_storeu_ps(iBufferPtr, iValue);
+ _mm256_storeu_ps(qBufferPtr, qValue);
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ complexVectorPtr = (int16_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ }
}
#endif /* LV_HAVE_AVX2 */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_16ic_s32f_deinterleave_real_32f(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
- * \endcode
+ * void volk_16ic_s32f_deinterleave_real_32f(float* iBuffer, const lv_16sc_t*
+ * complexVector, const float scalar, unsigned int num_points){ \endcode
*
* \b Inputs
* \li complexVector: The complex input vector of 16-bit shorts.
#ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H
#define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H
-#include <volk/volk_common.h>
#include <inttypes.h>
#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void
-volk_16ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer, const lv_16sc_t* complexVector,
- const float scalar, unsigned int num_points)
+volk_16ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- float* iBufferPtr = iBuffer;
-
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- __m256 iFloatValue;
-
- const float iScalar= 1.0 / scalar;
- __m256 invScalar = _mm256_set1_ps(iScalar);
- __m256i complexVal, iIntVal;
- __m128i complexVal128;
- int8_t* complexVectorPtr = (int8_t*)complexVector;
-
- __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
-
- for(;number < eighthPoints; number++){
- complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
- complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
- complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
- complexVal128 = _mm256_extracti128_si256(complexVal, 0);
-
- iIntVal = _mm256_cvtepi16_epi32(complexVal128);
- iFloatValue = _mm256_cvtepi32_ps(iIntVal);
-
- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
-
- _mm256_store_ps(iBufferPtr, iFloatValue);
-
- iBufferPtr += 8;
- }
-
- number = eighthPoints * 8;
- int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
- for(; number < num_points; number++){
- *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
- sixteenTComplexVectorPtr++;
- }
-
+ float* iBufferPtr = iBuffer;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ __m256 iFloatValue;
+
+ const float iScalar = 1.0 / scalar;
+ __m256 invScalar = _mm256_set1_ps(iScalar);
+ __m256i complexVal, iIntVal;
+ __m128i complexVal128;
+ int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+ __m256i moveMask = _mm256_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0);
+
+ for (; number < eighthPoints; number++) {
+ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
+ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
+ complexVal128 = _mm256_extracti128_si256(complexVal, 0);
+
+ iIntVal = _mm256_cvtepi16_epi32(complexVal128);
+ iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+
+ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+
+ _mm256_store_ps(iBufferPtr, iFloatValue);
+
+ iBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
+ sixteenTComplexVectorPtr++;
+ }
}
#endif /* LV_HAVE_AVX2 */
#include <smmintrin.h>
static inline void
-volk_16ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_16sc_t* complexVector,
- const float scalar, unsigned int num_points)
+volk_16ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- float* iBufferPtr = iBuffer;
-
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ float* iBufferPtr = iBuffer;
- __m128 iFloatValue;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- const float iScalar= 1.0 / scalar;
- __m128 invScalar = _mm_set_ps1(iScalar);
- __m128i complexVal, iIntVal;
- int8_t* complexVectorPtr = (int8_t*)complexVector;
+ __m128 iFloatValue;
- __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+ const float iScalar = 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ __m128i complexVal, iIntVal;
+ int8_t* complexVectorPtr = (int8_t*)complexVector;
- for(;number < quarterPoints; number++){
- complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
- complexVal = _mm_shuffle_epi8(complexVal, moveMask);
+ __m128i moveMask = _mm_set_epi8(
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
- iIntVal = _mm_cvtepi16_epi32(complexVal);
- iFloatValue = _mm_cvtepi32_ps(iIntVal);
+ for (; number < quarterPoints; number++) {
+ complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 16;
+ complexVal = _mm_shuffle_epi8(complexVal, moveMask);
- iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+ iIntVal = _mm_cvtepi16_epi32(complexVal);
+ iFloatValue = _mm_cvtepi32_ps(iIntVal);
- _mm_store_ps(iBufferPtr, iFloatValue);
+ iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
- iBufferPtr += 4;
- }
+ _mm_store_ps(iBufferPtr, iFloatValue);
- number = quarterPoints * 4;
- int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
- for(; number < num_points; number++){
- *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
- sixteenTComplexVectorPtr++;
- }
+ iBufferPtr += 4;
+ }
+ number = quarterPoints * 4;
+ int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
+ sixteenTComplexVectorPtr++;
+ }
}
#endif /* LV_HAVE_SSE4_1 */
#include <xmmintrin.h>
static inline void
-volk_16ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, const lv_16sc_t* complexVector,
- const float scalar, unsigned int num_points)
+volk_16ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- float* iBufferPtr = iBuffer;
+ float* iBufferPtr = iBuffer;
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
- __m128 iValue;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 iValue;
- const float iScalar = 1.0/scalar;
- __m128 invScalar = _mm_set_ps1(iScalar);
- int16_t* complexVectorPtr = (int16_t*)complexVector;
+ const float iScalar = 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
- __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
- for(;number < quarterPoints; number++){
- floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2;
- floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2;
- floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2;
- floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+ for (; number < quarterPoints; number++) {
+ floatBuffer[0] = (float)(*complexVectorPtr);
+ complexVectorPtr += 2;
+ floatBuffer[1] = (float)(*complexVectorPtr);
+ complexVectorPtr += 2;
+ floatBuffer[2] = (float)(*complexVectorPtr);
+ complexVectorPtr += 2;
+ floatBuffer[3] = (float)(*complexVectorPtr);
+ complexVectorPtr += 2;
- iValue = _mm_load_ps(floatBuffer);
+ iValue = _mm_load_ps(floatBuffer);
- iValue = _mm_mul_ps(iValue, invScalar);
+ iValue = _mm_mul_ps(iValue, invScalar);
- _mm_store_ps(iBufferPtr, iValue);
+ _mm_store_ps(iBufferPtr, iValue);
- iBufferPtr += 4;
- }
-
- number = quarterPoints * 4;
- complexVectorPtr = (int16_t*)&complexVector[number];
- for(; number < num_points; number++){
- *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar;
- complexVectorPtr++;
- }
+ iBufferPtr += 4;
+ }
+ number = quarterPoints * 4;
+ complexVectorPtr = (int16_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
static inline void
-volk_16ic_s32f_deinterleave_real_32f_generic(float* iBuffer, const lv_16sc_t* complexVector,
- const float scalar, unsigned int num_points)
+volk_16ic_s32f_deinterleave_real_32f_generic(float* iBuffer,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
- float* iBufferPtr = iBuffer;
- const float invScalar = 1.0 / scalar;
- for(number = 0; number < num_points; number++){
- *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
- complexVectorPtr++;
- }
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* iBufferPtr = iBuffer;
+ const float invScalar = 1.0 / scalar;
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_GENERIC */
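The generic path above defines the reference behavior: keep the I component of each complex short, scale it by 1/scalar, and drop Q. A compact standalone restatement (illustrative only, not added by this patch):

#include <stdint.h>
#include <volk/volk_complex.h>

/* iBuffer[n] = (float)real(complexVector[n]) / scalar; the Q component is discarded. */
static void deinterleave_real_model(float* iBuffer,
                                    const lv_16sc_t* complexVector,
                                    const float scalar,
                                    unsigned int num_points)
{
    const int16_t* p = (const int16_t*)complexVector; /* interleaved I0 Q0 I1 Q1 ... */
    const float inv = 1.0f / scalar;
    unsigned int n;
    for (n = 0; n < num_points; n++) {
        iBuffer[n] = (float)p[2 * n] * inv;
    }
}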
#ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_u_H
#define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_u_H
-#include <volk/volk_common.h>
#include <inttypes.h>
#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void
-volk_16ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer, const lv_16sc_t* complexVector,
- const float scalar, unsigned int num_points)
+volk_16ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- float* iBufferPtr = iBuffer;
-
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- __m256 iFloatValue;
-
- const float iScalar= 1.0 / scalar;
- __m256 invScalar = _mm256_set1_ps(iScalar);
- __m256i complexVal, iIntVal;
- __m128i complexVal128;
- int8_t* complexVectorPtr = (int8_t*)complexVector;
-
- __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
-
- for(;number < eighthPoints; number++){
- complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
- complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
- complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
- complexVal128 = _mm256_extracti128_si256(complexVal, 0);
-
- iIntVal = _mm256_cvtepi16_epi32(complexVal128);
- iFloatValue = _mm256_cvtepi32_ps(iIntVal);
-
- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
-
- _mm256_storeu_ps(iBufferPtr, iFloatValue);
-
- iBufferPtr += 8;
- }
-
- number = eighthPoints * 8;
- int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
- for(; number < num_points; number++){
- *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
- sixteenTComplexVectorPtr++;
- }
-
+ float* iBufferPtr = iBuffer;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ __m256 iFloatValue;
+
+ const float iScalar = 1.0 / scalar;
+ __m256 invScalar = _mm256_set1_ps(iScalar);
+ __m256i complexVal, iIntVal;
+ __m128i complexVal128;
+ int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+ __m256i moveMask = _mm256_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0);
+
+ for (; number < eighthPoints; number++) {
+ complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
+ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
+ complexVal128 = _mm256_extracti128_si256(complexVal, 0);
+
+ iIntVal = _mm256_cvtepi16_epi32(complexVal128);
+ iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+
+ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+
+ _mm256_storeu_ps(iBufferPtr, iFloatValue);
+
+ iBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
+ sixteenTComplexVectorPtr++;
+ }
}
#endif /* LV_HAVE_AVX2 */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_16ic_s32f_magnitude_32f(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points)
- * \endcode
+ * void volk_16ic_s32f_magnitude_32f(float* magnitudeVector, const lv_16sc_t*
+ * complexVector, const float scalar, unsigned int num_points) \endcode
*
* \b Inputs
* \li complexVector: The complex input vector of complex 16-bit shorts.
#ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a_H
#define INCLUDED_volk_16ic_s32f_magnitude_32f_a_H
-#include <volk/volk_common.h>
#include <inttypes.h>
-#include <stdio.h>
#include <math.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_16ic_s32f_magnitude_32f_a_avx2(float* magnitudeVector, const lv_16sc_t* complexVector,
- const float scalar, unsigned int num_points)
+static inline void volk_16ic_s32f_magnitude_32f_a_avx2(float* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
+ __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
- __m256 invScalar = _mm256_set1_ps(1.0/scalar);
+ __m256 cplxValue1, cplxValue2, result;
+ __m256i int1, int2;
+ __m128i short1, short2;
+ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
- __m256 cplxValue1, cplxValue2, result;
- __m256i int1, int2;
- __m128i short1, short2;
- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
+ for (; number < eighthPoints; number++) {
- for(;number < eighthPoints; number++){
-
- int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
- complexVectorPtr += 16;
- short1 = _mm256_extracti128_si256(int1,0);
- short2 = _mm256_extracti128_si256(int1,1);
+ int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 16;
+ short1 = _mm256_extracti128_si256(int1, 0);
+ short2 = _mm256_extracti128_si256(int1, 1);
- int1 = _mm256_cvtepi16_epi32(short1);
- int2 = _mm256_cvtepi16_epi32(short2);
- cplxValue1 = _mm256_cvtepi32_ps(int1);
- cplxValue2 = _mm256_cvtepi32_ps(int2);
+ int1 = _mm256_cvtepi16_epi32(short1);
+ int2 = _mm256_cvtepi16_epi32(short2);
+ cplxValue1 = _mm256_cvtepi32_ps(int1);
+ cplxValue2 = _mm256_cvtepi32_ps(int2);
- cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
- cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
+ cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
- cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
- cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
- result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
- result = _mm256_permutevar8x32_ps(result, idx);
+ result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+ result = _mm256_permutevar8x32_ps(result, idx);
- result = _mm256_sqrt_ps(result); // Square root the values
+ result = _mm256_sqrt_ps(result); // Square root the values
- _mm256_store_ps(magnitudeVectorPtr, result);
+ _mm256_store_ps(magnitudeVectorPtr, result);
- magnitudeVectorPtr += 8;
- }
+ magnitudeVectorPtr += 8;
+ }
- number = eighthPoints * 8;
- magnitudeVectorPtr = &magnitudeVector[number];
- complexVectorPtr = (const int16_t*)&complexVector[number];
- for(; number < num_points; number++){
- float val1Real = (float)(*complexVectorPtr++) / scalar;
- float val1Imag = (float)(*complexVectorPtr++) / scalar;
- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
- }
+ number = eighthPoints * 8;
+ magnitudeVectorPtr = &magnitudeVector[number];
+ complexVectorPtr = (const int16_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ float val1Real = (float)(*complexVectorPtr++) / scalar;
+ float val1Imag = (float)(*complexVectorPtr++) / scalar;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
-static inline void
-volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector, const lv_16sc_t* complexVector,
- const float scalar, unsigned int num_points)
+static inline void volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
- __m128 invScalar = _mm_set_ps1(1.0/scalar);
+ __m128 invScalar = _mm_set_ps1(1.0 / scalar);
- __m128 cplxValue1, cplxValue2, result;
+ __m128 cplxValue1, cplxValue2, result;
- __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
+ __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
- for(;number < quarterPoints; number++){
+ for (; number < quarterPoints; number++) {
- inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
- inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
- inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
- inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
- inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
- inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
- inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
- inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+ inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+ inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+ inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+ inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
- cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
- cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+ cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+ cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
- complexVectorPtr += 8;
+ complexVectorPtr += 8;
- cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
- cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
- cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
- cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
- result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
- result = _mm_sqrt_ps(result); // Square root the values
+ result = _mm_sqrt_ps(result); // Square root the values
- _mm_store_ps(magnitudeVectorPtr, result);
+ _mm_store_ps(magnitudeVectorPtr, result);
- magnitudeVectorPtr += 4;
- }
+ magnitudeVectorPtr += 4;
+ }
- number = quarterPoints * 4;
- magnitudeVectorPtr = &magnitudeVector[number];
- complexVectorPtr = (const int16_t*)&complexVector[number];
- for(; number < num_points; number++){
- float val1Real = (float)(*complexVectorPtr++) / scalar;
- float val1Imag = (float)(*complexVectorPtr++) / scalar;
- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
- }
+ number = quarterPoints * 4;
+ magnitudeVectorPtr = &magnitudeVector[number];
+ complexVectorPtr = (const int16_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ float val1Real = (float)(*complexVectorPtr++) / scalar;
+ float val1Imag = (float)(*complexVectorPtr++) / scalar;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, const lv_16sc_t* complexVector,
- const float scalar, unsigned int num_points)
+static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
- const float iScalar = 1.0 / scalar;
- __m128 invScalar = _mm_set_ps1(iScalar);
+ const float iScalar = 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
- __m128 cplxValue1, cplxValue2, result, re, im;
+ __m128 cplxValue1, cplxValue2, result, re, im;
- __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
+ __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
- for(;number < quarterPoints; number++){
- inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
- inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
- inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
- inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+ for (; number < quarterPoints; number++) {
+ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
- inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
- inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
- inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
- inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+ inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+ inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+ inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+ inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
- cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
- cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+ cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+ cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
- re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88);
- im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd);
+ re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88);
+ im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd);
- complexVectorPtr += 8;
+ complexVectorPtr += 8;
- cplxValue1 = _mm_mul_ps(re, invScalar);
- cplxValue2 = _mm_mul_ps(im, invScalar);
+ cplxValue1 = _mm_mul_ps(re, invScalar);
+ cplxValue2 = _mm_mul_ps(im, invScalar);
- cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
- cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
- result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+ result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
- result = _mm_sqrt_ps(result); // Square root the values
+ result = _mm_sqrt_ps(result); // Square root the values
- _mm_store_ps(magnitudeVectorPtr, result);
+ _mm_store_ps(magnitudeVectorPtr, result);
- magnitudeVectorPtr += 4;
- }
+ magnitudeVectorPtr += 4;
+ }
- number = quarterPoints * 4;
- magnitudeVectorPtr = &magnitudeVector[number];
- complexVectorPtr = (const int16_t*)&complexVector[number];
- for(; number < num_points; number++){
- float val1Real = (float)(*complexVectorPtr++) * iScalar;
- float val1Imag = (float)(*complexVectorPtr++) * iScalar;
- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
- }
+ number = quarterPoints * 4;
+ magnitudeVectorPtr = &magnitudeVector[number];
+ complexVectorPtr = (const int16_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ float val1Real = (float)(*complexVectorPtr++) * iScalar;
+ float val1Imag = (float)(*complexVectorPtr++) * iScalar;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
}
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_16ic_s32f_magnitude_32f_generic(float* magnitudeVector, const lv_16sc_t* complexVector,
- const float scalar, unsigned int num_points)
+static inline void volk_16ic_s32f_magnitude_32f_generic(float* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
- unsigned int number = 0;
- const float invScalar = 1.0 / scalar;
- for(number = 0; number < num_points; number++){
- float real = ( (float) (*complexVectorPtr++)) * invScalar;
- float imag = ( (float) (*complexVectorPtr++)) * invScalar;
- *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
- }
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+ unsigned int number = 0;
+ const float invScalar = 1.0 / scalar;
+ for (number = 0; number < num_points; number++) {
+ float real = ((float)(*complexVectorPtr++)) * invScalar;
+ float imag = ((float)(*complexVectorPtr++)) * invScalar;
+ *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
+ }
}
#endif /* LV_HAVE_GENERIC */
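Every kernel above reduces to the same per-sample math as the generic path: scale I and Q by 1/scalar and take the Euclidean norm. A scalar restatement for reference (illustrative only):

#include <math.h>
#include <stdint.h>

/* magnitude = sqrt((I/scalar)^2 + (Q/scalar)^2) */
static inline float magnitude_model(int16_t re, int16_t im, float scalar)
{
    const float r = (float)re / scalar;
    const float i = (float)im / scalar;
    return sqrtf(r * r + i * i);
}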
#ifdef LV_HAVE_ORC_DISABLED
-extern void
-volk_16ic_s32f_magnitude_32f_a_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector,
- const float scalar, unsigned int num_points);
+extern void volk_16ic_s32f_magnitude_32f_a_orc_impl(float* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points);
-static inline void
-volk_16ic_s32f_magnitude_32f_u_orc(float* magnitudeVector, const lv_16sc_t* complexVector,
- const float scalar, unsigned int num_points)
+static inline void volk_16ic_s32f_magnitude_32f_u_orc(float* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- volk_16ic_s32f_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, scalar, num_points);
+ volk_16ic_s32f_magnitude_32f_a_orc_impl(
+ magnitudeVector, complexVector, scalar, num_points);
}
#endif /* LV_HAVE_ORC_DISABLED */
#ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_u_H
#define INCLUDED_volk_16ic_s32f_magnitude_32f_u_H
-#include <volk/volk_common.h>
#include <inttypes.h>
-#include <stdio.h>
#include <math.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_16ic_s32f_magnitude_32f_u_avx2(float* magnitudeVector, const lv_16sc_t* complexVector,
- const float scalar, unsigned int num_points)
+static inline void volk_16ic_s32f_magnitude_32f_u_avx2(float* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
+ __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
- __m256 invScalar = _mm256_set1_ps(1.0/scalar);
+ __m256 cplxValue1, cplxValue2, result;
+ __m256i int1, int2;
+ __m128i short1, short2;
+ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
- __m256 cplxValue1, cplxValue2, result;
- __m256i int1, int2;
- __m128i short1, short2;
- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
+ for (; number < eighthPoints; number++) {
- for(;number < eighthPoints; number++){
-
- int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
- complexVectorPtr += 16;
- short1 = _mm256_extracti128_si256(int1,0);
- short2 = _mm256_extracti128_si256(int1,1);
+ int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 16;
+ short1 = _mm256_extracti128_si256(int1, 0);
+ short2 = _mm256_extracti128_si256(int1, 1);
- int1 = _mm256_cvtepi16_epi32(short1);
- int2 = _mm256_cvtepi16_epi32(short2);
- cplxValue1 = _mm256_cvtepi32_ps(int1);
- cplxValue2 = _mm256_cvtepi32_ps(int2);
+ int1 = _mm256_cvtepi16_epi32(short1);
+ int2 = _mm256_cvtepi16_epi32(short2);
+ cplxValue1 = _mm256_cvtepi32_ps(int1);
+ cplxValue2 = _mm256_cvtepi32_ps(int2);
- cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
- cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
+ cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
- cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
- cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
- result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
- result = _mm256_permutevar8x32_ps(result, idx);
+ result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+ result = _mm256_permutevar8x32_ps(result, idx);
- result = _mm256_sqrt_ps(result); // Square root the values
+ result = _mm256_sqrt_ps(result); // Square root the values
- _mm256_storeu_ps(magnitudeVectorPtr, result);
+ _mm256_storeu_ps(magnitudeVectorPtr, result);
- magnitudeVectorPtr += 8;
- }
+ magnitudeVectorPtr += 8;
+ }
- number = eighthPoints * 8;
- magnitudeVectorPtr = &magnitudeVector[number];
- complexVectorPtr = (const int16_t*)&complexVector[number];
- for(; number < num_points; number++){
- float val1Real = (float)(*complexVectorPtr++) / scalar;
- float val1Imag = (float)(*complexVectorPtr++) / scalar;
- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
- }
+ number = eighthPoints * 8;
+ magnitudeVectorPtr = &magnitudeVector[number];
+ complexVectorPtr = (const int16_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ float val1Real = (float)(*complexVectorPtr++) / scalar;
+ float val1Imag = (float)(*complexVectorPtr++) / scalar;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
}
#endif /* LV_HAVE_AVX2 */
#endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_u_H */
-
*
* \b Overview
*
- * Multiplies two input complex vectors (16-bit integer each component) and accumulates them,
- * storing the result. Results are saturated so never go beyond the limits of the data type.
+ * Multiplies two input complex vectors (a 16-bit integer in each component) and
+ * accumulates them, storing the result. Results are saturated so they never go beyond
+ * the limits of the data type.
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_16ic_x2_dot_prod_16ic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points);
- * \endcode
+ * void volk_16ic_x2_dot_prod_16ic(lv_16sc_t* result, const lv_16sc_t* in_a, const
+ * lv_16sc_t* in_b, unsigned int num_points); \endcode
*
* \b Inputs
* \li in_a: One of the vectors to be multiplied and accumulated.
* \li in_b: The other vector to be multiplied and accumulated.
- * \li num_points: Number of complex values to be multiplied together, accumulated and stored into \p result
+ * \li num_points: Number of complex values to be multiplied together, accumulated and
+ * stored into \p result
*
* \b Outputs
* \li result: Value of the accumulated result.
#ifndef INCLUDED_volk_16ic_x2_dot_prod_16ic_H
#define INCLUDED_volk_16ic_x2_dot_prod_16ic_H
+#include <volk/saturation_arithmetic.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>
-#include <volk/saturation_arithmetic.h>
#ifdef LV_HAVE_GENERIC
-static inline void volk_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+static inline void volk_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
{
result[0] = lv_cmake((int16_t)0, (int16_t)0);
unsigned int n;
- for (n = 0; n < num_points; n++)
- {
- lv_16sc_t tmp = in_a[n] * in_b[n];
- result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp) ));
- }
+ for (n = 0; n < num_points; n++) {
+ lv_16sc_t tmp = in_a[n] * in_b[n];
+ result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)),
+ sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp)));
+ }
}
#endif /*LV_HAVE_GENERIC*/
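A minimal caller sketch for the dispatcher, mainly to make explicit that `result` receives a single accumulated complex value; the input vectors are assumed to be provided by the caller:

#include <volk/volk.h>

static lv_16sc_t example_dot_prod(const lv_16sc_t* in_a,
                                  const lv_16sc_t* in_b,
                                  unsigned int num_points)
{
    lv_16sc_t acc; /* the kernel writes exactly one complex value here */
    volk_16ic_x2_dot_prod_16ic(&acc, in_a, in_b, num_points);
    /* lv_creal(acc) and lv_cimag(acc) hold the saturated real/imag accumulations */
    return acc;
}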
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
{
lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
const lv_16sc_t* _in_b = in_b;
lv_16sc_t* _out = out;
- if (sse_iters > 0)
- {
- __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc;
- __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
+ if (sse_iters > 0) {
+ __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
+ realcacc, imagcacc;
+ __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
- realcacc = _mm_setzero_si128();
- imagcacc = _mm_setzero_si128();
+ realcacc = _mm_setzero_si128();
+ imagcacc = _mm_setzero_si128();
- mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
- mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
+ mask_imag = _mm_set_epi8(
+ 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
+ mask_real = _mm_set_epi8(
+ 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
- for(number = 0; number < sse_iters; number++)
- {
- // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
- a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
- __VOLK_PREFETCH(_in_a + 8);
- b = _mm_load_si128((__m128i*)_in_b);
- __VOLK_PREFETCH(_in_b + 8);
- c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
+ for (number = 0; number < sse_iters; number++) {
+ // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
+ a = _mm_load_si128(
+ (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+ __VOLK_PREFETCH(_in_a + 8);
+ b = _mm_load_si128((__m128i*)_in_b);
+ __VOLK_PREFETCH(_in_b + 8);
+ c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
- c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
- real = _mm_subs_epi16(c, c_sr);
+ c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in
+ // zeros, and store the results in dst.
+ real = _mm_subs_epi16(c, c_sr);
- b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
- a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
+ b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
+ a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
- imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
- imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+ imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+ imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
- imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic!
+ imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic!
- realcacc = _mm_adds_epi16(realcacc, real);
- imagcacc = _mm_adds_epi16(imagcacc, imag);
+ realcacc = _mm_adds_epi16(realcacc, real);
+ imagcacc = _mm_adds_epi16(imagcacc, imag);
- _in_a += 4;
- _in_b += 4;
- }
+ _in_a += 4;
+ _in_b += 4;
+ }
- realcacc = _mm_and_si128(realcacc, mask_real);
- imagcacc = _mm_and_si128(imagcacc, mask_imag);
+ realcacc = _mm_and_si128(realcacc, mask_real);
+ imagcacc = _mm_and_si128(imagcacc, mask_imag);
- a = _mm_or_si128(realcacc, imagcacc);
+ a = _mm_or_si128(realcacc, imagcacc);
- _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector
+ _mm_store_si128((__m128i*)dotProductVector,
+ a); // Store the results back into the dot product vector
- for (number = 0; number < 4; ++number)
- {
- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
- }
+ for (number = 0; number < 4; ++number) {
+ dotProduct = lv_cmake(
+ sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
+ sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
}
+ }
- for (number = 0; number < (num_points % 4); ++number)
- {
- lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
- }
+ for (number = 0; number < (num_points % 4); ++number) {
+ lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
+ dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
+ sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
+ }
*_out = dotProduct;
}
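The packed arithmetic in the SSE2 kernel above (and in the unaligned and AVX2 variants that follow) computes, per complex element, truncating 16-bit products for the four partial terms and then a saturating subtract/add. A scalar sketch of one element; sat_subs16i is a hypothetical helper written here for illustration, while sat_adds16i comes from volk/saturation_arithmetic.h:

#include <stdint.h>
#include <volk/saturation_arithmetic.h>
#include <volk/volk_complex.h>

/* Hypothetical saturating 16-bit subtraction, mirroring _mm_subs_epi16 on one lane. */
static inline int16_t sat_subs16i(int16_t x, int16_t y)
{
    int32_t r = (int32_t)x - (int32_t)y;
    if (r > INT16_MAX)
        r = INT16_MAX;
    if (r < INT16_MIN)
        r = INT16_MIN;
    return (int16_t)r;
}

/* One element of the packed complex multiply: the products truncate to 16 bits
 * (like _mm_mullo_epi16), the combine steps saturate (like _mm_subs/_mm_adds). */
static inline lv_16sc_t cmul16_model(lv_16sc_t a, lv_16sc_t b)
{
    const int16_t rr = (int16_t)(lv_creal(a) * lv_creal(b));
    const int16_t ii = (int16_t)(lv_cimag(a) * lv_cimag(b));
    const int16_t ri = (int16_t)(lv_creal(a) * lv_cimag(b));
    const int16_t ir = (int16_t)(lv_cimag(a) * lv_creal(b));
    return lv_cmake(sat_subs16i(rr, ii), sat_adds16i(ri, ir));
}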
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
{
lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
lv_16sc_t* _out = out;
unsigned int number;
- if (sse_iters > 0)
- {
- __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result;
- __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
+ if (sse_iters > 0) {
+ __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
+ realcacc, imagcacc, result;
+ __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
- realcacc = _mm_setzero_si128();
- imagcacc = _mm_setzero_si128();
+ realcacc = _mm_setzero_si128();
+ imagcacc = _mm_setzero_si128();
- mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
- mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
+ mask_imag = _mm_set_epi8(
+ 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
+ mask_real = _mm_set_epi8(
+ 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
- for(number = 0; number < sse_iters; number++)
- {
- // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
- a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
- __VOLK_PREFETCH(_in_a + 8);
- b = _mm_loadu_si128((__m128i*)_in_b);
- __VOLK_PREFETCH(_in_b + 8);
- c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
+ for (number = 0; number < sse_iters; number++) {
+ // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
+ a = _mm_loadu_si128(
+ (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+ __VOLK_PREFETCH(_in_a + 8);
+ b = _mm_loadu_si128((__m128i*)_in_b);
+ __VOLK_PREFETCH(_in_b + 8);
+ c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
- c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
- real = _mm_subs_epi16(c, c_sr);
+ c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in
+ // zeros, and store the results in dst.
+ real = _mm_subs_epi16(c, c_sr);
- b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
- a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
+ b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
+ a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
- imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
- imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+ imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+ imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
- imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic!
+ imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic!
- realcacc = _mm_adds_epi16(realcacc, real);
- imagcacc = _mm_adds_epi16(imagcacc, imag);
+ realcacc = _mm_adds_epi16(realcacc, real);
+ imagcacc = _mm_adds_epi16(imagcacc, imag);
- _in_a += 4;
- _in_b += 4;
- }
+ _in_a += 4;
+ _in_b += 4;
+ }
- realcacc = _mm_and_si128(realcacc, mask_real);
- imagcacc = _mm_and_si128(imagcacc, mask_imag);
+ realcacc = _mm_and_si128(realcacc, mask_real);
+ imagcacc = _mm_and_si128(imagcacc, mask_imag);
- result = _mm_or_si128(realcacc, imagcacc);
+ result = _mm_or_si128(realcacc, imagcacc);
- _mm_storeu_si128((__m128i*)dotProductVector, result); // Store the results back into the dot product vector
+ _mm_storeu_si128((__m128i*)dotProductVector,
+ result); // Store the results back into the dot product vector
- for (number = 0; number < 4; ++number)
- {
- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
- }
+ for (number = 0; number < 4; ++number) {
+ dotProduct = lv_cmake(
+ sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
+ sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
}
+ }
- for (number = 0; number < (num_points % 4); ++number)
- {
- lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
- }
+ for (number = 0; number < (num_points % 4); ++number) {
+ lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
+ dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
+ sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
+ }
*_out = dotProduct;
}
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void volk_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+static inline void volk_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
{
lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
lv_16sc_t* _out = out;
unsigned int number;
- if (avx_iters > 0)
- {
- __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result;
- __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];
-
- realcacc = _mm256_setzero_si256();
- imagcacc = _mm256_setzero_si256();
-
- mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
- mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
-
- for(number = 0; number < avx_iters; number++)
- {
- a = _mm256_loadu_si256((__m256i*)_in_a);
- __VOLK_PREFETCH(_in_a + 16);
- b = _mm256_loadu_si256((__m256i*)_in_b);
- __VOLK_PREFETCH(_in_b + 16);
- c = _mm256_mullo_epi16(a, b);
-
- c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
- real = _mm256_subs_epi16(c, c_sr);
-
- b_sl = _mm256_slli_si256(b, 2);
- a_sl = _mm256_slli_si256(a, 2);
-
- imag1 = _mm256_mullo_epi16(a, b_sl);
- imag2 = _mm256_mullo_epi16(b, a_sl);
-
- imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic!
-
- realcacc = _mm256_adds_epi16(realcacc, real);
- imagcacc = _mm256_adds_epi16(imagcacc, imag);
-
- _in_a += 8;
- _in_b += 8;
- }
+ if (avx_iters > 0) {
+ __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
+ realcacc, imagcacc, result;
+ __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];
+
+ realcacc = _mm256_setzero_si256();
+ imagcacc = _mm256_setzero_si256();
+
+ mask_imag = _mm256_set_epi8(0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0);
+ mask_real = _mm256_set_epi8(0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF);
+
+ for (number = 0; number < avx_iters; number++) {
+ a = _mm256_loadu_si256((__m256i*)_in_a);
+ __VOLK_PREFETCH(_in_a + 16);
+ b = _mm256_loadu_si256((__m256i*)_in_b);
+ __VOLK_PREFETCH(_in_b + 16);
+ c = _mm256_mullo_epi16(a, b);
+
+ c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting
+ // in zeros, and store the results in dst.
+ real = _mm256_subs_epi16(c, c_sr);
+
+ b_sl = _mm256_slli_si256(b, 2);
+ a_sl = _mm256_slli_si256(a, 2);
+
+ imag1 = _mm256_mullo_epi16(a, b_sl);
+ imag2 = _mm256_mullo_epi16(b, a_sl);
+
+ imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic!
+
+ realcacc = _mm256_adds_epi16(realcacc, real);
+ imagcacc = _mm256_adds_epi16(imagcacc, imag);
+
+ _in_a += 8;
+ _in_b += 8;
+ }
- realcacc = _mm256_and_si256(realcacc, mask_real);
- imagcacc = _mm256_and_si256(imagcacc, mask_imag);
+ realcacc = _mm256_and_si256(realcacc, mask_real);
+ imagcacc = _mm256_and_si256(imagcacc, mask_imag);
- result = _mm256_or_si256(realcacc, imagcacc);
+ result = _mm256_or_si256(realcacc, imagcacc);
- _mm256_storeu_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector
- _mm256_zeroupper();
+ _mm256_storeu_si256((__m256i*)dotProductVector,
+ result); // Store the results back into the dot product vector
+ _mm256_zeroupper();
- for (number = 0; number < 8; ++number)
- {
- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
- }
+ for (number = 0; number < 8; ++number) {
+ dotProduct = lv_cmake(
+ sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
+ sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
}
+ }
- for (number = 0; number < (num_points % 8); ++number)
- {
- lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
- }
+ for (number = 0; number < (num_points % 8); ++number) {
+ lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
+ dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
+ sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
+ }
*_out = dotProduct;
}
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void volk_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+static inline void volk_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
{
lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
lv_16sc_t* _out = out;
unsigned int number;
- if (avx_iters > 0)
- {
- __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result;
- __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];
-
- realcacc = _mm256_setzero_si256();
- imagcacc = _mm256_setzero_si256();
-
- mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
- mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
-
- for(number = 0; number < avx_iters; number++)
- {
- a = _mm256_load_si256((__m256i*)_in_a);
- __VOLK_PREFETCH(_in_a + 16);
- b = _mm256_load_si256((__m256i*)_in_b);
- __VOLK_PREFETCH(_in_b + 16);
- c = _mm256_mullo_epi16(a, b);
-
- c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
- real = _mm256_subs_epi16(c, c_sr);
-
- b_sl = _mm256_slli_si256(b, 2);
- a_sl = _mm256_slli_si256(a, 2);
-
- imag1 = _mm256_mullo_epi16(a, b_sl);
- imag2 = _mm256_mullo_epi16(b, a_sl);
-
- imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic!
-
- realcacc = _mm256_adds_epi16(realcacc, real);
- imagcacc = _mm256_adds_epi16(imagcacc, imag);
-
- _in_a += 8;
- _in_b += 8;
- }
+ if (avx_iters > 0) {
+ __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
+ realcacc, imagcacc, result;
+ __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];
+
+ realcacc = _mm256_setzero_si256();
+ imagcacc = _mm256_setzero_si256();
+
+ mask_imag = _mm256_set_epi8(0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0);
+ mask_real = _mm256_set_epi8(0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF);
+
+ for (number = 0; number < avx_iters; number++) {
+ a = _mm256_load_si256((__m256i*)_in_a);
+ __VOLK_PREFETCH(_in_a + 16);
+ b = _mm256_load_si256((__m256i*)_in_b);
+ __VOLK_PREFETCH(_in_b + 16);
+ c = _mm256_mullo_epi16(a, b);
+
+ c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting
+ // in zeros, and store the results in dst.
+ real = _mm256_subs_epi16(c, c_sr);
+
+ b_sl = _mm256_slli_si256(b, 2);
+ a_sl = _mm256_slli_si256(a, 2);
+
+ imag1 = _mm256_mullo_epi16(a, b_sl);
+ imag2 = _mm256_mullo_epi16(b, a_sl);
+
+ imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic!
+
+ realcacc = _mm256_adds_epi16(realcacc, real);
+ imagcacc = _mm256_adds_epi16(imagcacc, imag);
+
+ _in_a += 8;
+ _in_b += 8;
+ }
- realcacc = _mm256_and_si256(realcacc, mask_real);
- imagcacc = _mm256_and_si256(imagcacc, mask_imag);
+ realcacc = _mm256_and_si256(realcacc, mask_real);
+ imagcacc = _mm256_and_si256(imagcacc, mask_imag);
- result = _mm256_or_si256(realcacc, imagcacc);
+ result = _mm256_or_si256(realcacc, imagcacc);
- _mm256_store_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector
- _mm256_zeroupper();
+ _mm256_store_si256((__m256i*)dotProductVector,
+ result); // Store the results back into the dot product vector
+ _mm256_zeroupper();
- for (number = 0; number < 8; ++number)
- {
- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
- }
+ for (number = 0; number < 8; ++number) {
+ dotProduct = lv_cmake(
+ sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
+ sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
}
+ }
- for (number = 0; number < (num_points % 8); ++number)
- {
- lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
- }
+ for (number = 0; number < (num_points % 8); ++number) {
+ lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
+ dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
+ sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
+ }
*_out = dotProduct;
}
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
{
unsigned int quarter_points = num_points / 4;
unsigned int number;
- lv_16sc_t* a_ptr = (lv_16sc_t*) in_a;
- lv_16sc_t* b_ptr = (lv_16sc_t*) in_b;
+ lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
+ lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
*out = lv_cmake((int16_t)0, (int16_t)0);
- if (quarter_points > 0)
- {
- // for 2-lane vectors, 1st lane holds the real part,
- // 2nd lane holds the imaginary part
- int16x4x2_t a_val, b_val, c_val, accumulator;
- int16x4x2_t tmp_real, tmp_imag;
- __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
- accumulator.val[0] = vdup_n_s16(0);
- accumulator.val[1] = vdup_n_s16(0);
- lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
-
- for(number = 0; number < quarter_points; ++number)
- {
- a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
- b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __VOLK_PREFETCH(a_ptr + 8);
- __VOLK_PREFETCH(b_ptr + 8);
-
- // multiply the real*real and imag*imag to get real result
- // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
- tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
- // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
- tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);
-
- // Multiply cross terms to get the imaginary result
- // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
- tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
- // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
- tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
-
- c_val.val[0] = vqsub_s16(tmp_real.val[0], tmp_real.val[1]);
- c_val.val[1] = vqadd_s16(tmp_imag.val[0], tmp_imag.val[1]);
-
- accumulator.val[0] = vqadd_s16(accumulator.val[0], c_val.val[0]);
- accumulator.val[1] = vqadd_s16(accumulator.val[1], c_val.val[1]);
-
- a_ptr += 4;
- b_ptr += 4;
- }
-
- vst2_s16((int16_t*)accum_result, accumulator);
- for (number = 0; number < 4; ++number)
- {
- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(accum_result[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(accum_result[number])));
- }
-
- *out = dotProduct;
+ if (quarter_points > 0) {
+ // for 2-lane vectors, 1st lane holds the real part,
+ // 2nd lane holds the imaginary part
+ int16x4x2_t a_val, b_val, c_val, accumulator;
+ int16x4x2_t tmp_real, tmp_imag;
+ __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
+ accumulator.val[0] = vdup_n_s16(0);
+ accumulator.val[1] = vdup_n_s16(0);
+ lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
+
+ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ __VOLK_PREFETCH(a_ptr + 8);
+ __VOLK_PREFETCH(b_ptr + 8);
+
+ // multiply the real*real and imag*imag to get real result
+ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
+ tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
+ // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
+ tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);
+
+ // Multiply cross terms to get the imaginary result
+ // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
+ tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
+ // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
+ tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
+
+ c_val.val[0] = vqsub_s16(tmp_real.val[0], tmp_real.val[1]);
+ c_val.val[1] = vqadd_s16(tmp_imag.val[0], tmp_imag.val[1]);
+
+ accumulator.val[0] = vqadd_s16(accumulator.val[0], c_val.val[0]);
+ accumulator.val[1] = vqadd_s16(accumulator.val[1], c_val.val[1]);
+
+ a_ptr += 4;
+ b_ptr += 4;
}
- // tail case
- for(number = quarter_points * 4; number < num_points; ++number)
- {
- *out += (*a_ptr++) * (*b_ptr++);
+ vst2_s16((int16_t*)accum_result, accumulator);
+ for (number = 0; number < 4; ++number) {
+ dotProduct = lv_cmake(
+ sat_adds16i(lv_creal(dotProduct), lv_creal(accum_result[number])),
+ sat_adds16i(lv_cimag(dotProduct), lv_cimag(accum_result[number])));
}
+
+ *out = dotProduct;
+ }
+
+ // tail case
+ for (number = quarter_points * 4; number < num_points; ++number) {
+ *out += (*a_ptr++) * (*b_ptr++);
+ }
}
#endif /* LV_HAVE_NEON */
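The NEON kernel above works on deinterleaved data: vld2_s16 splits four complex samples into a vector of real parts (val[0]) and a vector of imaginary parts (val[1]). A scalar sketch of the per-lane accumulation it performs (hypothetical helper; the vqadd_s16/vqsub_s16 saturation is omitted here):

#include <stdint.h>

/* Planar accumulation mirroring accumulator.val[0] (reals) and
 * accumulator.val[1] (imags) for one block of four samples.       */
static inline void neon_accum_sketch(const int16_t ar[4], const int16_t ai[4],
                                     const int16_t br[4], const int16_t bi[4],
                                     int16_t acc_r[4], int16_t acc_i[4])
{
    for (int lane = 0; lane < 4; lane++) {
        acc_r[lane] += (int16_t)(ar[lane] * br[lane] - ai[lane] * bi[lane]);
        acc_i[lane] += (int16_t)(ar[lane] * bi[lane] + ai[lane] * br[lane]);
    }
}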
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
{
unsigned int quarter_points = num_points / 4;
unsigned int number;
- lv_16sc_t* a_ptr = (lv_16sc_t*) in_a;
- lv_16sc_t* b_ptr = (lv_16sc_t*) in_b;
+ lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
+ lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
// for 2-lane vectors, 1st lane holds the real part,
// 2nd lane holds the imaginary part
int16x4x2_t a_val, b_val, accumulator;
accumulator.val[0] = vdup_n_s16(0);
accumulator.val[1] = vdup_n_s16(0);
- for(number = 0; number < quarter_points; ++number)
- {
- a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
- b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __VOLK_PREFETCH(a_ptr + 8);
- __VOLK_PREFETCH(b_ptr + 8);
+ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ __VOLK_PREFETCH(a_ptr + 8);
+ __VOLK_PREFETCH(b_ptr + 8);
- tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
- tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
+ tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
+ tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
- // use multiply accumulate/subtract to get result
- tmp.val[0] = vmls_s16(tmp.val[0], a_val.val[1], b_val.val[1]);
- tmp.val[1] = vmla_s16(tmp.val[1], a_val.val[0], b_val.val[1]);
+ // use multiply accumulate/subtract to get result
+ tmp.val[0] = vmls_s16(tmp.val[0], a_val.val[1], b_val.val[1]);
+ tmp.val[1] = vmla_s16(tmp.val[1], a_val.val[0], b_val.val[1]);
- accumulator.val[0] = vqadd_s16(accumulator.val[0], tmp.val[0]);
- accumulator.val[1] = vqadd_s16(accumulator.val[1], tmp.val[1]);
+ accumulator.val[0] = vqadd_s16(accumulator.val[0], tmp.val[0]);
+ accumulator.val[1] = vqadd_s16(accumulator.val[1], tmp.val[1]);
- a_ptr += 4;
- b_ptr += 4;
- }
+ a_ptr += 4;
+ b_ptr += 4;
+ }
vst2_s16((int16_t*)accum_result, accumulator);
*out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
// tail case
- for(number = quarter_points * 4; number < num_points; ++number)
- {
- *out += (*a_ptr++) * (*b_ptr++);
- }
+ for (number = quarter_points * 4; number < num_points; ++number) {
+ *out += (*a_ptr++) * (*b_ptr++);
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
{
unsigned int quarter_points = num_points / 4;
unsigned int number;
- lv_16sc_t* a_ptr = (lv_16sc_t*) in_a;
- lv_16sc_t* b_ptr = (lv_16sc_t*) in_b;
+ lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
+ lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
// for 2-lane vectors, 1st lane holds the real part,
// 2nd lane holds the imaginary part
int16x4x2_t a_val, b_val, accumulator1, accumulator2;
accumulator2.val[0] = vdup_n_s16(0);
accumulator2.val[1] = vdup_n_s16(0);
- for(number = 0; number < quarter_points; ++number)
- {
- a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
- b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __VOLK_PREFETCH(a_ptr + 8);
- __VOLK_PREFETCH(b_ptr + 8);
+ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ __VOLK_PREFETCH(a_ptr + 8);
+ __VOLK_PREFETCH(b_ptr + 8);
- // use 2 accumulators to remove inter-instruction data dependencies
- accumulator1.val[0] = vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]);
- accumulator2.val[0] = vmls_s16(accumulator2.val[0], a_val.val[1], b_val.val[1]);
- accumulator1.val[1] = vmla_s16(accumulator1.val[1], a_val.val[0], b_val.val[1]);
- accumulator2.val[1] = vmla_s16(accumulator2.val[1], a_val.val[1], b_val.val[0]);
+ // use 2 accumulators to remove inter-instruction data dependencies
+ accumulator1.val[0] = vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]);
+ accumulator2.val[0] = vmls_s16(accumulator2.val[0], a_val.val[1], b_val.val[1]);
+ accumulator1.val[1] = vmla_s16(accumulator1.val[1], a_val.val[0], b_val.val[1]);
+ accumulator2.val[1] = vmla_s16(accumulator2.val[1], a_val.val[1], b_val.val[0]);
- a_ptr += 4;
- b_ptr += 4;
- }
+ a_ptr += 4;
+ b_ptr += 4;
+ }
accumulator1.val[0] = vqadd_s16(accumulator1.val[0], accumulator2.val[0]);
accumulator1.val[1] = vqadd_s16(accumulator1.val[1], accumulator2.val[1]);
*out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
// tail case
- for(number = quarter_points * 4; number < num_points; ++number)
- {
- *out += (*a_ptr++) * (*b_ptr++);
- }
+ for (number = quarter_points * 4; number < num_points; ++number) {
+ *out += (*a_ptr++) * (*b_ptr++);
+ }
}
#endif /* LV_HAVE_NEON */
*
* \b Overview
*
- * Multiplies two input complex vectors, point-by-point, storing the result in the third vector.
- * WARNING: Saturation is not checked.
+ * Multiplies two input complex vectors, point-by-point, storing the result in the third
+ * vector. WARNING: Saturation is not checked.
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_16ic_x2_multiply_16ic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points);
- * \endcode
+ * void volk_16ic_x2_multiply_16ic(lv_16sc_t* result, const lv_16sc_t* in_a,
+ *                                 const lv_16sc_t* in_b, unsigned int num_points);
+ * \endcode
*
* \b Inputs
* \li in_a: One of the vectors to be multiplied.
* \li in_b: The other vector to be multiplied.
- * \li num_points: The number of complex data points to be multiplied from both input vectors.
+ * \li num_points: The number of complex data points to be multiplied from both input
+ * vectors.
*
* \b Outputs
* \li result: The vector where the results will be stored.
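A minimal usage sketch for the dispatcher documented above; the helper name and buffer size are illustrative, and volk_get_alignment/volk_malloc/volk_free are the standard VOLK allocation helpers:

#include <volk/volk.h>

static void multiply_16ic_example(void) /* hypothetical helper */
{
    unsigned int num_points = 1024;
    size_t alignment = volk_get_alignment();
    lv_16sc_t* in_a = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
    lv_16sc_t* in_b = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
    lv_16sc_t* result = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
    /* ... fill in_a and in_b with 16-bit complex samples ... */
    volk_16ic_x2_multiply_16ic(result, in_a, in_b, num_points);
    volk_free(in_a);
    volk_free(in_b);
    volk_free(result);
}

The dispatcher selects an aligned (_a_) or unaligned (_u_) implementation at run time, which is why the buffers are allocated with volk_malloc here.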
#ifdef LV_HAVE_GENERIC
-static inline void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+static inline void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t* result,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
{
unsigned int n;
- for (n = 0; n < num_points; n++)
- {
- result[n] = in_a[n] * in_b[n];
- }
+ for (n = 0; n < num_points; n++) {
+ result[n] = in_a[n] * in_b[n];
+ }
}
#endif /*LV_HAVE_GENERIC*/
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+static inline void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
{
const unsigned int sse_iters = num_points / 4;
- __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, result;
+ __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
+ result;
- mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
- mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
+ mask_imag = _mm_set_epi8(
+ 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
+ mask_real = _mm_set_epi8(
+ 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
const lv_16sc_t* _in_a = in_a;
const lv_16sc_t* _in_b = in_b;
lv_16sc_t* _out = out;
unsigned int number;
- for(number = 0; number < sse_iters; number++)
- {
- a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
- b = _mm_load_si128((__m128i*)_in_b);
- c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
+ for (number = 0; number < sse_iters; number++) {
+ a = _mm_load_si128(
+ (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+ b = _mm_load_si128((__m128i*)_in_b);
+ c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
- c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
- real = _mm_subs_epi16 (c, c_sr);
- real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
+ c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in
+ // zeros, and store the results in dst.
+ real = _mm_subs_epi16(c, c_sr);
+ real = _mm_and_si128(real,
+ mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
- b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
- a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
+ b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
+ a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
- imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
- imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+ imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+ imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
- imag = _mm_adds_epi16(imag1, imag2);
- imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
+ imag = _mm_adds_epi16(imag1, imag2);
+ imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
- result = _mm_or_si128 (real, imag);
+ result = _mm_or_si128(real, imag);
- _mm_store_si128((__m128i*)_out, result);
+ _mm_store_si128((__m128i*)_out, result);
- _in_a += 4;
- _in_b += 4;
- _out += 4;
- }
+ _in_a += 4;
+ _in_b += 4;
+ _out += 4;
+ }
- for (number = sse_iters * 4; number < num_points; ++number)
- {
- *_out++ = (*_in_a++) * (*_in_b++);
- }
+ for (number = sse_iters * 4; number < num_points; ++number) {
+ *_out++ = (*_in_a++) * (*_in_b++);
+ }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+static inline void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
{
const unsigned int sse_iters = num_points / 4;
- __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result;
+ __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
+ result;
- mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
- mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
+ mask_imag = _mm_set_epi8(
+ 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
+ mask_real = _mm_set_epi8(
+ 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
const lv_16sc_t* _in_a = in_a;
const lv_16sc_t* _in_b = in_b;
lv_16sc_t* _out = out;
unsigned int number;
- for(number = 0; number < sse_iters; number++)
- {
- a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
- b = _mm_loadu_si128((__m128i*)_in_b);
- c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
+ for (number = 0; number < sse_iters; number++) {
+ a = _mm_loadu_si128(
+ (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+ b = _mm_loadu_si128((__m128i*)_in_b);
+ c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
- c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
- real = _mm_subs_epi16 (c, c_sr);
- real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
+ c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in
+ // zeros, and store the results in dst.
+ real = _mm_subs_epi16(c, c_sr);
+ real = _mm_and_si128(real,
+ mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
- b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
- a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
+ b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
+ a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
- imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
- imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+ imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+ imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
- imag = _mm_adds_epi16(imag1, imag2);
- imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
+ imag = _mm_adds_epi16(imag1, imag2);
+ imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
- result = _mm_or_si128 (real, imag);
+ result = _mm_or_si128(real, imag);
- _mm_storeu_si128((__m128i*)_out, result);
+ _mm_storeu_si128((__m128i*)_out, result);
- _in_a += 4;
- _in_b += 4;
- _out += 4;
- }
+ _in_a += 4;
+ _in_b += 4;
+ _out += 4;
+ }
- for (number = sse_iters * 4; number < num_points; ++number)
- {
- *_out++ = (*_in_a++) * (*_in_b++);
- }
+ for (number = sse_iters * 4; number < num_points; ++number) {
+ *_out++ = (*_in_a++) * (*_in_b++);
+ }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
{
unsigned int number = 0;
const unsigned int avx2_points = num_points / 8;
const lv_16sc_t* _in_b = in_b;
lv_16sc_t* _out = out;
- __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;
-
- const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
- const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
-
- for(;number < avx2_points; number++)
- {
- a = _mm256_loadu_si256((__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi
- b = _mm256_loadu_si256((__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di
- c = _mm256_mullo_epi16(a, b);
-
- c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
- real = _mm256_subs_epi16(c, c_sr);
- real = _mm256_and_si256(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
-
- b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
- a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
-
- imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
- imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
-
- imag = _mm256_adds_epi16(imag1, imag2);
- imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
-
- result = _mm256_or_si256(real, imag);
-
- _mm256_storeu_si256((__m256i*)_out, result);
-
- _in_a += 8;
- _in_b += 8;
- _out += 8;
- }
+ __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;
+
+ const __m256i mask_imag = _mm256_set_epi8(0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0);
+ const __m256i mask_real = _mm256_set_epi8(0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF);
+
+ for (; number < avx2_points; number++) {
+ a = _mm256_loadu_si256(
+ (__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ b = _mm256_loadu_si256(
+ (__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di
+ c = _mm256_mullo_epi16(a, b);
+
+ c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in
+ // zeros, and store the results in dst.
+ real = _mm256_subs_epi16(c, c_sr);
+ real = _mm256_and_si256(
+ real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
+
+ b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
+ a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
+
+ imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+ imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+
+ imag = _mm256_adds_epi16(imag1, imag2);
+ imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
+
+ result = _mm256_or_si256(real, imag);
+
+ _mm256_storeu_si256((__m256i*)_out, result);
+
+ _in_a += 8;
+ _in_b += 8;
+ _out += 8;
+ }
_mm256_zeroupper();
number = avx2_points * 8;
- for(;number < num_points; number++)
- {
- *_out++ = (*_in_a++) * (*_in_b++);
- }
+ for (; number < num_points; number++) {
+ *_out++ = (*_in_a++) * (*_in_b++);
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
{
unsigned int number = 0;
const unsigned int avx2_points = num_points / 8;
const lv_16sc_t* _in_b = in_b;
lv_16sc_t* _out = out;
- __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;
-
- const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
- const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
-
- for(;number < avx2_points; number++)
- {
- a = _mm256_load_si256((__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi
- b = _mm256_load_si256((__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di
- c = _mm256_mullo_epi16(a, b);
-
- c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
- real = _mm256_subs_epi16(c, c_sr);
- real = _mm256_and_si256(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
-
- b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
- a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
-
- imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
- imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
-
- imag = _mm256_adds_epi16(imag1, imag2);
- imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
-
- result = _mm256_or_si256(real, imag);
-
- _mm256_store_si256((__m256i*)_out, result);
-
- _in_a += 8;
- _in_b += 8;
- _out += 8;
- }
+ __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;
+
+ const __m256i mask_imag = _mm256_set_epi8(0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0);
+ const __m256i mask_real = _mm256_set_epi8(0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF);
+
+ for (; number < avx2_points; number++) {
+ a = _mm256_load_si256(
+ (__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ b = _mm256_load_si256(
+ (__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di
+ c = _mm256_mullo_epi16(a, b);
+
+ c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in
+ // zeros, and store the results in dst.
+ real = _mm256_subs_epi16(c, c_sr);
+ real = _mm256_and_si256(
+ real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
+
+ b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
+ a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
+
+ imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+ imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+
+ imag = _mm256_adds_epi16(imag1, imag2);
+ imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
+
+ result = _mm256_or_si256(real, imag);
+
+ _mm256_store_si256((__m256i*)_out, result);
+
+ _in_a += 8;
+ _in_b += 8;
+ _out += 8;
+ }
_mm256_zeroupper();
number = avx2_points * 8;
- for(;number < num_points; number++)
- {
- *_out++ = (*_in_a++) * (*_in_b++);
- }
+ for (; number < num_points; number++) {
+ *_out++ = (*_in_a++) * (*_in_b++);
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
{
- lv_16sc_t *a_ptr = (lv_16sc_t*) in_a;
- lv_16sc_t *b_ptr = (lv_16sc_t*) in_b;
+ lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
+ lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
unsigned int quarter_points = num_points / 4;
int16x4x2_t a_val, b_val, c_val;
int16x4x2_t tmp_real, tmp_imag;
unsigned int number = 0;
- for(number = 0; number < quarter_points; ++number)
- {
- a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
- b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __VOLK_PREFETCH(a_ptr + 4);
- __VOLK_PREFETCH(b_ptr + 4);
-
- // multiply the real*real and imag*imag to get real result
- // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
- tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
- // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
- tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);
-
- // Multiply cross terms to get the imaginary result
- // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
- tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
- // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
- tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
-
- // store the results
- c_val.val[0] = vsub_s16(tmp_real.val[0], tmp_real.val[1]);
- c_val.val[1] = vadd_s16(tmp_imag.val[0], tmp_imag.val[1]);
- vst2_s16((int16_t*)out, c_val);
-
- a_ptr += 4;
- b_ptr += 4;
- out += 4;
- }
-
- for(number = quarter_points * 4; number < num_points; number++)
- {
- *out++ = (*a_ptr++) * (*b_ptr++);
- }
+ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ __VOLK_PREFETCH(a_ptr + 4);
+ __VOLK_PREFETCH(b_ptr + 4);
+
+ // multiply the real*real and imag*imag to get real result
+ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
+ tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
+ // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
+ tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);
+
+ // Multiply cross terms to get the imaginary result
+ // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
+ tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
+ // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
+ tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
+
+ // store the results
+ c_val.val[0] = vsub_s16(tmp_real.val[0], tmp_real.val[1]);
+ c_val.val[1] = vadd_s16(tmp_imag.val[0], tmp_imag.val[1]);
+ vst2_s16((int16_t*)out, c_val);
+
+ a_ptr += 4;
+ b_ptr += 4;
+ out += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ *out++ = (*a_ptr++) * (*b_ptr++);
+ }
}
#endif /* LV_HAVE_NEON */
#if LV_HAVE_AVX2
#include <immintrin.h>
-static inline void volk_16u_byteswap_a_avx2(uint16_t* intsToSwap, unsigned int num_points){
- unsigned int number;
+static inline void volk_16u_byteswap_a_avx2(uint16_t* intsToSwap, unsigned int num_points)
+{
+ unsigned int number;
- const unsigned int nPerSet = 16;
- const uint64_t nSets = num_points / nPerSet;
+ const unsigned int nPerSet = 16;
+ const uint64_t nSets = num_points / nPerSet;
- uint16_t* inputPtr = (uint16_t*) intsToSwap;
+ uint16_t* inputPtr = (uint16_t*)intsToSwap;
- const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30};
+ const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11,
+ 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
+ 23, 22, 25, 24, 27, 26, 29, 28, 31, 30 };
- const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector[0]);
+ const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
- for(number = 0; number < nSets; number++) {
- // Load the 32t values, increment inputPtr later since we're doing it in-place.
- const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
- const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
+ for (number = 0; number < nSets; number++) {
+ // Load the next 32 bytes (16 values); inputPtr is incremented later since the swap is done in-place.
+ const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
+ const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
- // Store the results
- _mm256_store_si256((__m256i*)inputPtr, output);
- inputPtr += nPerSet;
- }
+ // Store the results
+ _mm256_store_si256((__m256i*)inputPtr, output);
+ inputPtr += nPerSet;
+ }
- _mm256_zeroupper();
+ _mm256_zeroupper();
- // Byteswap any remaining points:
- for(number = nPerSet * nSets; number < num_points; number++) {
- uint16_t outputVal = *inputPtr;
- outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
- *inputPtr = outputVal;
- inputPtr++;
- }
+ // Byteswap any remaining points:
+ for (number = nPerSet * nSets; number < num_points; number++) {
+ uint16_t outputVal = *inputPtr;
+ outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
+ *inputPtr = outputVal;
+ inputPtr++;
+ }
}
#endif /* LV_HAVE_AVX2 */
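The shuffle control above lists, for every destination byte, the source byte to copy, so the {1, 0, 3, 2, ...} pattern swaps the two bytes of each 16-bit element. A scalar sketch of the same permutation (hypothetical helper):

#include <stdint.h>

/* dst byte i takes src byte (i ^ 1), i.e. each adjacent byte pair is
 * swapped, which byteswaps every uint16_t in a 32-byte block.        */
static inline void byte_pair_swap_sketch(uint8_t bytes[32])
{
    for (int i = 0; i < 32; i += 2) {
        uint8_t t = bytes[i];
        bytes[i] = bytes[i + 1];
        bytes[i + 1] = t;
    }
}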
#if LV_HAVE_AVX2
#include <immintrin.h>
-static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int num_points){
- unsigned int number;
+static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int num_points)
+{
+ unsigned int number;
- const unsigned int nPerSet = 16;
- const uint64_t nSets = num_points / nPerSet;
+ const unsigned int nPerSet = 16;
+ const uint64_t nSets = num_points / nPerSet;
- uint16_t* inputPtr = (uint16_t*) intsToSwap;
+ uint16_t* inputPtr = (uint16_t*)intsToSwap;
- const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30};
+ const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11,
+ 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
+ 23, 22, 25, 24, 27, 26, 29, 28, 31, 30 };
- const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector[0]);
+ const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
- for (number = 0; number < nSets; number++) {
- // Load the 32t values, increment inputPtr later since we're doing it in-place.
- const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
- const __m256i output = _mm256_shuffle_epi8(input,myShuffle);
+ for (number = 0; number < nSets; number++) {
+ // Load the next 32 bytes (16 values); inputPtr is incremented later since the swap is done in-place.
+ const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
+ const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
- // Store the results
- _mm256_storeu_si256((__m256i*)inputPtr, output);
- inputPtr += nPerSet;
- }
+ // Store the results
+ _mm256_storeu_si256((__m256i*)inputPtr, output);
+ inputPtr += nPerSet;
+ }
- _mm256_zeroupper();
+ _mm256_zeroupper();
- // Byteswap any remaining points:
- for(number = nPerSet * nSets; number < num_points; number++) {
- uint16_t outputVal = *inputPtr;
- outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
- *inputPtr = outputVal;
- inputPtr++;
- }
+ // Byteswap any remaining points:
+ for (number = nPerSet * nSets; number < num_points; number++) {
+ uint16_t outputVal = *inputPtr;
+ outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
+ *inputPtr = outputVal;
+ inputPtr++;
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points){
- unsigned int number = 0;
- uint16_t* inputPtr = intsToSwap;
- __m128i input, left, right, output;
-
- const unsigned int eighthPoints = num_points / 8;
- for(;number < eighthPoints; number++){
- // Load the 16t values, increment inputPtr later since we're doing it in-place.
- input = _mm_loadu_si128((__m128i*)inputPtr);
- // Do the two shifts
- left = _mm_slli_epi16(input, 8);
- right = _mm_srli_epi16(input, 8);
- // Or the left and right halves together
- output = _mm_or_si128(left, right);
- // Store the results
- _mm_storeu_si128((__m128i*)inputPtr, output);
- inputPtr += 8;
- }
-
- // Byteswap any remaining points:
- number = eighthPoints*8;
- for(; number < num_points; number++){
- uint16_t outputVal = *inputPtr;
- outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
- *inputPtr = outputVal;
- inputPtr++;
- }
+static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points)
+{
+ unsigned int number = 0;
+ uint16_t* inputPtr = intsToSwap;
+ __m128i input, left, right, output;
+
+ const unsigned int eighthPoints = num_points / 8;
+ for (; number < eighthPoints; number++) {
+ // Load the next 16 bytes (8 values); inputPtr is incremented later since the swap is done in-place.
+ input = _mm_loadu_si128((__m128i*)inputPtr);
+ // Do the two shifts
+ left = _mm_slli_epi16(input, 8);
+ right = _mm_srli_epi16(input, 8);
+ // Or the left and right halves together
+ output = _mm_or_si128(left, right);
+ // Store the results
+ _mm_storeu_si128((__m128i*)inputPtr, output);
+ inputPtr += 8;
+ }
+
+ // Byteswap any remaining points:
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ uint16_t outputVal = *inputPtr;
+ outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
+ *inputPtr = outputVal;
+ inputPtr++;
+ }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
-static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap, unsigned int num_points){
- unsigned int point;
- uint16_t* inputPtr = intsToSwap;
- for(point = 0; point < num_points; point++){
- uint16_t output = *inputPtr;
- output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
- *inputPtr = output;
- inputPtr++;
- }
+static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap,
+ unsigned int num_points)
+{
+ unsigned int point;
+ uint16_t* inputPtr = intsToSwap;
+ for (point = 0; point < num_points; point++) {
+ uint16_t output = *inputPtr;
+ output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
+ *inputPtr = output;
+ inputPtr++;
+ }
}
#endif /* LV_HAVE_GENERIC */
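A quick worked example of the shift/mask expression used by the generic kernel above (hypothetical helper):

#include <stdint.h>

static inline uint16_t byteswap_example(void)
{
    const uint16_t v = 0x1234;
    /* (v >> 8) & 0xff = 0x0012 and (v << 8) & 0xff00 = 0x3400 */
    return (uint16_t)(((v >> 8) & 0xff) | ((v << 8) & 0xff00)); /* 0x3412 */
}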
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int num_points){
- unsigned int number = 0;
- uint16_t* inputPtr = intsToSwap;
- __m128i input, left, right, output;
-
- const unsigned int eighthPoints = num_points / 8;
- for(;number < eighthPoints; number++){
- // Load the 16t values, increment inputPtr later since we're doing it in-place.
- input = _mm_load_si128((__m128i*)inputPtr);
- // Do the two shifts
- left = _mm_slli_epi16(input, 8);
- right = _mm_srli_epi16(input, 8);
- // Or the left and right halves together
- output = _mm_or_si128(left, right);
- // Store the results
- _mm_store_si128((__m128i*)inputPtr, output);
- inputPtr += 8;
- }
-
-
- // Byteswap any remaining points:
- number = eighthPoints*8;
- for(; number < num_points; number++){
- uint16_t outputVal = *inputPtr;
- outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
- *inputPtr = outputVal;
- inputPtr++;
- }
+static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int num_points)
+{
+ unsigned int number = 0;
+ uint16_t* inputPtr = intsToSwap;
+ __m128i input, left, right, output;
+
+ const unsigned int eighthPoints = num_points / 8;
+ for (; number < eighthPoints; number++) {
+ // Load the next 16 bytes (8 values); inputPtr is incremented later since the swap is done in-place.
+ input = _mm_load_si128((__m128i*)inputPtr);
+ // Do the two shifts
+ left = _mm_slli_epi16(input, 8);
+ right = _mm_srli_epi16(input, 8);
+ // Or the left and right halves together
+ output = _mm_or_si128(left, right);
+ // Store the results
+ _mm_store_si128((__m128i*)inputPtr, output);
+ inputPtr += 8;
+ }
+
+
+ // Byteswap any remaining points:
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ uint16_t outputVal = *inputPtr;
+ outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
+ *inputPtr = outputVal;
+ inputPtr++;
+ }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num_points){
- unsigned int number;
- unsigned int eighth_points = num_points / 8;
- uint16x8_t input, output;
- uint16_t* inputPtr = intsToSwap;
-
- for(number = 0; number < eighth_points; number++) {
- input = vld1q_u16(inputPtr);
- output = vsriq_n_u16(output, input, 8);
- output = vsliq_n_u16(output, input, 8);
- vst1q_u16(inputPtr, output);
- inputPtr += 8;
- }
-
- for(number = eighth_points * 8; number < num_points; number++){
- uint16_t output = *inputPtr;
- output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
- *inputPtr = output;
- inputPtr++;
- }
+static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num_points)
+{
+ unsigned int number;
+ unsigned int eighth_points = num_points / 8;
+ uint16x8_t input, output;
+ uint16_t* inputPtr = intsToSwap;
+
+ for (number = 0; number < eighth_points; number++) {
+ input = vld1q_u16(inputPtr);
+ output = vsriq_n_u16(output, input, 8);
+ output = vsliq_n_u16(output, input, 8);
+ vst1q_u16(inputPtr, output);
+ inputPtr += 8;
+ }
+
+ for (number = eighth_points * 8; number < num_points; number++) {
+ uint16_t output = *inputPtr;
+ output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
+ *inputPtr = output;
+ inputPtr++;
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void volk_16u_byteswap_neon_table(uint16_t* intsToSwap, unsigned int num_points){
- uint16_t* inputPtr = intsToSwap;
- unsigned int number = 0;
- unsigned int n16points = num_points / 16;
-
- uint8x8x4_t input_table;
- uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
- uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
-
- /* these magic numbers are used as byte-indices in the LUT.
- they are pre-computed to save time. A simple C program
- can calculate them; for example for lookup01:
- uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1};
- for(ii=0; ii < 8; ++ii) {
- index += ((uint64_t)(*(chars+ii))) << (ii*8);
+static inline void volk_16u_byteswap_neon_table(uint16_t* intsToSwap,
+ unsigned int num_points)
+{
+ uint16_t* inputPtr = intsToSwap;
+ unsigned int number = 0;
+ unsigned int n16points = num_points / 16;
+
+ uint8x8x4_t input_table;
+ uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
+ uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
+
+ /* these magic numbers are used as byte-indices in the LUT.
+ they are pre-computed to save time. A simple C program
+ can calculate them; for example for lookup01:
+ uint8_t chars[8] = {8, 0, 24, 16, 9, 1, 25, 17};
+ for(ii=0; ii < 8; ++ii) {
+ index += ((uint64_t)(*(chars+ii))) << (ii*8);
+ }
+ */
+ int_lookup01 = vcreate_u8(1232017111498883080);
+ int_lookup23 = vcreate_u8(1376697457175036426);
+ int_lookup45 = vcreate_u8(1521377802851189772);
+ int_lookup67 = vcreate_u8(1666058148527343118);
+
+ for (number = 0; number < n16points; ++number) {
+ input_table = vld4_u8((uint8_t*)inputPtr);
+ swapped_int01 = vtbl4_u8(input_table, int_lookup01);
+ swapped_int23 = vtbl4_u8(input_table, int_lookup23);
+ swapped_int45 = vtbl4_u8(input_table, int_lookup45);
+ swapped_int67 = vtbl4_u8(input_table, int_lookup67);
+ vst1_u8((uint8_t*)inputPtr, swapped_int01);
+ vst1_u8((uint8_t*)(inputPtr + 4), swapped_int23);
+ vst1_u8((uint8_t*)(inputPtr + 8), swapped_int45);
+ vst1_u8((uint8_t*)(inputPtr + 12), swapped_int67);
+
+ inputPtr += 16;
+ }
+
+ for (number = n16points * 16; number < num_points; ++number) {
+ uint16_t output = *inputPtr;
+ output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
+ *inputPtr = output;
+ inputPtr++;
}
- */
- int_lookup01 = vcreate_u8(1232017111498883080);
- int_lookup23 = vcreate_u8(1376697457175036426);
- int_lookup45 = vcreate_u8(1521377802851189772);
- int_lookup67 = vcreate_u8(1666058148527343118);
-
- for(number = 0; number < n16points; ++number){
- input_table = vld4_u8((uint8_t*) inputPtr);
- swapped_int01 = vtbl4_u8(input_table, int_lookup01);
- swapped_int23 = vtbl4_u8(input_table, int_lookup23);
- swapped_int45 = vtbl4_u8(input_table, int_lookup45);
- swapped_int67 = vtbl4_u8(input_table, int_lookup67);
- vst1_u8((uint8_t*)inputPtr, swapped_int01);
- vst1_u8((uint8_t*)(inputPtr+4), swapped_int23);
- vst1_u8((uint8_t*)(inputPtr+8), swapped_int45);
- vst1_u8((uint8_t*)(inputPtr+12), swapped_int67);
-
- inputPtr += 16;
- }
-
- for(number = n16points * 16; number < num_points; ++number){
- uint16_t output = *inputPtr;
- output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
- *inputPtr = output;
- inputPtr++;
- }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_GENERIC
-static inline void volk_16u_byteswap_a_generic(uint16_t* intsToSwap, unsigned int num_points){
- unsigned int point;
- uint16_t* inputPtr = intsToSwap;
- for(point = 0; point < num_points; point++){
- uint16_t output = *inputPtr;
- output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
- *inputPtr = output;
- inputPtr++;
- }
+static inline void volk_16u_byteswap_a_generic(uint16_t* intsToSwap,
+ unsigned int num_points)
+{
+ unsigned int point;
+ uint16_t* inputPtr = intsToSwap;
+ for (point = 0; point < num_points; point++) {
+ uint16_t output = *inputPtr;
+ output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
+ *inputPtr = output;
+ inputPtr++;
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_ORC
extern void volk_16u_byteswap_a_orc_impl(uint16_t* intsToSwap, unsigned int num_points);
-static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int num_points){
+static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int num_points)
+{
volk_16u_byteswap_a_orc_impl(intsToSwap, num_points);
}
#endif /* LV_HAVE_ORC */
#include <stdint.h>
-#include <volk/volk_16u_byteswap.h>
#include <string.h>
+#include <volk/volk_16u_byteswap.h>
#ifdef LV_HAVE_GENERIC
-static inline void volk_16u_byteswappuppet_16u_generic(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){
+static inline void volk_16u_byteswappuppet_16u_generic(uint16_t* output,
+ uint16_t* intsToSwap,
+ unsigned int num_points)
+{
volk_16u_byteswap_generic((uint16_t*)intsToSwap, num_points);
memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
-
}
#endif
#ifdef LV_HAVE_NEON
-static inline void volk_16u_byteswappuppet_16u_neon(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){
+static inline void volk_16u_byteswappuppet_16u_neon(uint16_t* output,
+ uint16_t* intsToSwap,
+ unsigned int num_points)
+{
volk_16u_byteswap_neon((uint16_t*)intsToSwap, num_points);
memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
-
}
#endif
#ifdef LV_HAVE_NEON
-static inline void volk_16u_byteswappuppet_16u_neon_table(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){
+static inline void volk_16u_byteswappuppet_16u_neon_table(uint16_t* output,
+ uint16_t* intsToSwap,
+ unsigned int num_points)
+{
volk_16u_byteswap_neon_table((uint16_t*)intsToSwap, num_points);
memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
-
}
#endif
#ifdef LV_HAVE_SSE2
-static inline void volk_16u_byteswappuppet_16u_u_sse2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){
+static inline void volk_16u_byteswappuppet_16u_u_sse2(uint16_t* output,
+ uint16_t* intsToSwap,
+ unsigned int num_points)
+{
volk_16u_byteswap_u_sse2((uint16_t*)intsToSwap, num_points);
memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
-
}
#endif
#ifdef LV_HAVE_SSE2
-static inline void volk_16u_byteswappuppet_16u_a_sse2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){
+static inline void volk_16u_byteswappuppet_16u_a_sse2(uint16_t* output,
+ uint16_t* intsToSwap,
+ unsigned int num_points)
+{
volk_16u_byteswap_a_sse2((uint16_t*)intsToSwap, num_points);
memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
-
}
#endif
#ifdef LV_HAVE_AVX2
-static inline void volk_16u_byteswappuppet_16u_u_avx2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){
+static inline void volk_16u_byteswappuppet_16u_u_avx2(uint16_t* output,
+ uint16_t* intsToSwap,
+ unsigned int num_points)
+{
volk_16u_byteswap_u_avx2((uint16_t*)intsToSwap, num_points);
memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
-
}
#endif
#ifdef LV_HAVE_AVX2
-static inline void volk_16u_byteswappuppet_16u_a_avx2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){
+static inline void volk_16u_byteswappuppet_16u_a_avx2(uint16_t* output,
+ uint16_t* intsToSwap,
+ unsigned int num_points)
+{
volk_16u_byteswap_a_avx2((uint16_t*)intsToSwap, num_points);
memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
-
}
#endif
#ifdef LV_HAVE_GENERIC
-static inline void volk_32f_64f_add_64f_generic(double *cVector,
- const float *aVector,
- const double *bVector,
- unsigned int num_points) {
- double *cPtr = cVector;
- const float *aPtr = aVector;
- const double *bPtr = bVector;
- unsigned int number = 0;
-
- for (number = 0; number < num_points; number++) {
- *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
- }
+static inline void volk_32f_64f_add_64f_generic(double* cVector,
+ const float* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ double* cPtr = cVector;
+ const float* aPtr = aVector;
+ const double* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>
-static inline void volk_32f_64f_add_64f_neon(double *cVector,
- const float *aVector,
- const double *bVector,
- unsigned int num_points) {
- unsigned int number = 0;
- const unsigned int half_points = num_points / 2;
-
- double *cPtr = cVector;
- const float *aPtr = aVector;
- const double *bPtr = bVector;
-
- float64x2_t aVal, bVal, cVal;
- float32x2_t aVal1;
- for (number = 0; number < half_points; number++) {
- // Load in to NEON registers
- aVal1 = vld1_f32(aPtr);
- bVal = vld1q_f64(bPtr);
- __VOLK_PREFETCH(aPtr + 2);
- __VOLK_PREFETCH(bPtr + 2);
- aPtr += 2; // q uses quadwords, 4 floats per vadd
- bPtr += 2;
-
- // Vector conversion
- aVal = vcvt_f64_f32(aVal1);
- // vector add
- cVal = vaddq_f64(aVal, bVal);
- // Store the results back into the C container
- vst1q_f64(cPtr, cVal);
-
- cPtr += 2;
- }
-
- number = half_points * 2; // should be = num_points
- for (; number < num_points; number++) {
- *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
- }
+static inline void volk_32f_64f_add_64f_neon(double* cVector,
+ const float* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int half_points = num_points / 2;
+
+ double* cPtr = cVector;
+ const float* aPtr = aVector;
+ const double* bPtr = bVector;
+
+ float64x2_t aVal, bVal, cVal;
+ float32x2_t aVal1;
+ for (number = 0; number < half_points; number++) {
+ // Load in to NEON registers
+ aVal1 = vld1_f32(aPtr);
+ bVal = vld1q_f64(bPtr);
+ __VOLK_PREFETCH(aPtr + 2);
+ __VOLK_PREFETCH(bPtr + 2);
+        aPtr += 2; // q registers are 128 bits wide: two doubles per vaddq_f64
+ bPtr += 2;
+
+ // Vector conversion
+ aVal = vcvt_f64_f32(aVal1);
+ // vector add
+ cVal = vaddq_f64(aVal, bVal);
+ // Store the results back into the C container
+ vst1q_f64(cPtr, cVal);
+
+ cPtr += 2;
+ }
+
+ number = half_points * 2; // should be = num_points
+ for (; number < num_points; number++) {
+ *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
+ }
}
#endif /* LV_HAVE_NEONV8 */
#include <immintrin.h>
#include <xmmintrin.h>
-static inline void volk_32f_64f_add_64f_u_avx(double *cVector,
- const float *aVector,
- const double *bVector,
- unsigned int num_points) {
- unsigned int number = 0;
- const unsigned int eighth_points = num_points / 8;
-
- double *cPtr = cVector;
- const float *aPtr = aVector;
- const double *bPtr = bVector;
-
- __m256 aVal;
- __m128 aVal1, aVal2;
- __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
- for (; number < eighth_points; number++) {
-
- aVal = _mm256_loadu_ps(aPtr);
- bVal1 = _mm256_loadu_pd(bPtr);
- bVal2 = _mm256_loadu_pd(bPtr + 4);
-
- aVal1 = _mm256_extractf128_ps(aVal, 0);
- aVal2 = _mm256_extractf128_ps(aVal, 1);
-
- aDbl1 = _mm256_cvtps_pd(aVal1);
- aDbl2 = _mm256_cvtps_pd(aVal2);
-
- cVal1 = _mm256_add_pd(aDbl1, bVal1);
- cVal2 = _mm256_add_pd(aDbl2, bVal2);
-
- _mm256_storeu_pd(cPtr,
- cVal1); // Store the results back into the C container
- _mm256_storeu_pd(cPtr + 4,
- cVal2); // Store the results back into the C container
-
- aPtr += 8;
- bPtr += 8;
- cPtr += 8;
- }
-
- number = eighth_points * 8;
- for (; number < num_points; number++) {
- *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
- }
+static inline void volk_32f_64f_add_64f_u_avx(double* cVector,
+ const float* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighth_points = num_points / 8;
+
+ double* cPtr = cVector;
+ const float* aPtr = aVector;
+ const double* bPtr = bVector;
+
+ __m256 aVal;
+ __m128 aVal1, aVal2;
+ __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
+ for (; number < eighth_points; number++) {
+
+ aVal = _mm256_loadu_ps(aPtr);
+ bVal1 = _mm256_loadu_pd(bPtr);
+ bVal2 = _mm256_loadu_pd(bPtr + 4);
+
+ aVal1 = _mm256_extractf128_ps(aVal, 0);
+ aVal2 = _mm256_extractf128_ps(aVal, 1);
+
+ aDbl1 = _mm256_cvtps_pd(aVal1);
+ aDbl2 = _mm256_cvtps_pd(aVal2);
+
+ cVal1 = _mm256_add_pd(aDbl1, bVal1);
+ cVal2 = _mm256_add_pd(aDbl2, bVal2);
+
+ _mm256_storeu_pd(cPtr,
+ cVal1); // Store the results back into the C container
+ _mm256_storeu_pd(cPtr + 4,
+ cVal2); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighth_points * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
+ }
}
#endif /* LV_HAVE_AVX */
#include <immintrin.h>
#include <xmmintrin.h>
-static inline void volk_32f_64f_add_64f_a_avx(double *cVector,
- const float *aVector,
- const double *bVector,
- unsigned int num_points) {
- unsigned int number = 0;
- const unsigned int eighth_points = num_points / 8;
-
- double *cPtr = cVector;
- const float *aPtr = aVector;
- const double *bPtr = bVector;
-
- __m256 aVal;
- __m128 aVal1, aVal2;
- __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
- for (; number < eighth_points; number++) {
-
- aVal = _mm256_load_ps(aPtr);
- bVal1 = _mm256_load_pd(bPtr);
- bVal2 = _mm256_load_pd(bPtr + 4);
-
- aVal1 = _mm256_extractf128_ps(aVal, 0);
- aVal2 = _mm256_extractf128_ps(aVal, 1);
-
- aDbl1 = _mm256_cvtps_pd(aVal1);
- aDbl2 = _mm256_cvtps_pd(aVal2);
-
- cVal1 = _mm256_add_pd(aDbl1, bVal1);
- cVal2 = _mm256_add_pd(aDbl2, bVal2);
-
- _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container
- _mm256_store_pd(cPtr + 4,
- cVal2); // Store the results back into the C container
-
- aPtr += 8;
- bPtr += 8;
- cPtr += 8;
- }
-
- number = eighth_points * 8;
- for (; number < num_points; number++) {
- *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
- }
+static inline void volk_32f_64f_add_64f_a_avx(double* cVector,
+ const float* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighth_points = num_points / 8;
+
+ double* cPtr = cVector;
+ const float* aPtr = aVector;
+ const double* bPtr = bVector;
+
+ __m256 aVal;
+ __m128 aVal1, aVal2;
+ __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
+ for (; number < eighth_points; number++) {
+
+ aVal = _mm256_load_ps(aPtr);
+ bVal1 = _mm256_load_pd(bPtr);
+ bVal2 = _mm256_load_pd(bPtr + 4);
+
+ aVal1 = _mm256_extractf128_ps(aVal, 0);
+ aVal2 = _mm256_extractf128_ps(aVal, 1);
+
+ aDbl1 = _mm256_cvtps_pd(aVal1);
+ aDbl2 = _mm256_cvtps_pd(aVal2);
+
+ cVal1 = _mm256_add_pd(aDbl1, bVal1);
+ cVal2 = _mm256_add_pd(aDbl2, bVal2);
+
+ _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container
+ _mm256_store_pd(cPtr + 4,
+ cVal2); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighth_points * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
+ }
}
#endif /* LV_HAVE_AVX */
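The AVX protokernels above widen eight floats per iteration: one __m256 is split into two __m128 halves, each converted to four doubles with _mm256_cvtps_pd, added to the matching __m256d operands, and any remainder is handled by the scalar tail loop. A hedged usage sketch of the dispatcher, assuming the usual VOLK naming convention (volk_32f_64f_add_64f) and the volk_malloc/volk_free helpers:

#include <volk/volk.h>

void add_example(unsigned int num_points)
{
    // aligned buffers so the _a_ protokernels can be selected by the dispatcher
    float* in32 =
        (float*)volk_malloc(sizeof(float) * num_points, volk_get_alignment());
    double* in64 =
        (double*)volk_malloc(sizeof(double) * num_points, volk_get_alignment());
    double* out =
        (double*)volk_malloc(sizeof(double) * num_points, volk_get_alignment());

    // ... fill in32 and in64 with data ...

    volk_32f_64f_add_64f(out, in32, in64, num_points); // picks the best protokernel

    volk_free(in32);
    volk_free(in64);
    volk_free(out);
}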
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_64f_multiply_64f(double* cVector, const double* aVector, const double* bVector, unsigned int num_points)
- * \endcode
+ * void volk_32f_64f_multiply_64f(double* cVector, const double* aVector, const double*
+ * bVector, unsigned int num_points) \endcode
*
* \b Inputs
* \li aVector: First input vector.
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_64f_multiply_64f_generic(double *cVector, const float *aVector,
- const double *bVector, unsigned int num_points)
+static inline void volk_32f_64f_multiply_64f_generic(double* cVector,
+ const float* aVector,
+ const double* bVector,
+ unsigned int num_points)
{
- double *cPtr = cVector;
- const float *aPtr = aVector;
- const double *bPtr = bVector;
- unsigned int number = 0;
-
- for (number = 0; number < num_points; number++) {
- *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
- }
+ double* cPtr = cVector;
+ const float* aPtr = aVector;
+ const double* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
#include <immintrin.h>
#include <xmmintrin.h>
-static inline void
-volk_32f_64f_multiply_64f_u_avx(double *cVector, const float *aVector,
- const double *bVector, unsigned int num_points)
+static inline void volk_32f_64f_multiply_64f_u_avx(double* cVector,
+ const float* aVector,
+ const double* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighth_points = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighth_points = num_points / 8;
- double *cPtr = cVector;
- const float *aPtr = aVector;
- const double *bPtr = bVector;
+ double* cPtr = cVector;
+ const float* aPtr = aVector;
+ const double* bPtr = bVector;
- __m256 aVal;
- __m128 aVal1, aVal2;
- __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
- for (; number < eighth_points; number++) {
+ __m256 aVal;
+ __m128 aVal1, aVal2;
+ __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
+ for (; number < eighth_points; number++) {
- aVal = _mm256_loadu_ps(aPtr);
- bVal1 = _mm256_loadu_pd(bPtr);
- bVal2 = _mm256_loadu_pd(bPtr+4);
+ aVal = _mm256_loadu_ps(aPtr);
+ bVal1 = _mm256_loadu_pd(bPtr);
+ bVal2 = _mm256_loadu_pd(bPtr + 4);
- aVal1 = _mm256_extractf128_ps(aVal, 0);
- aVal2 = _mm256_extractf128_ps(aVal, 1);
+ aVal1 = _mm256_extractf128_ps(aVal, 0);
+ aVal2 = _mm256_extractf128_ps(aVal, 1);
- aDbl1 = _mm256_cvtps_pd(aVal1);
- aDbl2 = _mm256_cvtps_pd(aVal2);
+ aDbl1 = _mm256_cvtps_pd(aVal1);
+ aDbl2 = _mm256_cvtps_pd(aVal2);
- cVal1 = _mm256_mul_pd(aDbl1, bVal1);
- cVal2 = _mm256_mul_pd(aDbl2, bVal2);
+ cVal1 = _mm256_mul_pd(aDbl1, bVal1);
+ cVal2 = _mm256_mul_pd(aDbl2, bVal2);
- _mm256_storeu_pd(cPtr, cVal1); // Store the results back into the C container
- _mm256_storeu_pd(cPtr+4, cVal2); // Store the results back into the C container
+ _mm256_storeu_pd(cPtr, cVal1); // Store the results back into the C container
+ _mm256_storeu_pd(cPtr + 4, cVal2); // Store the results back into the C container
- aPtr += 8;
- bPtr += 8;
- cPtr += 8;
- }
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
- number = eighth_points * 8;
- for (; number < num_points; number++) {
- *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
- }
+ number = eighth_points * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
+ }
}
#endif /* LV_HAVE_AVX */
#include <immintrin.h>
#include <xmmintrin.h>
-static inline void
-volk_32f_64f_multiply_64f_a_avx(double *cVector, const float *aVector,
- const double *bVector, unsigned int num_points)
+static inline void volk_32f_64f_multiply_64f_a_avx(double* cVector,
+ const float* aVector,
+ const double* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighth_points = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighth_points = num_points / 8;
- double *cPtr = cVector;
- const float *aPtr = aVector;
- const double *bPtr = bVector;
+ double* cPtr = cVector;
+ const float* aPtr = aVector;
+ const double* bPtr = bVector;
- __m256 aVal;
- __m128 aVal1, aVal2;
- __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
- for (; number < eighth_points; number++) {
+ __m256 aVal;
+ __m128 aVal1, aVal2;
+ __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
+ for (; number < eighth_points; number++) {
- aVal = _mm256_load_ps(aPtr);
- bVal1 = _mm256_load_pd(bPtr);
- bVal2 = _mm256_load_pd(bPtr+4);
+ aVal = _mm256_load_ps(aPtr);
+ bVal1 = _mm256_load_pd(bPtr);
+ bVal2 = _mm256_load_pd(bPtr + 4);
- aVal1 = _mm256_extractf128_ps(aVal, 0);
- aVal2 = _mm256_extractf128_ps(aVal, 1);
+ aVal1 = _mm256_extractf128_ps(aVal, 0);
+ aVal2 = _mm256_extractf128_ps(aVal, 1);
- aDbl1 = _mm256_cvtps_pd(aVal1);
- aDbl2 = _mm256_cvtps_pd(aVal2);
+ aDbl1 = _mm256_cvtps_pd(aVal1);
+ aDbl2 = _mm256_cvtps_pd(aVal2);
- cVal1 = _mm256_mul_pd(aDbl1, bVal1);
- cVal2 = _mm256_mul_pd(aDbl2, bVal2);
+ cVal1 = _mm256_mul_pd(aDbl1, bVal1);
+ cVal2 = _mm256_mul_pd(aDbl2, bVal2);
- _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container
- _mm256_store_pd(cPtr+4, cVal2); // Store the results back into the C container
+ _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container
+ _mm256_store_pd(cPtr + 4, cVal2); // Store the results back into the C container
- aPtr += 8;
- bPtr += 8;
- cPtr += 8;
- }
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
- number = eighth_points * 8;
- for (; number < num_points; number++) {
- *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
- }
+ number = eighth_points * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
+ }
}
#endif /* LV_HAVE_AVX */
-
#endif /* INCLUDED_volk_32f_64f_multiply_64f_u_H */
* int frame_exp = 10;
* int frame_size = 0x01 << frame_exp;
*
- * float* llrs = (float*) volk_malloc(sizeof(float) * frame_size * (frame_exp + 1), volk_get_alignment());
- * unsigned char* u = (unsigned char) volk_malloc(sizeof(unsigned char) * frame_size * (frame_exp + 1), volk_get_alignment());
+ * float* llrs = (float*) volk_malloc(sizeof(float) * frame_size * (frame_exp + 1),
+ * volk_get_alignment()); unsigned char* u = (unsigned char) volk_malloc(sizeof(unsigned
+ * char) * frame_size * (frame_exp + 1), volk_get_alignment());
*
- * {some_function_to_write_encoded_bits_to_float_llrs(llrs + frame_size * frame_exp, data)};
+ * {some_function_to_write_encoded_bits_to_float_llrs(llrs + frame_size * frame_exp,
+ * data)};
*
* unsigned int u_num;
* for(u_num = 0; u_num < frame_size; u_num++){
- * volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_size, frame_exp, 0, u_num, u_num);
+ * volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_size, frame_exp, 0, u_num,
+ * u_num);
* // next line could first search for frozen bit value and then do bit decision.
* u[u_num] = llrs[u_num] > 0 ? 0 : 1;
* }
#include <math.h>
#include <volk/volk_8u_x2_encodeframepolar_8u.h>
-static inline float
-llr_odd(const float la, const float lb)
+static inline float llr_odd(const float la, const float lb)
{
- const float ala = fabsf(la);
- const float alb = fabsf(lb);
- return copysignf(1.0f, la) * copysignf(1.0f, lb) * (ala > alb ? alb : ala);
+ const float ala = fabsf(la);
+ const float alb = fabsf(lb);
+ return copysignf(1.0f, la) * copysignf(1.0f, lb) * (ala > alb ? alb : ala);
}
-static inline void
-llr_odd_stages(float* llrs, int min_stage, const int depth, const int frame_size, const int row)
+static inline void llr_odd_stages(
+ float* llrs, int min_stage, const int depth, const int frame_size, const int row)
{
- int loop_stage = depth - 1;
- float* dst_llr_ptr;
- float* src_llr_ptr;
- int stage_size = 0x01 << loop_stage;
-
- int el;
- while(min_stage <= loop_stage){
- dst_llr_ptr = llrs + loop_stage * frame_size + row;
- src_llr_ptr = dst_llr_ptr + frame_size;
- for(el = 0; el < stage_size; el++){
- *dst_llr_ptr++ = llr_odd(*src_llr_ptr, *(src_llr_ptr + 1));
- src_llr_ptr += 2;
+ int loop_stage = depth - 1;
+ float* dst_llr_ptr;
+ float* src_llr_ptr;
+ int stage_size = 0x01 << loop_stage;
+
+ int el;
+ while (min_stage <= loop_stage) {
+ dst_llr_ptr = llrs + loop_stage * frame_size + row;
+ src_llr_ptr = dst_llr_ptr + frame_size;
+ for (el = 0; el < stage_size; el++) {
+ *dst_llr_ptr++ = llr_odd(*src_llr_ptr, *(src_llr_ptr + 1));
+ src_llr_ptr += 2;
+ }
+
+ --loop_stage;
+ stage_size >>= 1;
}
-
- --loop_stage;
- stage_size >>= 1;
- }
}
-static inline float
-llr_even(const float la, const float lb, const unsigned char f)
+static inline float llr_even(const float la, const float lb, const unsigned char f)
{
- switch(f){
+ switch (f) {
case 0:
- return lb + la;
+ return lb + la;
default:
- return lb - la;
- }
+ return lb - la;
+ }
}
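llr_odd and llr_even are the standard min-sum update rules of successive-cancellation polar decoding: llr_odd is the sign-magnitude "f" rule and llr_even the partial-sum-dependent "g" rule. A small worked check (arbitrary values, assuming the two helpers above are in scope):

#include <assert.h>

static void llr_update_example(void)
{
    // f-rule: sign(la) * sign(lb) * min(|la|, |lb|)
    assert(llr_odd(2.0f, -3.0f) == -2.0f);
    // g-rule: lb + la when the partial-sum bit is 0, lb - la when it is 1
    assert(llr_even(2.0f, -3.0f, 0) == -1.0f);
    assert(llr_even(2.0f, -3.0f, 1) == -5.0f);
}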
static inline void
even_u_values(unsigned char* u_even, const unsigned char* u, const int u_num)
{
- u++;
- int i;
- for(i = 1; i < u_num; i += 2){
- *u_even++ = *u;
- u += 2;
- }
+ u++;
+ int i;
+ for (i = 1; i < u_num; i += 2) {
+ *u_even++ = *u;
+ u += 2;
+ }
}
static inline void
odd_xor_even_values(unsigned char* u_xor, const unsigned char* u, const int u_num)
{
- int i;
- for(i = 1; i < u_num; i += 2){
- *u_xor++ = *u ^ *(u + 1);
- u += 2;
- }
+ int i;
+ for (i = 1; i < u_num; i += 2) {
+ *u_xor++ = *u ^ *(u + 1);
+ u += 2;
+ }
}
-static inline int
-calculate_max_stage_depth_for_row(const int frame_exp, const int row)
+static inline int calculate_max_stage_depth_for_row(const int frame_exp, const int row)
{
- int max_stage_depth = 0;
- int half_stage_size = 0x01;
- int stage_size = half_stage_size << 1;
- while(max_stage_depth < (frame_exp - 1)){ // last stage holds received values.
- if(!(row % stage_size < half_stage_size)){
- break;
+ int max_stage_depth = 0;
+ int half_stage_size = 0x01;
+ int stage_size = half_stage_size << 1;
+ while (max_stage_depth < (frame_exp - 1)) { // last stage holds received values.
+ if (!(row % stage_size < half_stage_size)) {
+ break;
+ }
+ half_stage_size <<= 1;
+ stage_size <<= 1;
+ max_stage_depth++;
}
- half_stage_size <<= 1;
- stage_size <<= 1;
- max_stage_depth++;
- }
- return max_stage_depth;
+ return max_stage_depth;
}
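calculate_max_stage_depth_for_row effectively counts the trailing zero bits of the row index, capped at frame_exp - 1 (row 0 always gets the cap, since the last stage holds the received values). A small sanity check of that reading; __builtin_ctz is a GCC/Clang builtin and a toolchain assumption, not VOLK API:

#include <assert.h>

static void max_stage_depth_example(void)
{
    const int frame_exp = 5;
    // row 12 = 0b01100 has two trailing zeros -> depth 2
    assert(calculate_max_stage_depth_for_row(frame_exp, 12) == 2);
    assert(calculate_max_stage_depth_for_row(frame_exp, 12) == __builtin_ctz(12));
    // row 0 is capped at frame_exp - 1
    assert(calculate_max_stage_depth_for_row(frame_exp, 0) == frame_exp - 1);
}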
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_8u_polarbutterfly_32f_generic(float* llrs, unsigned char* u,
- const int frame_exp,
- const int stage, const int u_num, const int row)
+static inline void volk_32f_8u_polarbutterfly_32f_generic(float* llrs,
+ unsigned char* u,
+ const int frame_exp,
+ const int stage,
+ const int u_num,
+ const int row)
{
- const int frame_size = 0x01 << frame_exp;
- const int next_stage = stage + 1;
+ const int frame_size = 0x01 << frame_exp;
+ const int next_stage = stage + 1;
- const int half_stage_size = 0x01 << stage;
- const int stage_size = half_stage_size << 1;
+ const int half_stage_size = 0x01 << stage;
+ const int stage_size = half_stage_size << 1;
- const bool is_upper_stage_half = row % stage_size < half_stage_size;
+ const bool is_upper_stage_half = row % stage_size < half_stage_size;
-// // this is a natural bit order impl
- float* next_llrs = llrs + frame_size;// LLRs are stored in a consecutive array.
- float* call_row_llr = llrs + row;
+    // this is a natural bit-order implementation
+ float* next_llrs = llrs + frame_size; // LLRs are stored in a consecutive array.
+ float* call_row_llr = llrs + row;
- const int section = row - (row % stage_size);
- const int jump_size = ((row % half_stage_size) << 1) % stage_size;
+ const int section = row - (row % stage_size);
+ const int jump_size = ((row % half_stage_size) << 1) % stage_size;
- const int next_upper_row = section + jump_size;
- const int next_lower_row = next_upper_row + 1;
+ const int next_upper_row = section + jump_size;
+ const int next_lower_row = next_upper_row + 1;
- const float* upper_right_llr_ptr = next_llrs + next_upper_row;
- const float* lower_right_llr_ptr = next_llrs + next_lower_row;
+ const float* upper_right_llr_ptr = next_llrs + next_upper_row;
+ const float* lower_right_llr_ptr = next_llrs + next_lower_row;
- if(!is_upper_stage_half){
- const int u_pos = u_num >> stage;
- const unsigned char f = u[u_pos - 1];
- *call_row_llr = llr_even(*upper_right_llr_ptr, *lower_right_llr_ptr, f);
- return;
- }
+ if (!is_upper_stage_half) {
+ const int u_pos = u_num >> stage;
+ const unsigned char f = u[u_pos - 1];
+ *call_row_llr = llr_even(*upper_right_llr_ptr, *lower_right_llr_ptr, f);
+ return;
+ }
- if(frame_exp > next_stage){
- unsigned char* u_half = u + frame_size;
- odd_xor_even_values(u_half, u, u_num);
- volk_32f_8u_polarbutterfly_32f_generic(next_llrs, u_half, frame_exp, next_stage, u_num, next_upper_row);
+ if (frame_exp > next_stage) {
+ unsigned char* u_half = u + frame_size;
+ odd_xor_even_values(u_half, u, u_num);
+ volk_32f_8u_polarbutterfly_32f_generic(
+ next_llrs, u_half, frame_exp, next_stage, u_num, next_upper_row);
- even_u_values(u_half, u, u_num);
- volk_32f_8u_polarbutterfly_32f_generic(next_llrs, u_half, frame_exp, next_stage, u_num, next_lower_row);
- }
+ even_u_values(u_half, u, u_num);
+ volk_32f_8u_polarbutterfly_32f_generic(
+ next_llrs, u_half, frame_exp, next_stage, u_num, next_lower_row);
+ }
- *call_row_llr = llr_odd(*upper_right_llr_ptr, *lower_right_llr_ptr);
+ *call_row_llr = llr_odd(*upper_right_llr_ptr, *lower_right_llr_ptr);
}
#endif /* LV_HAVE_GENERIC */
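The recursion above walks a (frame_exp + 1) x frame_size LLR array stored as one consecutive block: stage s starts at llrs + s * frame_size, stage 0 carries the decision LLRs, and stage frame_exp carries the received channel LLRs (the puppet kernels below allocate and fill exactly that layout). A purely illustrative indexing helper under those assumptions (stage_llrs is not part of the kernel):

static inline float* stage_llrs(float* llrs, int stage, int frame_size)
{
    // stage 0: decision LLRs, stage frame_exp: received channel LLRs
    return llrs + stage * frame_size;
}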
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>
-static inline void
-volk_32f_8u_polarbutterfly_32f_u_avx(float* llrs, unsigned char* u,
- const int frame_exp,
- const int stage, const int u_num, const int row)
+static inline void volk_32f_8u_polarbutterfly_32f_u_avx(float* llrs,
+ unsigned char* u,
+ const int frame_exp,
+ const int stage,
+ const int u_num,
+ const int row)
{
- const int frame_size = 0x01 << frame_exp;
- if(row % 2){ // for odd rows just do the only necessary calculation and return.
- const float* next_llrs = llrs + frame_size + row;
- *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
- return;
- }
-
- const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
- if(max_stage_depth < 3){ // vectorized version needs larger vectors.
- volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
- return;
- }
-
- int loop_stage = max_stage_depth;
- int stage_size = 0x01 << loop_stage;
-
- float* src_llr_ptr;
- float* dst_llr_ptr;
-
- __m256 src0, src1, dst;
-
- if(row){ // not necessary for ZERO row. == first bit to be decoded.
- // first do bit combination for all stages
- // effectively encode some decoded bits again.
- unsigned char* u_target = u + frame_size;
- unsigned char* u_temp = u + 2* frame_size;
- memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);
-
- if(stage_size > 15){
- _mm256_zeroupper();
- volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size);
+ const int frame_size = 0x01 << frame_exp;
+ if (row % 2) { // for odd rows just do the only necessary calculation and return.
+ const float* next_llrs = llrs + frame_size + row;
+ *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
+ return;
}
- else{
- volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size);
+
+ const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
+ if (max_stage_depth < 3) { // vectorized version needs larger vectors.
+ volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
+ return;
}
- src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
- dst_llr_ptr = llrs + max_stage_depth * frame_size + row;
+ int loop_stage = max_stage_depth;
+ int stage_size = 0x01 << loop_stage;
- __m128i fbits;
+ float* src_llr_ptr;
+ float* dst_llr_ptr;
- int p;
- for(p = 0; p < stage_size; p += 8){
- _mm256_zeroupper();
- fbits = _mm_loadu_si128((__m128i*) u_target);
- u_target += 8;
+ __m256 src0, src1, dst;
- src0 = _mm256_loadu_ps(src_llr_ptr);
- src1 = _mm256_loadu_ps(src_llr_ptr + 8);
- src_llr_ptr += 16;
+    if (row) { // not necessary for row zero, i.e. the first bit to be decoded.
+ // first do bit combination for all stages
+ // effectively encode some decoded bits again.
+ unsigned char* u_target = u + frame_size;
+ unsigned char* u_temp = u + 2 * frame_size;
+ memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);
- dst = _mm256_polar_fsign_add_llrs(src0, src1, fbits);
+ if (stage_size > 15) {
+ _mm256_zeroupper();
+ volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size);
+ } else {
+ volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size);
+ }
- _mm256_storeu_ps(dst_llr_ptr, dst);
- dst_llr_ptr += 8;
- }
+ src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
+ dst_llr_ptr = llrs + max_stage_depth * frame_size + row;
- --loop_stage;
- stage_size >>= 1;
- }
+ __m128i fbits;
- const int min_stage = stage > 2 ? stage : 2;
+ int p;
+ for (p = 0; p < stage_size; p += 8) {
+ _mm256_zeroupper();
+ fbits = _mm_loadu_si128((__m128i*)u_target);
+ u_target += 8;
- _mm256_zeroall(); // Important to clear cache!
+ src0 = _mm256_loadu_ps(src_llr_ptr);
+ src1 = _mm256_loadu_ps(src_llr_ptr + 8);
+ src_llr_ptr += 16;
- int el;
- while(min_stage < loop_stage){
- dst_llr_ptr = llrs + loop_stage * frame_size + row;
- src_llr_ptr = dst_llr_ptr + frame_size;
- for(el = 0; el < stage_size; el += 8){
- src0 = _mm256_loadu_ps(src_llr_ptr);
- src_llr_ptr += 8;
- src1 = _mm256_loadu_ps(src_llr_ptr);
- src_llr_ptr += 8;
+ dst = _mm256_polar_fsign_add_llrs(src0, src1, fbits);
- dst = _mm256_polar_minsum_llrs(src0, src1);
+ _mm256_storeu_ps(dst_llr_ptr, dst);
+ dst_llr_ptr += 8;
+ }
- _mm256_storeu_ps(dst_llr_ptr, dst);
- dst_llr_ptr += 8;
+ --loop_stage;
+ stage_size >>= 1;
}
- --loop_stage;
- stage_size >>= 1;
+ const int min_stage = stage > 2 ? stage : 2;
+
+    _mm256_zeroall(); // Zero all YMM registers to avoid AVX/SSE transition penalties.
- }
+ int el;
+ while (min_stage < loop_stage) {
+ dst_llr_ptr = llrs + loop_stage * frame_size + row;
+ src_llr_ptr = dst_llr_ptr + frame_size;
+ for (el = 0; el < stage_size; el += 8) {
+ src0 = _mm256_loadu_ps(src_llr_ptr);
+ src_llr_ptr += 8;
+ src1 = _mm256_loadu_ps(src_llr_ptr);
+ src_llr_ptr += 8;
- // for stages < 3 vectors are too small!.
- llr_odd_stages(llrs, stage, loop_stage + 1,frame_size, row);
+ dst = _mm256_polar_minsum_llrs(src0, src1);
+
+ _mm256_storeu_ps(dst_llr_ptr, dst);
+ dst_llr_ptr += 8;
+ }
+
+ --loop_stage;
+ stage_size >>= 1;
+ }
+
+    // for stages < 3 the vectors are too small.
+ llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row);
}
#endif /* LV_HAVE_AVX */
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>
-static inline void
-volk_32f_8u_polarbutterfly_32f_u_avx2(float* llrs, unsigned char* u,
- const int frame_exp,
- const int stage, const int u_num, const int row)
+static inline void volk_32f_8u_polarbutterfly_32f_u_avx2(float* llrs,
+ unsigned char* u,
+ const int frame_exp,
+ const int stage,
+ const int u_num,
+ const int row)
{
- const int frame_size = 0x01 << frame_exp;
- if(row % 2){ // for odd rows just do the only necessary calculation and return.
- const float* next_llrs = llrs + frame_size + row;
- *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
- return;
- }
-
- const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
- if(max_stage_depth < 3){ // vectorized version needs larger vectors.
- volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
- return;
- }
-
- int loop_stage = max_stage_depth;
- int stage_size = 0x01 << loop_stage;
-
- float* src_llr_ptr;
- float* dst_llr_ptr;
-
- __m256 src0, src1, dst;
-
- if(row){ // not necessary for ZERO row. == first bit to be decoded.
- // first do bit combination for all stages
- // effectively encode some decoded bits again.
- unsigned char* u_target = u + frame_size;
- unsigned char* u_temp = u + 2* frame_size;
- memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);
-
- if(stage_size > 15){
- _mm256_zeroupper();
- volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size);
+ const int frame_size = 0x01 << frame_exp;
+ if (row % 2) { // for odd rows just do the only necessary calculation and return.
+ const float* next_llrs = llrs + frame_size + row;
+ *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
+ return;
}
- else{
- volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size);
+
+ const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
+ if (max_stage_depth < 3) { // vectorized version needs larger vectors.
+ volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
+ return;
}
- src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
- dst_llr_ptr = llrs + max_stage_depth * frame_size + row;
+ int loop_stage = max_stage_depth;
+ int stage_size = 0x01 << loop_stage;
- __m128i fbits;
+ float* src_llr_ptr;
+ float* dst_llr_ptr;
- int p;
- for(p = 0; p < stage_size; p += 8){
- _mm256_zeroupper();
- fbits = _mm_loadu_si128((__m128i*) u_target);
- u_target += 8;
+ __m256 src0, src1, dst;
- src0 = _mm256_loadu_ps(src_llr_ptr);
- src1 = _mm256_loadu_ps(src_llr_ptr + 8);
- src_llr_ptr += 16;
+    if (row) { // not necessary for row zero, i.e. the first bit to be decoded.
+ // first do bit combination for all stages
+ // effectively encode some decoded bits again.
+ unsigned char* u_target = u + frame_size;
+ unsigned char* u_temp = u + 2 * frame_size;
+ memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);
- dst = _mm256_polar_fsign_add_llrs_avx2(src0, src1, fbits);
+ if (stage_size > 15) {
+ _mm256_zeroupper();
+ volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size);
+ } else {
+ volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size);
+ }
- _mm256_storeu_ps(dst_llr_ptr, dst);
- dst_llr_ptr += 8;
- }
+ src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
+ dst_llr_ptr = llrs + max_stage_depth * frame_size + row;
- --loop_stage;
- stage_size >>= 1;
- }
+ __m128i fbits;
- const int min_stage = stage > 2 ? stage : 2;
+ int p;
+ for (p = 0; p < stage_size; p += 8) {
+ _mm256_zeroupper();
+ fbits = _mm_loadu_si128((__m128i*)u_target);
+ u_target += 8;
- _mm256_zeroall(); // Important to clear cache!
+ src0 = _mm256_loadu_ps(src_llr_ptr);
+ src1 = _mm256_loadu_ps(src_llr_ptr + 8);
+ src_llr_ptr += 16;
- int el;
- while(min_stage < loop_stage){
- dst_llr_ptr = llrs + loop_stage * frame_size + row;
- src_llr_ptr = dst_llr_ptr + frame_size;
- for(el = 0; el < stage_size; el += 8){
- src0 = _mm256_loadu_ps(src_llr_ptr);
- src_llr_ptr += 8;
- src1 = _mm256_loadu_ps(src_llr_ptr);
- src_llr_ptr += 8;
+ dst = _mm256_polar_fsign_add_llrs_avx2(src0, src1, fbits);
- dst = _mm256_polar_minsum_llrs(src0, src1);
+ _mm256_storeu_ps(dst_llr_ptr, dst);
+ dst_llr_ptr += 8;
+ }
- _mm256_storeu_ps(dst_llr_ptr, dst);
- dst_llr_ptr += 8;
+ --loop_stage;
+ stage_size >>= 1;
}
- --loop_stage;
- stage_size >>= 1;
+ const int min_stage = stage > 2 ? stage : 2;
+
+    _mm256_zeroall(); // Zero all YMM registers to avoid AVX/SSE transition penalties.
+
+ int el;
+ while (min_stage < loop_stage) {
+ dst_llr_ptr = llrs + loop_stage * frame_size + row;
+ src_llr_ptr = dst_llr_ptr + frame_size;
+ for (el = 0; el < stage_size; el += 8) {
+ src0 = _mm256_loadu_ps(src_llr_ptr);
+ src_llr_ptr += 8;
+ src1 = _mm256_loadu_ps(src_llr_ptr);
+ src_llr_ptr += 8;
- }
+ dst = _mm256_polar_minsum_llrs(src0, src1);
+
+ _mm256_storeu_ps(dst_llr_ptr, dst);
+ dst_llr_ptr += 8;
+ }
+
+ --loop_stage;
+ stage_size >>= 1;
+ }
- // for stages < 3 vectors are too small!.
- llr_odd_stages(llrs, stage, loop_stage + 1,frame_size, row);
+    // for stages < 3 the vectors are too small.
+ llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row);
}
#endif /* LV_HAVE_AVX2 */
#include <volk/volk_8u_x3_encodepolarpuppet_8u.h>
-static inline void
-sanitize_bytes(unsigned char* u, const int elements)
+static inline void sanitize_bytes(unsigned char* u, const int elements)
{
- int i;
- unsigned char* u_ptr = u;
- for(i = 0; i < elements; i++){
- *u_ptr = (*u_ptr & 0x01);
- u_ptr++;
- }
+ int i;
+ unsigned char* u_ptr = u;
+ for (i = 0; i < elements; i++) {
+ *u_ptr = (*u_ptr & 0x01);
+ u_ptr++;
+ }
}
-static inline void
-clean_up_intermediate_values(float* llrs, unsigned char* u, const int frame_size, const int elements)
+static inline void clean_up_intermediate_values(float* llrs,
+ unsigned char* u,
+ const int frame_size,
+ const int elements)
{
- memset(u + frame_size, 0, sizeof(unsigned char) * (elements - frame_size));
- memset(llrs + frame_size, 0, sizeof(float) * (elements - frame_size));
+ memset(u + frame_size, 0, sizeof(unsigned char) * (elements - frame_size));
+ memset(llrs + frame_size, 0, sizeof(float) * (elements - frame_size));
}
static inline void
generate_error_free_input_vector(float* llrs, unsigned char* u, const int frame_size)
{
- memset(u, 0, frame_size);
- unsigned char* target = u + frame_size;
- volk_8u_x2_encodeframepolar_8u_generic(target, u + 2 * frame_size, frame_size);
- float* ft = llrs;
- int i;
- for(i = 0; i < frame_size; i++){
- *ft = (-2 * ((float) *target++)) + 1.0f;
- ft++;
- }
+ memset(u, 0, frame_size);
+ unsigned char* target = u + frame_size;
+ volk_8u_x2_encodeframepolar_8u_generic(target, u + 2 * frame_size, frame_size);
+ float* ft = llrs;
+ int i;
+ for (i = 0; i < frame_size; i++) {
+ *ft = (-2 * ((float)*target++)) + 1.0f;
+ ft++;
+ }
}
static inline void
print_llr_tree(const float* llrs, const int frame_size, const int frame_exp)
{
- int s, e;
- for(s = 0; s < frame_size; s++){
- for(e = 0; e < frame_exp + 1; e++){
- printf("%+4.2f ", llrs[e * frame_size + s]);
- }
- printf("\n");
- if((s + 1) % 8 == 0){
- printf("\n");
+ int s, e;
+ for (s = 0; s < frame_size; s++) {
+ for (e = 0; e < frame_exp + 1; e++) {
+ printf("%+4.2f ", llrs[e * frame_size + s]);
+ }
+ printf("\n");
+ if ((s + 1) % 8 == 0) {
+ printf("\n");
+ }
}
- }
}
-static inline int
-maximum_frame_size(const int elements)
+static inline int maximum_frame_size(const int elements)
{
- unsigned int frame_size = next_lower_power_of_two(elements);
- unsigned int frame_exp = log2_of_power_of_2(frame_size);
- return next_lower_power_of_two(frame_size / frame_exp);
+ unsigned int frame_size = next_lower_power_of_two(elements);
+ unsigned int frame_exp = log2_of_power_of_2(frame_size);
+ return next_lower_power_of_two(frame_size / frame_exp);
}
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_8u_polarbutterflypuppet_32f_generic(float* llrs, const float* input, unsigned char* u, const int elements)
+static inline void volk_32f_8u_polarbutterflypuppet_32f_generic(float* llrs,
+ const float* input,
+ unsigned char* u,
+ const int elements)
{
- unsigned int frame_size = maximum_frame_size(elements);
- unsigned int frame_exp = log2_of_power_of_2(frame_size);
+ unsigned int frame_size = maximum_frame_size(elements);
+ unsigned int frame_exp = log2_of_power_of_2(frame_size);
- sanitize_bytes(u, elements);
- clean_up_intermediate_values(llrs, u, frame_size, elements);
- generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size);
+ sanitize_bytes(u, elements);
+ clean_up_intermediate_values(llrs, u, frame_size, elements);
+ generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size);
- unsigned int u_num = 0;
- for(; u_num < frame_size; u_num++){
- volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, 0, u_num, u_num);
- u[u_num] = llrs[u_num] > 0 ? 0 : 1;
- }
+ unsigned int u_num = 0;
+ for (; u_num < frame_size; u_num++) {
+ volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, 0, u_num, u_num);
+ u[u_num] = llrs[u_num] > 0 ? 0 : 1;
+ }
- clean_up_intermediate_values(llrs, u, frame_size, elements);
+ clean_up_intermediate_values(llrs, u, frame_size, elements);
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_AVX
-static inline void
-volk_32f_8u_polarbutterflypuppet_32f_u_avx(float* llrs, const float* input, unsigned char* u, const int elements)
+static inline void volk_32f_8u_polarbutterflypuppet_32f_u_avx(float* llrs,
+ const float* input,
+ unsigned char* u,
+ const int elements)
{
- unsigned int frame_size = maximum_frame_size(elements);
- unsigned int frame_exp = log2_of_power_of_2(frame_size);
+ unsigned int frame_size = maximum_frame_size(elements);
+ unsigned int frame_exp = log2_of_power_of_2(frame_size);
- sanitize_bytes(u, elements);
- clean_up_intermediate_values(llrs, u, frame_size, elements);
- generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size);
+ sanitize_bytes(u, elements);
+ clean_up_intermediate_values(llrs, u, frame_size, elements);
+ generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size);
- unsigned int u_num = 0;
- for(; u_num < frame_size; u_num++){
- volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_exp, 0, u_num, u_num);
- u[u_num] = llrs[u_num] > 0 ? 0 : 1;
- }
+ unsigned int u_num = 0;
+ for (; u_num < frame_size; u_num++) {
+ volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_exp, 0, u_num, u_num);
+ u[u_num] = llrs[u_num] > 0 ? 0 : 1;
+ }
- clean_up_intermediate_values(llrs, u, frame_size, elements);
+ clean_up_intermediate_values(llrs, u, frame_size, elements);
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_AVX2
-static inline void
-volk_32f_8u_polarbutterflypuppet_32f_u_avx2(float* llrs, const float* input, unsigned char* u, const int elements)
+static inline void volk_32f_8u_polarbutterflypuppet_32f_u_avx2(float* llrs,
+ const float* input,
+ unsigned char* u,
+ const int elements)
{
- unsigned int frame_size = maximum_frame_size(elements);
- unsigned int frame_exp = log2_of_power_of_2(frame_size);
+ unsigned int frame_size = maximum_frame_size(elements);
+ unsigned int frame_exp = log2_of_power_of_2(frame_size);
- sanitize_bytes(u, elements);
- clean_up_intermediate_values(llrs, u, frame_size, elements);
- generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size);
+ sanitize_bytes(u, elements);
+ clean_up_intermediate_values(llrs, u, frame_size, elements);
+ generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size);
- unsigned int u_num = 0;
- for(; u_num < frame_size; u_num++){
- volk_32f_8u_polarbutterfly_32f_u_avx2(llrs, u, frame_exp, 0, u_num, u_num);
- u[u_num] = llrs[u_num] > 0 ? 0 : 1;
- }
+ unsigned int u_num = 0;
+ for (; u_num < frame_size; u_num++) {
+ volk_32f_8u_polarbutterfly_32f_u_avx2(llrs, u, frame_exp, 0, u_num, u_num);
+ u[u_num] = llrs[u_num] > 0 ? 0 : 1;
+ }
- clean_up_intermediate_values(llrs, u, frame_size, elements);
+ clean_up_intermediate_values(llrs, u, frame_size, elements);
}
#endif /* LV_HAVE_AVX2 */
-
#endif /* VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLYPUPPET_32F_H_ */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_accumulator_s32f(float* result, const float* inputBuffer, unsigned int num_points)
- * \endcode
+ * void volk_32f_accumulator_s32f(float* result, const float* inputBuffer, unsigned int
+ * num_points) \endcode
*
* \b Inputs
* \li inputBuffer The buffer of data to be accumulated
#ifndef INCLUDED_volk_32f_accumulator_s32f_a_H
#define INCLUDED_volk_32f_accumulator_s32f_a_H
-#include <volk/volk_common.h>
#include <inttypes.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32f_accumulator_s32f_a_avx(float* result, const float* inputBuffer, unsigned int num_points)
+static inline void volk_32f_accumulator_s32f_a_avx(float* result,
+ const float* inputBuffer,
+ unsigned int num_points)
{
- float returnValue = 0;
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- const float* aPtr = inputBuffer;
- __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
-
- __m256 accumulator = _mm256_setzero_ps();
- __m256 aVal = _mm256_setzero_ps();
-
- for(;number < eighthPoints; number++){
- aVal = _mm256_load_ps(aPtr);
- accumulator = _mm256_add_ps(accumulator, aVal);
- aPtr += 8;
- }
-
- _mm256_store_ps(tempBuffer, accumulator);
-
- returnValue = tempBuffer[0];
- returnValue += tempBuffer[1];
- returnValue += tempBuffer[2];
- returnValue += tempBuffer[3];
- returnValue += tempBuffer[4];
- returnValue += tempBuffer[5];
- returnValue += tempBuffer[6];
- returnValue += tempBuffer[7];
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- returnValue += (*aPtr++);
- }
- *result = returnValue;
+ float returnValue = 0;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* aPtr = inputBuffer;
+ __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
+
+ __m256 accumulator = _mm256_setzero_ps();
+ __m256 aVal = _mm256_setzero_ps();
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ accumulator = _mm256_add_ps(accumulator, aVal);
+ aPtr += 8;
+ }
+
+ _mm256_store_ps(tempBuffer, accumulator);
+
+ returnValue = tempBuffer[0];
+ returnValue += tempBuffer[1];
+ returnValue += tempBuffer[2];
+ returnValue += tempBuffer[3];
+ returnValue += tempBuffer[4];
+ returnValue += tempBuffer[5];
+ returnValue += tempBuffer[6];
+ returnValue += tempBuffer[7];
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ returnValue += (*aPtr++);
+ }
+ *result = returnValue;
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32f_accumulator_s32f_u_avx(float* result, const float* inputBuffer, unsigned int num_points)
+static inline void volk_32f_accumulator_s32f_u_avx(float* result,
+ const float* inputBuffer,
+ unsigned int num_points)
{
- float returnValue = 0;
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- const float* aPtr = inputBuffer;
- __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
-
- __m256 accumulator = _mm256_setzero_ps();
- __m256 aVal = _mm256_setzero_ps();
-
- for(;number < eighthPoints; number++){
- aVal = _mm256_loadu_ps(aPtr);
- accumulator = _mm256_add_ps(accumulator, aVal);
- aPtr += 8;
- }
-
- _mm256_store_ps(tempBuffer, accumulator);
-
- returnValue = tempBuffer[0];
- returnValue += tempBuffer[1];
- returnValue += tempBuffer[2];
- returnValue += tempBuffer[3];
- returnValue += tempBuffer[4];
- returnValue += tempBuffer[5];
- returnValue += tempBuffer[6];
- returnValue += tempBuffer[7];
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- returnValue += (*aPtr++);
- }
- *result = returnValue;
+ float returnValue = 0;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* aPtr = inputBuffer;
+ __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
+
+ __m256 accumulator = _mm256_setzero_ps();
+ __m256 aVal = _mm256_setzero_ps();
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ accumulator = _mm256_add_ps(accumulator, aVal);
+ aPtr += 8;
+ }
+
+ _mm256_store_ps(tempBuffer, accumulator);
+
+ returnValue = tempBuffer[0];
+ returnValue += tempBuffer[1];
+ returnValue += tempBuffer[2];
+ returnValue += tempBuffer[3];
+ returnValue += tempBuffer[4];
+ returnValue += tempBuffer[5];
+ returnValue += tempBuffer[6];
+ returnValue += tempBuffer[7];
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ returnValue += (*aPtr++);
+ }
+ *result = returnValue;
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_32f_accumulator_s32f_a_sse(float* result, const float* inputBuffer, unsigned int num_points)
+static inline void volk_32f_accumulator_s32f_a_sse(float* result,
+ const float* inputBuffer,
+ unsigned int num_points)
{
- float returnValue = 0;
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- const float* aPtr = inputBuffer;
- __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
-
- __m128 accumulator = _mm_setzero_ps();
- __m128 aVal = _mm_setzero_ps();
-
- for(;number < quarterPoints; number++){
- aVal = _mm_load_ps(aPtr);
- accumulator = _mm_add_ps(accumulator, aVal);
- aPtr += 4;
- }
-
- _mm_store_ps(tempBuffer,accumulator);
-
- returnValue = tempBuffer[0];
- returnValue += tempBuffer[1];
- returnValue += tempBuffer[2];
- returnValue += tempBuffer[3];
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- returnValue += (*aPtr++);
- }
- *result = returnValue;
+ float returnValue = 0;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* aPtr = inputBuffer;
+ __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
+
+ __m128 accumulator = _mm_setzero_ps();
+ __m128 aVal = _mm_setzero_ps();
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
+ accumulator = _mm_add_ps(accumulator, aVal);
+ aPtr += 4;
+ }
+
+ _mm_store_ps(tempBuffer, accumulator);
+
+ returnValue = tempBuffer[0];
+ returnValue += tempBuffer[1];
+ returnValue += tempBuffer[2];
+ returnValue += tempBuffer[3];
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ returnValue += (*aPtr++);
+ }
+ *result = returnValue;
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_32f_accumulator_s32f_u_sse(float* result, const float* inputBuffer, unsigned int num_points)
+static inline void volk_32f_accumulator_s32f_u_sse(float* result,
+ const float* inputBuffer,
+ unsigned int num_points)
{
- float returnValue = 0;
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- const float* aPtr = inputBuffer;
- __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
-
- __m128 accumulator = _mm_setzero_ps();
- __m128 aVal = _mm_setzero_ps();
-
- for(;number < quarterPoints; number++){
- aVal = _mm_load_ps(aPtr);
- accumulator = _mm_add_ps(accumulator, aVal);
- aPtr += 4;
- }
-
- _mm_store_ps(tempBuffer,accumulator);
-
- returnValue = tempBuffer[0];
- returnValue += tempBuffer[1];
- returnValue += tempBuffer[2];
- returnValue += tempBuffer[3];
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- returnValue += (*aPtr++);
- }
- *result = returnValue;
+ float returnValue = 0;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* aPtr = inputBuffer;
+ __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
+
+ __m128 accumulator = _mm_setzero_ps();
+ __m128 aVal = _mm_setzero_ps();
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
+ accumulator = _mm_add_ps(accumulator, aVal);
+ aPtr += 4;
+ }
+
+ _mm_store_ps(tempBuffer, accumulator);
+
+ returnValue = tempBuffer[0];
+ returnValue += tempBuffer[1];
+ returnValue += tempBuffer[2];
+ returnValue += tempBuffer[3];
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ returnValue += (*aPtr++);
+ }
+ *result = returnValue;
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_accumulator_s32f_generic(float* result, const float* inputBuffer, unsigned int num_points)
+static inline void volk_32f_accumulator_s32f_generic(float* result,
+ const float* inputBuffer,
+ unsigned int num_points)
{
- const float* aPtr = inputBuffer;
- unsigned int number = 0;
- float returnValue = 0;
-
- for(;number < num_points; number++){
- returnValue += (*aPtr++);
- }
- *result = returnValue;
+ const float* aPtr = inputBuffer;
+ unsigned int number = 0;
+ float returnValue = 0;
+
+ for (; number < num_points; number++) {
+ returnValue += (*aPtr++);
+ }
+ *result = returnValue;
}
#endif /* LV_HAVE_GENERIC */
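The accumulator protokernels above reduce the whole vector to a single float: the SIMD variants keep a running vector accumulator, spill it to an aligned temporary for the horizontal sum, and then add the scalar tail. A short usage sketch of the dispatcher prototype quoted in the header comment above (buffer contents and size are made up for illustration):

#include <volk/volk.h>
#include <stdio.h>

void accumulate_example(void)
{
    const unsigned int num_points = 1000;
    float* buf = (float*)volk_malloc(sizeof(float) * num_points, volk_get_alignment());
    for (unsigned int i = 0; i < num_points; i++) {
        buf[i] = 1.0f; // the sum should come out as 1000.0
    }

    float result = 0.0f;
    volk_32f_accumulator_s32f(&result, buf, num_points);
    printf("sum = %f\n", result);

    volk_free(buf);
}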
* \endcode
*/
-#include <stdio.h>
-#include <math.h>
#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
-/* This is the number of terms of Taylor series to evaluate, increase this for more accuracy*/
+/* Number of Taylor-series terms to evaluate; increase this for more accuracy. */
#define ACOS_TERMS 2
#ifndef INCLUDED_volk_32f_acos_32f_a_H
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
-static inline void
-volk_32f_acos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_acos_32f_a_avx2_fma(float* bVector,
+ const float* aVector,
+ unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int eighthPoints = num_points / 8;
- int i, j;
-
- __m256 aVal, d, pi, pio2, x, y, z, arccosine;
- __m256 fzeroes, fones, ftwos, ffours, condition;
-
- pi = _mm256_set1_ps(3.14159265358979323846);
- pio2 = _mm256_set1_ps(3.14159265358979323846/2);
- fzeroes = _mm256_setzero_ps();
- fones = _mm256_set1_ps(1.0);
- ftwos = _mm256_set1_ps(2.0);
- ffours = _mm256_set1_ps(4.0);
-
- for(;number < eighthPoints; number++){
- aVal = _mm256_load_ps(aPtr);
- d = aVal;
- aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal);
- z = aVal;
- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
- for(i = 0; i < 2; i++)
- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x,fones)));
- x = _mm256_div_ps(fones, x);
- y = fzeroes;
- for(j = ACOS_TERMS - 1; j >=0 ; j--)
- y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
-
- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
-
- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
- arccosine = y;
- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
- arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
- condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
- arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
-
- _mm256_store_ps(bPtr, arccosine);
- aPtr += 8;
- bPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *bPtr++ = acos(*aPtr++);
- }
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ int i, j;
+
+ __m256 aVal, d, pi, pio2, x, y, z, arccosine;
+ __m256 fzeroes, fones, ftwos, ffours, condition;
+
+ pi = _mm256_set1_ps(3.14159265358979323846);
+ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm256_setzero_ps();
+ fones = _mm256_set1_ps(1.0);
+ ftwos = _mm256_set1_ps(2.0);
+ ffours = _mm256_set1_ps(4.0);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ d = aVal;
+ aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
+ _mm256_sub_ps(fones, aVal))),
+ aVal);
+ z = aVal;
+ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+ x = _mm256_add_ps(
+ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++)
+ x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
+ x = _mm256_div_ps(fones, x);
+ y = fzeroes;
+ for (j = ACOS_TERMS - 1; j >= 0; j--)
+ y = _mm256_fmadd_ps(
+ y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+
+ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+ y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
+ arccosine = y;
+ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+ arccosine = _mm256_sub_ps(
+ arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
+ condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
+ arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
+
+ _mm256_store_ps(bPtr, arccosine);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = acos(*aPtr++);
+ }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
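The acos protokernels first map the input a onto sqrt((1 + a)(1 - a)) / a, i.e. sqrt(1 - a^2) / a = tan(acos(a)), and then appear to evaluate the arctangent of that value via angle-halving reduction (the x = x + sqrt(1 + x*x) steps and the final multiply by ffours) plus a short alternating series of ACOS_TERMS terms, with the sign and a < 0 cases fixed up afterwards. A quick numerical check of the underlying identity, using only the standard C math library:

#include <math.h>
#include <stdio.h>

int main(void)
{
    // acos(a) == atan(sqrt(1 - a*a) / a) for 0 < a <= 1; the kernels handle
    // a < 0 separately with the sign and pi corrections visible above.
    for (double a = 0.1; a < 1.0; a += 0.2) {
        printf("a=%.1f  acos=%.6f  atan form=%.6f\n",
               a,
               acos(a),
               atan(sqrt(1.0 - a * a) / a));
    }
    return 0;
}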
static inline void
volk_32f_acos_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int eighthPoints = num_points / 8;
- int i, j;
-
- __m256 aVal, d, pi, pio2, x, y, z, arccosine;
- __m256 fzeroes, fones, ftwos, ffours, condition;
-
- pi = _mm256_set1_ps(3.14159265358979323846);
- pio2 = _mm256_set1_ps(3.14159265358979323846/2);
- fzeroes = _mm256_setzero_ps();
- fones = _mm256_set1_ps(1.0);
- ftwos = _mm256_set1_ps(2.0);
- ffours = _mm256_set1_ps(4.0);
-
- for(;number < eighthPoints; number++){
- aVal = _mm256_load_ps(aPtr);
- d = aVal;
- aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal);
- z = aVal;
- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
- for(i = 0; i < 2; i++)
- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
- x = _mm256_div_ps(fones, x);
- y = fzeroes;
- for(j = ACOS_TERMS - 1; j >=0 ; j--)
- y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
-
- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
-
- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
- arccosine = y;
- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
- arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
- condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
- arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
-
- _mm256_store_ps(bPtr, arccosine);
- aPtr += 8;
- bPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *bPtr++ = acos(*aPtr++);
- }
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ int i, j;
+
+ __m256 aVal, d, pi, pio2, x, y, z, arccosine;
+ __m256 fzeroes, fones, ftwos, ffours, condition;
+
+ pi = _mm256_set1_ps(3.14159265358979323846);
+ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm256_setzero_ps();
+ fones = _mm256_set1_ps(1.0);
+ ftwos = _mm256_set1_ps(2.0);
+ ffours = _mm256_set1_ps(4.0);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ d = aVal;
+ aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
+ _mm256_sub_ps(fones, aVal))),
+ aVal);
+ z = aVal;
+ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+ x = _mm256_add_ps(
+ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++)
+ x = _mm256_add_ps(x,
+ _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
+ x = _mm256_div_ps(fones, x);
+ y = fzeroes;
+ for (j = ACOS_TERMS - 1; j >= 0; j--)
+ y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
+ _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+
+ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+ y = _mm256_add_ps(
+ y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
+ arccosine = y;
+ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+ arccosine = _mm256_sub_ps(
+ arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
+ condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
+ arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
+
+ _mm256_store_ps(bPtr, arccosine);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = acos(*aPtr++);
+ }
}
#endif /* LV_HAVE_AVX2 for aligned */
static inline void
volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int quarterPoints = num_points / 4;
- int i, j;
-
- __m128 aVal, d, pi, pio2, x, y, z, arccosine;
- __m128 fzeroes, fones, ftwos, ffours, condition;
-
- pi = _mm_set1_ps(3.14159265358979323846);
- pio2 = _mm_set1_ps(3.14159265358979323846/2);
- fzeroes = _mm_setzero_ps();
- fones = _mm_set1_ps(1.0);
- ftwos = _mm_set1_ps(2.0);
- ffours = _mm_set1_ps(4.0);
-
- for(;number < quarterPoints; number++){
- aVal = _mm_load_ps(aPtr);
- d = aVal;
- aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), aVal);
- z = aVal;
- condition = _mm_cmplt_ps(z, fzeroes);
- z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
- condition = _mm_cmplt_ps(z, fones);
- x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
-
- for(i = 0; i < 2; i++)
- x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
- x = _mm_div_ps(fones, x);
- y = fzeroes;
- for(j = ACOS_TERMS - 1; j >=0 ; j--)
- y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
-
- y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
- condition = _mm_cmpgt_ps(z, fones);
-
- y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
- arccosine = y;
- condition = _mm_cmplt_ps(aVal, fzeroes);
- arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
- condition = _mm_cmplt_ps(d, fzeroes);
- arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
-
- _mm_store_ps(bPtr, arccosine);
- aPtr += 4;
- bPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *bPtr++ = acosf(*aPtr++);
- }
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ int i, j;
+
+ __m128 aVal, d, pi, pio2, x, y, z, arccosine;
+ __m128 fzeroes, fones, ftwos, ffours, condition;
+
+ pi = _mm_set1_ps(3.14159265358979323846);
+ pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm_setzero_ps();
+ fones = _mm_set1_ps(1.0);
+ ftwos = _mm_set1_ps(2.0);
+ ffours = _mm_set1_ps(4.0);
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
+ d = aVal;
+ aVal = _mm_div_ps(
+ _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
+ aVal);
+ z = aVal;
+ condition = _mm_cmplt_ps(z, fzeroes);
+ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+ condition = _mm_cmplt_ps(z, fones);
+ x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++)
+ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+ x = _mm_div_ps(fones, x);
+ y = fzeroes;
+ for (j = ACOS_TERMS - 1; j >= 0; j--)
+ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
+ _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
+
+ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+ condition = _mm_cmpgt_ps(z, fones);
+
+ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
+ arccosine = y;
+ condition = _mm_cmplt_ps(aVal, fzeroes);
+ arccosine =
+ _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
+ condition = _mm_cmplt_ps(d, fzeroes);
+ arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
+
+ _mm_store_ps(bPtr, arccosine);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *bPtr++ = acosf(*aPtr++);
+ }
}
#endif /* LV_HAVE_SSE4_1 for aligned */
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
-static inline void
-volk_32f_acos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_acos_32f_u_avx2_fma(float* bVector,
+ const float* aVector,
+ unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int eighthPoints = num_points / 8;
- int i, j;
-
- __m256 aVal, d, pi, pio2, x, y, z, arccosine;
- __m256 fzeroes, fones, ftwos, ffours, condition;
-
- pi = _mm256_set1_ps(3.14159265358979323846);
- pio2 = _mm256_set1_ps(3.14159265358979323846/2);
- fzeroes = _mm256_setzero_ps();
- fones = _mm256_set1_ps(1.0);
- ftwos = _mm256_set1_ps(2.0);
- ffours = _mm256_set1_ps(4.0);
-
- for(;number < eighthPoints; number++){
- aVal = _mm256_loadu_ps(aPtr);
- d = aVal;
- aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal);
- z = aVal;
- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
- for(i = 0; i < 2; i++)
- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x,fones)));
- x = _mm256_div_ps(fones, x);
- y = fzeroes;
- for(j = ACOS_TERMS - 1; j >=0 ; j--)
- y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
-
- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
-
- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
- arccosine = y;
- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
- arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
- condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
- arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
-
- _mm256_storeu_ps(bPtr, arccosine);
- aPtr += 8;
- bPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *bPtr++ = acos(*aPtr++);
- }
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ int i, j;
+
+ __m256 aVal, d, pi, pio2, x, y, z, arccosine;
+ __m256 fzeroes, fones, ftwos, ffours, condition;
+
+ pi = _mm256_set1_ps(3.14159265358979323846);
+ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm256_setzero_ps();
+ fones = _mm256_set1_ps(1.0);
+ ftwos = _mm256_set1_ps(2.0);
+ ffours = _mm256_set1_ps(4.0);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ d = aVal;
+ aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
+ _mm256_sub_ps(fones, aVal))),
+ aVal);
+ z = aVal;
+ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+ x = _mm256_add_ps(
+ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++)
+ x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
+ x = _mm256_div_ps(fones, x);
+ y = fzeroes;
+ for (j = ACOS_TERMS - 1; j >= 0; j--)
+ y = _mm256_fmadd_ps(
+ y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+
+ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+ y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
+ arccosine = y;
+ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+ arccosine = _mm256_sub_ps(
+ arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
+ condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
+ arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
+
+ _mm256_storeu_ps(bPtr, arccosine);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = acos(*aPtr++);
+ }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
static inline void
volk_32f_acos_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int eighthPoints = num_points / 8;
- int i, j;
-
- __m256 aVal, d, pi, pio2, x, y, z, arccosine;
- __m256 fzeroes, fones, ftwos, ffours, condition;
-
- pi = _mm256_set1_ps(3.14159265358979323846);
- pio2 = _mm256_set1_ps(3.14159265358979323846/2);
- fzeroes = _mm256_setzero_ps();
- fones = _mm256_set1_ps(1.0);
- ftwos = _mm256_set1_ps(2.0);
- ffours = _mm256_set1_ps(4.0);
-
- for(;number < eighthPoints; number++){
- aVal = _mm256_loadu_ps(aPtr);
- d = aVal;
- aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal);
- z = aVal;
- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
- for(i = 0; i < 2; i++)
- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
- x = _mm256_div_ps(fones, x);
- y = fzeroes;
- for(j = ACOS_TERMS - 1; j >=0 ; j--)
- y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
-
- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
-
- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
- arccosine = y;
- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
- arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
- condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
- arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
-
- _mm256_storeu_ps(bPtr, arccosine);
- aPtr += 8;
- bPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *bPtr++ = acos(*aPtr++);
- }
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ int i, j;
+
+ __m256 aVal, d, pi, pio2, x, y, z, arccosine;
+ __m256 fzeroes, fones, ftwos, ffours, condition;
+
+ pi = _mm256_set1_ps(3.14159265358979323846);
+ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm256_setzero_ps();
+ fones = _mm256_set1_ps(1.0);
+ ftwos = _mm256_set1_ps(2.0);
+ ffours = _mm256_set1_ps(4.0);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ d = aVal;
+ aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
+ _mm256_sub_ps(fones, aVal))),
+ aVal);
+ z = aVal;
+ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+ x = _mm256_add_ps(
+ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++)
+ x = _mm256_add_ps(x,
+ _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
+ x = _mm256_div_ps(fones, x);
+ y = fzeroes;
+ for (j = ACOS_TERMS - 1; j >= 0; j--)
+ y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
+ _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+
+ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+ y = _mm256_add_ps(
+ y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
+ arccosine = y;
+ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+ arccosine = _mm256_sub_ps(
+ arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
+ condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
+ arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
+
+ _mm256_storeu_ps(bPtr, arccosine);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = acos(*aPtr++);
+ }
}
#endif /* LV_HAVE_AVX2 for unaligned */
static inline void
volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int quarterPoints = num_points / 4;
- int i, j;
-
- __m128 aVal, d, pi, pio2, x, y, z, arccosine;
- __m128 fzeroes, fones, ftwos, ffours, condition;
-
- pi = _mm_set1_ps(3.14159265358979323846);
- pio2 = _mm_set1_ps(3.14159265358979323846/2);
- fzeroes = _mm_setzero_ps();
- fones = _mm_set1_ps(1.0);
- ftwos = _mm_set1_ps(2.0);
- ffours = _mm_set1_ps(4.0);
-
- for(;number < quarterPoints; number++){
- aVal = _mm_loadu_ps(aPtr);
- d = aVal;
- aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), aVal);
- z = aVal;
- condition = _mm_cmplt_ps(z, fzeroes);
- z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
- condition = _mm_cmplt_ps(z, fones);
- x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
-
- for(i = 0; i < 2; i++)
- x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
- x = _mm_div_ps(fones, x);
- y = fzeroes;
-
- for(j = ACOS_TERMS - 1; j >=0 ; j--)
- y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
-
- y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
- condition = _mm_cmpgt_ps(z, fones);
-
- y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
- arccosine = y;
- condition = _mm_cmplt_ps(aVal, fzeroes);
- arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
- condition = _mm_cmplt_ps(d, fzeroes);
- arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
-
- _mm_storeu_ps(bPtr, arccosine);
- aPtr += 4;
- bPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *bPtr++ = acosf(*aPtr++);
- }
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ int i, j;
+
+ __m128 aVal, d, pi, pio2, x, y, z, arccosine;
+ __m128 fzeroes, fones, ftwos, ffours, condition;
+
+ pi = _mm_set1_ps(3.14159265358979323846);
+ pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm_setzero_ps();
+ fones = _mm_set1_ps(1.0);
+ ftwos = _mm_set1_ps(2.0);
+ ffours = _mm_set1_ps(4.0);
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_loadu_ps(aPtr);
+ d = aVal;
+ aVal = _mm_div_ps(
+ _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
+ aVal);
+ z = aVal;
+ condition = _mm_cmplt_ps(z, fzeroes);
+ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+ condition = _mm_cmplt_ps(z, fones);
+ x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++)
+ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+ x = _mm_div_ps(fones, x);
+ y = fzeroes;
+
+ for (j = ACOS_TERMS - 1; j >= 0; j--)
+ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
+ _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
+
+ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+ condition = _mm_cmpgt_ps(z, fones);
+
+ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
+ arccosine = y;
+ condition = _mm_cmplt_ps(aVal, fzeroes);
+ arccosine =
+ _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
+ condition = _mm_cmplt_ps(d, fzeroes);
+ arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
+
+ _mm_storeu_ps(bPtr, arccosine);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *bPtr++ = acosf(*aPtr++);
+ }
}
#endif /* LV_HAVE_SSE4_1 for aligned */
static inline void
volk_32f_acos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *bPtr++ = acosf(*aPtr++);
- }
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+ for (number = 0; number < num_points; number++) {
+ *bPtr++ = acosf(*aPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
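For orientation, a minimal scalar sketch of the approximation all of the acos kernels above share (the helper name acos_reference and the hard-coded 2-term count, mirroring ACOS_TERMS, are illustrative): acos(a) is evaluated as atan(sqrt(1 - a*a)/a) with pi added for negative inputs, and the atan itself comes from two half-angle reductions followed by a short Taylor series.

#include <math.h>

static float acos_reference(float a)
{
    const float pi = 3.14159265358979323846f;
    float r = sqrtf((1.0f + a) * (1.0f - a)) / a; /* tan(acos(a)); a == 0 is left aside, as above */
    float z = fabsf(r);
    float x = (z < 1.0f) ? 1.0f / z : z; /* reduce so the working cotangent is >= 1 */
    float y = 0.0f;
    int i, j;

    for (i = 0; i < 2; i++) /* atan(1/x) == 2 * atan(1/(x + sqrt(1 + x*x))) */
        x += sqrtf(1.0f + x * x);
    x = 1.0f / x;
    for (j = 2 - 1; j >= 0; j--) /* 2-term alternating series, evaluated by Horner */
        y = y * x * x + ((j % 2) ? -1.0f : 1.0f) / (2.0f * j + 1.0f);
    y *= x * 4.0f; /* undo the two halvings */
    if (z > 1.0f)
        y = pi / 2.0f - y; /* the series actually produced atan(1/z) */
    if (r < 0.0f)
        y = -y;
    return (a < 0.0f) ? y + pi : y;
}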
* \endcode
*/
-#include <stdio.h>
-#include <math.h>
#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
-/* This is the number of terms of Taylor series to evaluate, increase this for more accuracy*/
+/* This is the number of terms of Taylor series to evaluate, increase this for more
+ * accuracy*/
#define ASIN_TERMS 2
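As a sketch of what ASIN_TERMS controls (the helper name atan_series is illustrative): every kernel below first maps asin(a) to atan(a / sqrt(1 - a*a)), reduces the argument until it is small, and then evaluates this truncated alternating series by Horner's rule; raising ASIN_TERMS adds terms of x - x^3/3 + x^5/5 - ... at the cost of one extra multiply-add per term.

static float atan_series(float x) /* expects the already-reduced, small x */
{
    float y = 0.0f;
    int j;
    for (j = ASIN_TERMS - 1; j >= 0; j--)
        y = y * x * x + ((j % 2) ? -1.0f : 1.0f) / (2.0f * j + 1.0f);
    return y * x;
}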
#ifndef INCLUDED_volk_32f_asin_32f_a_H
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
-static inline void
-volk_32f_asin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_asin_32f_a_avx2_fma(float* bVector,
+ const float* aVector,
+ unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int eighthPoints = num_points / 8;
- int i, j;
-
- __m256 aVal, pio2, x, y, z, arcsine;
- __m256 fzeroes, fones, ftwos, ffours, condition;
-
- pio2 = _mm256_set1_ps(3.14159265358979323846/2);
- fzeroes = _mm256_setzero_ps();
- fones = _mm256_set1_ps(1.0);
- ftwos = _mm256_set1_ps(2.0);
- ffours = _mm256_set1_ps(4.0);
-
- for(;number < eighthPoints; number++){
- aVal = _mm256_load_ps(aPtr);
- aVal = _mm256_div_ps(aVal, _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))));
- z = aVal;
- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
- for(i = 0; i < 2; i++){
- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones)));
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ int i, j;
+
+ __m256 aVal, pio2, x, y, z, arcsine;
+ __m256 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm256_setzero_ps();
+ fones = _mm256_set1_ps(1.0);
+ ftwos = _mm256_set1_ps(2.0);
+ ffours = _mm256_set1_ps(4.0);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ aVal = _mm256_div_ps(aVal,
+ _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
+ _mm256_sub_ps(fones, aVal))));
+ z = aVal;
+ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+ x = _mm256_add_ps(
+ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++) {
+ x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
+ }
+ x = _mm256_div_ps(fones, x);
+ y = fzeroes;
+ for (j = ASIN_TERMS - 1; j >= 0; j--) {
+ y = _mm256_fmadd_ps(
+ y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+ }
+
+ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+ y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
+ arcsine = y;
+ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+ arcsine = _mm256_sub_ps(arcsine,
+ _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
+
+ _mm256_store_ps(bPtr, arcsine);
+ aPtr += 8;
+ bPtr += 8;
}
- x = _mm256_div_ps(fones, x);
- y = fzeroes;
- for(j = ASIN_TERMS - 1; j >=0 ; j--){
- y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
- }
-
- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
- condition = _mm256_cmp_ps(z, fones,_CMP_GT_OS);
-
- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition));
- arcsine = y;
- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
- arcsine = _mm256_sub_ps(arcsine, _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
- _mm256_store_ps(bPtr, arcsine);
- aPtr += 8;
- bPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *bPtr++ = asin(*aPtr++);
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = asin(*aPtr++);
+ }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
static inline void
volk_32f_asin_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int eighthPoints = num_points / 8;
- int i, j;
-
- __m256 aVal, pio2, x, y, z, arcsine;
- __m256 fzeroes, fones, ftwos, ffours, condition;
-
- pio2 = _mm256_set1_ps(3.14159265358979323846/2);
- fzeroes = _mm256_setzero_ps();
- fones = _mm256_set1_ps(1.0);
- ftwos = _mm256_set1_ps(2.0);
- ffours = _mm256_set1_ps(4.0);
-
- for(;number < eighthPoints; number++){
- aVal = _mm256_load_ps(aPtr);
- aVal = _mm256_div_ps(aVal, _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))));
- z = aVal;
- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
- for(i = 0; i < 2; i++){
- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ int i, j;
+
+ __m256 aVal, pio2, x, y, z, arcsine;
+ __m256 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm256_setzero_ps();
+ fones = _mm256_set1_ps(1.0);
+ ftwos = _mm256_set1_ps(2.0);
+ ffours = _mm256_set1_ps(4.0);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ aVal = _mm256_div_ps(aVal,
+ _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
+ _mm256_sub_ps(fones, aVal))));
+ z = aVal;
+ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+ x = _mm256_add_ps(
+ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++) {
+ x = _mm256_add_ps(x,
+ _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
+ }
+ x = _mm256_div_ps(fones, x);
+ y = fzeroes;
+ for (j = ASIN_TERMS - 1; j >= 0; j--) {
+ y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
+ _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+ }
+
+ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+ y = _mm256_add_ps(
+ y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
+ arcsine = y;
+ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+ arcsine = _mm256_sub_ps(arcsine,
+ _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
+
+ _mm256_store_ps(bPtr, arcsine);
+ aPtr += 8;
+ bPtr += 8;
}
- x = _mm256_div_ps(fones, x);
- y = fzeroes;
- for(j = ASIN_TERMS - 1; j >=0 ; j--){
- y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
- }
-
- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
-
- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
- arcsine = y;
- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
- arcsine = _mm256_sub_ps(arcsine, _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
- _mm256_store_ps(bPtr, arcsine);
- aPtr += 8;
- bPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *bPtr++ = asin(*aPtr++);
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = asin(*aPtr++);
+ }
}
#endif /* LV_HAVE_AVX for aligned */
static inline void
volk_32f_asin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int quarterPoints = num_points / 4;
- int i, j;
-
- __m128 aVal, pio2, x, y, z, arcsine;
- __m128 fzeroes, fones, ftwos, ffours, condition;
-
- pio2 = _mm_set1_ps(3.14159265358979323846/2);
- fzeroes = _mm_setzero_ps();
- fones = _mm_set1_ps(1.0);
- ftwos = _mm_set1_ps(2.0);
- ffours = _mm_set1_ps(4.0);
-
- for(;number < quarterPoints; number++){
- aVal = _mm_load_ps(aPtr);
- aVal = _mm_div_ps(aVal, _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
- z = aVal;
- condition = _mm_cmplt_ps(z, fzeroes);
- z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
- condition = _mm_cmplt_ps(z, fones);
- x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
-
- for(i = 0; i < 2; i++){
- x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ int i, j;
+
+ __m128 aVal, pio2, x, y, z, arcsine;
+ __m128 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm_setzero_ps();
+ fones = _mm_set1_ps(1.0);
+ ftwos = _mm_set1_ps(2.0);
+ ffours = _mm_set1_ps(4.0);
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
+ aVal = _mm_div_ps(
+ aVal,
+ _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
+ z = aVal;
+ condition = _mm_cmplt_ps(z, fzeroes);
+ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+ condition = _mm_cmplt_ps(z, fones);
+ x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++) {
+ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+ }
+ x = _mm_div_ps(fones, x);
+ y = fzeroes;
+ for (j = ASIN_TERMS - 1; j >= 0; j--) {
+ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
+ _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
+ }
+
+ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+ condition = _mm_cmpgt_ps(z, fones);
+
+ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
+ arcsine = y;
+ condition = _mm_cmplt_ps(aVal, fzeroes);
+ arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
+
+ _mm_store_ps(bPtr, arcsine);
+ aPtr += 4;
+ bPtr += 4;
}
- x = _mm_div_ps(fones, x);
- y = fzeroes;
- for(j = ASIN_TERMS - 1; j >=0 ; j--){
- y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
- }
-
- y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
- condition = _mm_cmpgt_ps(z, fones);
-
- y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
- arcsine = y;
- condition = _mm_cmplt_ps(aVal, fzeroes);
- arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
-
- _mm_store_ps(bPtr, arcsine);
- aPtr += 4;
- bPtr += 4;
- }
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *bPtr++ = asinf(*aPtr++);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *bPtr++ = asinf(*aPtr++);
+ }
}
#endif /* LV_HAVE_SSE4_1 for aligned */
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
-static inline void
-volk_32f_asin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_asin_32f_u_avx2_fma(float* bVector,
+ const float* aVector,
+ unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int eighthPoints = num_points / 8;
- int i, j;
-
- __m256 aVal, pio2, x, y, z, arcsine;
- __m256 fzeroes, fones, ftwos, ffours, condition;
-
- pio2 = _mm256_set1_ps(3.14159265358979323846/2);
- fzeroes = _mm256_setzero_ps();
- fones = _mm256_set1_ps(1.0);
- ftwos = _mm256_set1_ps(2.0);
- ffours = _mm256_set1_ps(4.0);
-
- for(;number < eighthPoints; number++){
- aVal = _mm256_loadu_ps(aPtr);
- aVal = _mm256_div_ps(aVal, _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))));
- z = aVal;
- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
- for(i = 0; i < 2; i++){
- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones)));
- }
- x = _mm256_div_ps(fones, x);
- y = fzeroes;
- for(j = ASIN_TERMS - 1; j >=0 ; j--){
- y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ int i, j;
+
+ __m256 aVal, pio2, x, y, z, arcsine;
+ __m256 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm256_setzero_ps();
+ fones = _mm256_set1_ps(1.0);
+ ftwos = _mm256_set1_ps(2.0);
+ ffours = _mm256_set1_ps(4.0);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ aVal = _mm256_div_ps(aVal,
+ _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
+ _mm256_sub_ps(fones, aVal))));
+ z = aVal;
+ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+ x = _mm256_add_ps(
+ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++) {
+ x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
+ }
+ x = _mm256_div_ps(fones, x);
+ y = fzeroes;
+ for (j = ASIN_TERMS - 1; j >= 0; j--) {
+ y = _mm256_fmadd_ps(
+ y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+ }
+
+ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+ y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
+ arcsine = y;
+ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+ arcsine = _mm256_sub_ps(arcsine,
+ _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
+
+ _mm256_storeu_ps(bPtr, arcsine);
+ aPtr += 8;
+ bPtr += 8;
}
- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
-
- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition));
- arcsine = y;
- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
- arcsine = _mm256_sub_ps(arcsine, _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
-
- _mm256_storeu_ps(bPtr, arcsine);
- aPtr += 8;
- bPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *bPtr++ = asin(*aPtr++);
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = asin(*aPtr++);
+ }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
static inline void
volk_32f_asin_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int eighthPoints = num_points / 8;
- int i, j;
-
- __m256 aVal, pio2, x, y, z, arcsine;
- __m256 fzeroes, fones, ftwos, ffours, condition;
-
- pio2 = _mm256_set1_ps(3.14159265358979323846/2);
- fzeroes = _mm256_setzero_ps();
- fones = _mm256_set1_ps(1.0);
- ftwos = _mm256_set1_ps(2.0);
- ffours = _mm256_set1_ps(4.0);
-
- for(;number < eighthPoints; number++){
- aVal = _mm256_loadu_ps(aPtr);
- aVal = _mm256_div_ps(aVal, _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))));
- z = aVal;
- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
- for(i = 0; i < 2; i++){
- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ int i, j;
+
+ __m256 aVal, pio2, x, y, z, arcsine;
+ __m256 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm256_setzero_ps();
+ fones = _mm256_set1_ps(1.0);
+ ftwos = _mm256_set1_ps(2.0);
+ ffours = _mm256_set1_ps(4.0);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ aVal = _mm256_div_ps(aVal,
+ _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
+ _mm256_sub_ps(fones, aVal))));
+ z = aVal;
+ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+ x = _mm256_add_ps(
+ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++) {
+ x = _mm256_add_ps(x,
+ _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
+ }
+ x = _mm256_div_ps(fones, x);
+ y = fzeroes;
+ for (j = ASIN_TERMS - 1; j >= 0; j--) {
+ y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
+ _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+ }
+
+ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+ y = _mm256_add_ps(
+ y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
+ arcsine = y;
+ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+ arcsine = _mm256_sub_ps(arcsine,
+ _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
+
+ _mm256_storeu_ps(bPtr, arcsine);
+ aPtr += 8;
+ bPtr += 8;
}
- x = _mm256_div_ps(fones, x);
- y = fzeroes;
- for(j = ASIN_TERMS - 1; j >=0 ; j--){
- y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
- }
-
- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
- arcsine = y;
- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
- arcsine = _mm256_sub_ps(arcsine, _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
-
- _mm256_storeu_ps(bPtr, arcsine);
- aPtr += 8;
- bPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *bPtr++ = asin(*aPtr++);
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = asin(*aPtr++);
+ }
}
#endif /* LV_HAVE_AVX for unaligned */
static inline void
volk_32f_asin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int quarterPoints = num_points / 4;
- int i, j;
-
- __m128 aVal, pio2, x, y, z, arcsine;
- __m128 fzeroes, fones, ftwos, ffours, condition;
-
- pio2 = _mm_set1_ps(3.14159265358979323846/2);
- fzeroes = _mm_setzero_ps();
- fones = _mm_set1_ps(1.0);
- ftwos = _mm_set1_ps(2.0);
- ffours = _mm_set1_ps(4.0);
-
- for(;number < quarterPoints; number++){
- aVal = _mm_loadu_ps(aPtr);
- aVal = _mm_div_ps(aVal, _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
- z = aVal;
- condition = _mm_cmplt_ps(z, fzeroes);
- z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
- condition = _mm_cmplt_ps(z, fones);
- x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
-
- for(i = 0; i < 2; i++){
- x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ int i, j;
+
+ __m128 aVal, pio2, x, y, z, arcsine;
+ __m128 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm_setzero_ps();
+ fones = _mm_set1_ps(1.0);
+ ftwos = _mm_set1_ps(2.0);
+ ffours = _mm_set1_ps(4.0);
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_loadu_ps(aPtr);
+ aVal = _mm_div_ps(
+ aVal,
+ _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
+ z = aVal;
+ condition = _mm_cmplt_ps(z, fzeroes);
+ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+ condition = _mm_cmplt_ps(z, fones);
+ x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++) {
+ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+ }
+ x = _mm_div_ps(fones, x);
+ y = fzeroes;
+ for (j = ASIN_TERMS - 1; j >= 0; j--) {
+ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
+ _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
+ }
+
+ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+ condition = _mm_cmpgt_ps(z, fones);
+
+ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
+ arcsine = y;
+ condition = _mm_cmplt_ps(aVal, fzeroes);
+ arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
+
+ _mm_storeu_ps(bPtr, arcsine);
+ aPtr += 4;
+ bPtr += 4;
}
- x = _mm_div_ps(fones, x);
- y = fzeroes;
- for(j = ASIN_TERMS - 1; j >=0 ; j--){
- y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
- }
-
- y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
- condition = _mm_cmpgt_ps(z, fones);
- y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
- arcsine = y;
- condition = _mm_cmplt_ps(aVal, fzeroes);
- arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
-
- _mm_storeu_ps(bPtr, arcsine);
- aPtr += 4;
- bPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *bPtr++ = asinf(*aPtr++);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *bPtr++ = asinf(*aPtr++);
+ }
}
#endif /* LV_HAVE_SSE4_1 for unaligned */
static inline void
volk_32f_asin_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
- unsigned int number = 0;
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
- for(number = 0; number < num_points; number++){
- *bPtr++ = asinf(*aPtr++);
- }
+ for (number = 0; number < num_points; number++) {
+ *bPtr++ = asinf(*aPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
* \endcode
*/
-#include <stdio.h>
-#include <math.h>
#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
-/* This is the number of terms of Taylor series to evaluate, increase this for more accuracy*/
+/* This is the number of terms of Taylor series to evaluate, increase this for more
+ * accuracy*/
#define TERMS 2
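For reference, a minimal scalar sketch of the argument reduction the atan kernels below wrap around this TERMS-term series (the helper name atan_reference is illustrative): the sign is folded out, arguments above 1 go through atan(a) = pi/2 - atan(1/a), the angle is halved twice via atan(1/x) = 2*atan(1/(x + sqrt(1 + x*x))), and the series result is scaled by 4 before the folds are undone.

#include <math.h>

static float atan_reference(float a)
{
    const float pio2 = 3.14159265358979323846f / 2.0f;
    float z = fabsf(a);
    float x = (z < 1.0f) ? 1.0f / z : z; /* cotangent of the reduced angle, >= 1 */
    float y = 0.0f;
    int i, j;

    for (i = 0; i < 2; i++)
        x += sqrtf(1.0f + x * x);
    x = 1.0f / x; /* now 0 <= x <= tan(pi/16), so few series terms suffice */
    for (j = TERMS - 1; j >= 0; j--)
        y = y * x * x + ((j % 2) ? -1.0f : 1.0f) / (2.0f * j + 1.0f);
    y *= x * 4.0f;
    if (z > 1.0f)
        y = pio2 - y;
    return (a < 0.0f) ? -y : y;
}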
#ifndef INCLUDED_volk_32f_atan_32f_a_H
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
-static inline void
-volk_32f_atan_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_atan_32f_a_avx2_fma(float* bVector,
+ const float* aVector,
+ unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int eighthPoints = num_points / 8;
- int i, j;
-
- __m256 aVal, pio2, x, y, z, arctangent;
- __m256 fzeroes, fones, ftwos, ffours, condition;
-
- pio2 = _mm256_set1_ps(3.14159265358979323846/2);
- fzeroes = _mm256_setzero_ps();
- fones = _mm256_set1_ps(1.0);
- ftwos = _mm256_set1_ps(2.0);
- ffours = _mm256_set1_ps(4.0);
-
- for(;number < eighthPoints; number++){
- aVal = _mm256_load_ps(aPtr);
- z = aVal;
- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
- for(i = 0; i < 2; i++){
- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones)));
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ int i, j;
+
+ __m256 aVal, pio2, x, y, z, arctangent;
+ __m256 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm256_setzero_ps();
+ fones = _mm256_set1_ps(1.0);
+ ftwos = _mm256_set1_ps(2.0);
+ ffours = _mm256_set1_ps(4.0);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ z = aVal;
+ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+ x = _mm256_add_ps(
+ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++) {
+ x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
+ }
+ x = _mm256_div_ps(fones, x);
+ y = fzeroes;
+ for (j = TERMS - 1; j >= 0; j--) {
+ y = _mm256_fmadd_ps(
+ y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+ }
+
+ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+ y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
+ arctangent = y;
+ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+ arctangent = _mm256_sub_ps(
+ arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
+
+ _mm256_store_ps(bPtr, arctangent);
+ aPtr += 8;
+ bPtr += 8;
}
- x = _mm256_div_ps(fones, x);
- y = fzeroes;
- for(j = TERMS - 1; j >=0 ; j--){
- y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
- }
-
- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
-
- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition));
- arctangent = y;
- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
- arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
-
- _mm256_store_ps(bPtr, arctangent);
- aPtr += 8;
- bPtr += 8;
- }
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *bPtr++ = atan(*aPtr++);
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = atan(*aPtr++);
+ }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
static inline void
volk_32f_atan_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int eighthPoints = num_points / 8;
- int i, j;
-
- __m256 aVal, pio2, x, y, z, arctangent;
- __m256 fzeroes, fones, ftwos, ffours, condition;
-
- pio2 = _mm256_set1_ps(3.14159265358979323846/2);
- fzeroes = _mm256_setzero_ps();
- fones = _mm256_set1_ps(1.0);
- ftwos = _mm256_set1_ps(2.0);
- ffours = _mm256_set1_ps(4.0);
-
- for(;number < eighthPoints; number++){
- aVal = _mm256_load_ps(aPtr);
- z = aVal;
- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
- for(i = 0; i < 2; i++){
- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
- }
- x = _mm256_div_ps(fones, x);
- y = fzeroes;
- for(j = TERMS - 1; j >=0 ; j--){
- y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ int i, j;
+
+ __m256 aVal, pio2, x, y, z, arctangent;
+ __m256 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm256_setzero_ps();
+ fones = _mm256_set1_ps(1.0);
+ ftwos = _mm256_set1_ps(2.0);
+ ffours = _mm256_set1_ps(4.0);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ z = aVal;
+ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+ x = _mm256_add_ps(
+ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++) {
+ x = _mm256_add_ps(x,
+ _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
+ }
+ x = _mm256_div_ps(fones, x);
+ y = fzeroes;
+ for (j = TERMS - 1; j >= 0; j--) {
+ y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
+ _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+ }
+
+ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+ y = _mm256_add_ps(
+ y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
+ arctangent = y;
+ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+ arctangent = _mm256_sub_ps(
+ arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
+
+ _mm256_store_ps(bPtr, arctangent);
+ aPtr += 8;
+ bPtr += 8;
}
- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
-
- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
- arctangent = y;
- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
- arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
-
- _mm256_store_ps(bPtr, arctangent);
- aPtr += 8;
- bPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *bPtr++ = atan(*aPtr++);
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = atan(*aPtr++);
+ }
}
#endif /* LV_HAVE_AVX for aligned */
static inline void
volk_32f_atan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int quarterPoints = num_points / 4;
- int i, j;
-
- __m128 aVal, pio2, x, y, z, arctangent;
- __m128 fzeroes, fones, ftwos, ffours, condition;
-
- pio2 = _mm_set1_ps(3.14159265358979323846/2);
- fzeroes = _mm_setzero_ps();
- fones = _mm_set1_ps(1.0);
- ftwos = _mm_set1_ps(2.0);
- ffours = _mm_set1_ps(4.0);
-
- for(;number < quarterPoints; number++){
- aVal = _mm_load_ps(aPtr);
- z = aVal;
- condition = _mm_cmplt_ps(z, fzeroes);
- z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
- condition = _mm_cmplt_ps(z, fones);
- x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
-
- for(i = 0; i < 2; i++){
- x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
- }
- x = _mm_div_ps(fones, x);
- y = fzeroes;
- for(j = TERMS - 1; j >=0 ; j--){
- y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ int i, j;
+
+ __m128 aVal, pio2, x, y, z, arctangent;
+ __m128 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm_setzero_ps();
+ fones = _mm_set1_ps(1.0);
+ ftwos = _mm_set1_ps(2.0);
+ ffours = _mm_set1_ps(4.0);
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
+ z = aVal;
+ condition = _mm_cmplt_ps(z, fzeroes);
+ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+ condition = _mm_cmplt_ps(z, fones);
+ x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++) {
+ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+ }
+ x = _mm_div_ps(fones, x);
+ y = fzeroes;
+ for (j = TERMS - 1; j >= 0; j--) {
+ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
+ _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
+ }
+
+ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+ condition = _mm_cmpgt_ps(z, fones);
+
+ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
+ arctangent = y;
+ condition = _mm_cmplt_ps(aVal, fzeroes);
+ arctangent =
+ _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
+
+ _mm_store_ps(bPtr, arctangent);
+ aPtr += 4;
+ bPtr += 4;
}
- y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
- condition = _mm_cmpgt_ps(z, fones);
-
- y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
- arctangent = y;
- condition = _mm_cmplt_ps(aVal, fzeroes);
- arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
-
- _mm_store_ps(bPtr, arctangent);
- aPtr += 4;
- bPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *bPtr++ = atanf(*aPtr++);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *bPtr++ = atanf(*aPtr++);
+ }
}
#endif /* LV_HAVE_SSE4_1 for aligned */
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
-static inline void
-volk_32f_atan_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_atan_32f_u_avx2_fma(float* bVector,
+ const float* aVector,
+ unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int eighthPoints = num_points / 8;
- int i, j;
-
- __m256 aVal, pio2, x, y, z, arctangent;
- __m256 fzeroes, fones, ftwos, ffours, condition;
-
- pio2 = _mm256_set1_ps(3.14159265358979323846/2);
- fzeroes = _mm256_setzero_ps();
- fones = _mm256_set1_ps(1.0);
- ftwos = _mm256_set1_ps(2.0);
- ffours = _mm256_set1_ps(4.0);
-
- for(;number < eighthPoints; number++){
- aVal = _mm256_loadu_ps(aPtr);
- z = aVal;
- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
- for(i = 0; i < 2; i++){
- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones)));
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ int i, j;
+
+ __m256 aVal, pio2, x, y, z, arctangent;
+ __m256 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm256_setzero_ps();
+ fones = _mm256_set1_ps(1.0);
+ ftwos = _mm256_set1_ps(2.0);
+ ffours = _mm256_set1_ps(4.0);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ z = aVal;
+ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+ x = _mm256_add_ps(
+ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++) {
+ x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
+ }
+ x = _mm256_div_ps(fones, x);
+ y = fzeroes;
+ for (j = TERMS - 1; j >= 0; j--) {
+ y = _mm256_fmadd_ps(
+ y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+ }
+
+ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+ y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
+ arctangent = y;
+ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+ arctangent = _mm256_sub_ps(
+ arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
+
+ _mm256_storeu_ps(bPtr, arctangent);
+ aPtr += 8;
+ bPtr += 8;
}
- x = _mm256_div_ps(fones, x);
- y = fzeroes;
- for(j = TERMS - 1; j >=0 ; j--){
- y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
- }
-
- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition));
- arctangent = y;
- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
- arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
-
- _mm256_storeu_ps(bPtr, arctangent);
- aPtr += 8;
- bPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *bPtr++ = atan(*aPtr++);
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = atan(*aPtr++);
+ }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
static inline void
volk_32f_atan_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int eighthPoints = num_points / 8;
- int i, j;
-
- __m256 aVal, pio2, x, y, z, arctangent;
- __m256 fzeroes, fones, ftwos, ffours, condition;
-
- pio2 = _mm256_set1_ps(3.14159265358979323846/2);
- fzeroes = _mm256_setzero_ps();
- fones = _mm256_set1_ps(1.0);
- ftwos = _mm256_set1_ps(2.0);
- ffours = _mm256_set1_ps(4.0);
-
- for(;number < eighthPoints; number++){
- aVal = _mm256_loadu_ps(aPtr);
- z = aVal;
- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
- for(i = 0; i < 2; i++){
- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
- }
- x = _mm256_div_ps(fones, x);
- y = fzeroes;
- for(j = TERMS - 1; j >=0 ; j--){
- y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ int i, j;
+
+ __m256 aVal, pio2, x, y, z, arctangent;
+ __m256 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm256_setzero_ps();
+ fones = _mm256_set1_ps(1.0);
+ ftwos = _mm256_set1_ps(2.0);
+ ffours = _mm256_set1_ps(4.0);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ z = aVal;
+ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+ x = _mm256_add_ps(
+ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++) {
+ x = _mm256_add_ps(x,
+ _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
+ }
+ x = _mm256_div_ps(fones, x);
+ y = fzeroes;
+ for (j = TERMS - 1; j >= 0; j--) {
+ y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
+ _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+ }
+
+ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+ y = _mm256_add_ps(
+ y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
+ arctangent = y;
+ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+ arctangent = _mm256_sub_ps(
+ arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
+
+ _mm256_storeu_ps(bPtr, arctangent);
+ aPtr += 8;
+ bPtr += 8;
}
- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
-
- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
- arctangent = y;
- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
- arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
-
- _mm256_storeu_ps(bPtr, arctangent);
- aPtr += 8;
- bPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *bPtr++ = atan(*aPtr++);
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = atan(*aPtr++);
+ }
}
#endif /* LV_HAVE_AVX for unaligned */
static inline void
volk_32f_atan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int quarterPoints = num_points / 4;
- int i, j;
-
- __m128 aVal, pio2, x, y, z, arctangent;
- __m128 fzeroes, fones, ftwos, ffours, condition;
-
- pio2 = _mm_set1_ps(3.14159265358979323846/2);
- fzeroes = _mm_setzero_ps();
- fones = _mm_set1_ps(1.0);
- ftwos = _mm_set1_ps(2.0);
- ffours = _mm_set1_ps(4.0);
-
- for(;number < quarterPoints; number++){
- aVal = _mm_loadu_ps(aPtr);
- z = aVal;
- condition = _mm_cmplt_ps(z, fzeroes);
- z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
- condition = _mm_cmplt_ps(z, fones);
- x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
-
- for(i = 0; i < 2; i++)
- x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
- x = _mm_div_ps(fones, x);
- y = fzeroes;
- for(j = TERMS - 1; j >= 0; j--)
- y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
-
- y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
- condition = _mm_cmpgt_ps(z, fones);
-
- y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
- arctangent = y;
- condition = _mm_cmplt_ps(aVal, fzeroes);
- arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
-
- _mm_storeu_ps(bPtr, arctangent);
- aPtr += 4;
- bPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *bPtr++ = atanf(*aPtr++);
- }
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ int i, j;
+
+ __m128 aVal, pio2, x, y, z, arctangent;
+ __m128 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm_setzero_ps();
+ fones = _mm_set1_ps(1.0);
+ ftwos = _mm_set1_ps(2.0);
+ ffours = _mm_set1_ps(4.0);
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_loadu_ps(aPtr);
+ z = aVal;
+ condition = _mm_cmplt_ps(z, fzeroes);
+ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+ condition = _mm_cmplt_ps(z, fones);
+ x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++)
+ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+ x = _mm_div_ps(fones, x);
+ y = fzeroes;
+ for (j = TERMS - 1; j >= 0; j--)
+ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
+ _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
+
+ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+ condition = _mm_cmpgt_ps(z, fones);
+
+ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
+ arctangent = y;
+ condition = _mm_cmplt_ps(aVal, fzeroes);
+ arctangent =
+ _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
+
+ _mm_storeu_ps(bPtr, arctangent);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *bPtr++ = atanf(*aPtr++);
+ }
}
#endif /* LV_HAVE_SSE4_1 for unaligned */
static inline void
volk_32f_atan_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
- unsigned int number = 0;
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
- for(number = 0; number < num_points; number++){
- *bPtr++ = atanf(*aPtr++);
- }
+ for (number = 0; number < num_points; number++) {
+ *bPtr++ = atanf(*aPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_binary_slicer_32i(int* cVector, const float* aVector, unsigned int num_points)
- * \endcode
+ * void volk_32f_binary_slicer_32i(int* cVector, const float* aVector, unsigned int
+ * num_points) \endcode
*
* \b Inputs
* \li aVector: The input vector of floats.
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_binary_slicer_32i_generic(int* cVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_binary_slicer_32i_generic(int* cVector,
+ const float* aVector,
+ unsigned int num_points)
{
- int* cPtr = cVector;
- const float* aPtr = aVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- if( *aPtr++ >= 0) {
- *cPtr++ = 1;
- }
- else {
- *cPtr++ = 0;
+ int* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ if (*aPtr++ >= 0) {
+ *cPtr++ = 1;
+ } else {
+ *cPtr++ = 0;
+ }
}
- }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_binary_slicer_32i_generic_branchless(int* cVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_binary_slicer_32i_generic_branchless(int* cVector,
+ const float* aVector,
+ unsigned int num_points)
{
- int* cPtr = cVector;
- const float* aPtr = aVector;
- unsigned int number = 0;
+ int* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
- for(number = 0; number < num_points; number++){
- *cPtr++ = (*aPtr++ >= 0);
- }
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++ >= 0);
+ }
}
#endif /* LV_HAVE_GENERIC */
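A minimal usage sketch of the slicer (array contents illustrative), calling the generic kernel defined above directly: every float that is >= 0 maps to 1 and everything else maps to 0.

#include <stdio.h>

int main(void)
{
    const float soft[6] = { 0.7f, -1.2f, 0.0f, 3.4f, -0.1f, 2.5f };
    int bits[6];
    unsigned int n;

    volk_32f_binary_slicer_32i_generic(bits, soft, 6);

    for (n = 0; n < 6; n++)
        printf("%d", bits[n]); /* prints 101101 */
    printf("\n");
    return 0;
}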
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void
-volk_32f_binary_slicer_32i_a_sse2(int* cVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_binary_slicer_32i_a_sse2(int* cVector,
+ const float* aVector,
+ unsigned int num_points)
{
- int* cPtr = cVector;
- const float* aPtr = aVector;
- unsigned int number = 0;
-
- unsigned int quarter_points = num_points / 4;
- __m128 a_val, res_f;
- __m128i res_i, binary_i;
- __m128 zero_val;
- zero_val = _mm_set1_ps (0.0f);
+ int* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
- for(number = 0; number < quarter_points; number++){
- a_val = _mm_load_ps(aPtr);
+ unsigned int quarter_points = num_points / 4;
+ __m128 a_val, res_f;
+ __m128i res_i, binary_i;
+ __m128 zero_val;
+ zero_val = _mm_set1_ps(0.0f);
- res_f = _mm_cmpge_ps (a_val, zero_val);
- res_i = _mm_cvtps_epi32 (res_f);
- binary_i = _mm_srli_epi32 (res_i, 31);
+ for (number = 0; number < quarter_points; number++) {
+ a_val = _mm_load_ps(aPtr);
- _mm_store_si128((__m128i*)cPtr, binary_i);
+ res_f = _mm_cmpge_ps(a_val, zero_val);
+ res_i = _mm_cvtps_epi32(res_f);
+ binary_i = _mm_srli_epi32(res_i, 31);
- cPtr += 4;
- aPtr += 4;
- }
+ _mm_store_si128((__m128i*)cPtr, binary_i);
- for(number = quarter_points * 4; number < num_points; number++){
- if( *aPtr++ >= 0) {
- *cPtr++ = 1;
+ cPtr += 4;
+ aPtr += 4;
}
- else {
- *cPtr++ = 0;
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ if (*aPtr++ >= 0) {
+ *cPtr++ = 1;
+ } else {
+ *cPtr++ = 0;
+ }
}
- }
}
#endif /* LV_HAVE_SSE2 */
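The SSE2 kernels above lean on the fact that a true comparison lane is the all-ones bit pattern; converting it with `cvtps_epi32` yields the integer-indefinite value 0x80000000, so a logical right shift by 31 leaves exactly 1. A small standalone check of that reading (illustrative only; it relies on Intel's documented conversion of NaN/out-of-range values to 0x80000000):

```c
#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128 a = _mm_set_ps(-2.0f, -0.0f, 0.0f, 3.0f); /* elements 3..0 */
    __m128 mask = _mm_cmpge_ps(a, _mm_setzero_ps());
    __m128i bit = _mm_srli_epi32(_mm_cvtps_epi32(mask), 31);
    int out[4];
    _mm_storeu_si128((__m128i*)out, bit);
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* expect 1 1 1 0 */
    return 0;
}
```

The AVX variant below takes a different route, masking the comparison result with 1.0f and converting that, presumably because 256-bit integer shifts are only available with AVX2.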
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32f_binary_slicer_32i_a_avx(int* cVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_binary_slicer_32i_a_avx(int* cVector,
+ const float* aVector,
+ unsigned int num_points)
{
- int* cPtr = cVector;
- const float* aPtr = aVector;
- unsigned int number = 0;
+ int* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
- unsigned int quarter_points = num_points / 8;
- __m256 a_val, res_f, binary_f;
- __m256i binary_i;
- __m256 zero_val, one_val;
- zero_val = _mm256_set1_ps (0.0f);
- one_val = _mm256_set1_ps (1.0f);
+ unsigned int quarter_points = num_points / 8;
+ __m256 a_val, res_f, binary_f;
+ __m256i binary_i;
+ __m256 zero_val, one_val;
+ zero_val = _mm256_set1_ps(0.0f);
+ one_val = _mm256_set1_ps(1.0f);
- for(number = 0; number < quarter_points; number++){
- a_val = _mm256_load_ps(aPtr);
+ for (number = 0; number < quarter_points; number++) {
+ a_val = _mm256_load_ps(aPtr);
- res_f = _mm256_cmp_ps (a_val, zero_val, _CMP_GE_OS);
- binary_f = _mm256_and_ps (res_f, one_val);
- binary_i = _mm256_cvtps_epi32(binary_f);
+ res_f = _mm256_cmp_ps(a_val, zero_val, _CMP_GE_OS);
+ binary_f = _mm256_and_ps(res_f, one_val);
+ binary_i = _mm256_cvtps_epi32(binary_f);
- _mm256_store_si256((__m256i *)cPtr, binary_i);
+ _mm256_store_si256((__m256i*)cPtr, binary_i);
- cPtr += 8;
- aPtr += 8;
- }
-
- for(number = quarter_points * 8; number < num_points; number++){
- if( *aPtr++ >= 0) {
- *cPtr++ = 1;
+ cPtr += 8;
+ aPtr += 8;
}
- else {
- *cPtr++ = 0;
+
+ for (number = quarter_points * 8; number < num_points; number++) {
+ if (*aPtr++ >= 0) {
+ *cPtr++ = 1;
+ } else {
+ *cPtr++ = 0;
+ }
}
- }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void
-volk_32f_binary_slicer_32i_u_sse2(int* cVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_binary_slicer_32i_u_sse2(int* cVector,
+ const float* aVector,
+ unsigned int num_points)
{
- int* cPtr = cVector;
- const float* aPtr = aVector;
- unsigned int number = 0;
-
- unsigned int quarter_points = num_points / 4;
- __m128 a_val, res_f;
- __m128i res_i, binary_i;
- __m128 zero_val;
- zero_val = _mm_set1_ps (0.0f);
+ int* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
- for(number = 0; number < quarter_points; number++){
- a_val = _mm_loadu_ps(aPtr);
+ unsigned int quarter_points = num_points / 4;
+ __m128 a_val, res_f;
+ __m128i res_i, binary_i;
+ __m128 zero_val;
+ zero_val = _mm_set1_ps(0.0f);
- res_f = _mm_cmpge_ps (a_val, zero_val);
- res_i = _mm_cvtps_epi32 (res_f);
- binary_i = _mm_srli_epi32 (res_i, 31);
+ for (number = 0; number < quarter_points; number++) {
+ a_val = _mm_loadu_ps(aPtr);
- _mm_storeu_si128((__m128i*)cPtr, binary_i);
+ res_f = _mm_cmpge_ps(a_val, zero_val);
+ res_i = _mm_cvtps_epi32(res_f);
+ binary_i = _mm_srli_epi32(res_i, 31);
- cPtr += 4;
- aPtr += 4;
- }
+ _mm_storeu_si128((__m128i*)cPtr, binary_i);
- for(number = quarter_points * 4; number < num_points; number++){
- if( *aPtr++ >= 0) {
- *cPtr++ = 1;
+ cPtr += 4;
+ aPtr += 4;
}
- else {
- *cPtr++ = 0;
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ if (*aPtr++ >= 0) {
+ *cPtr++ = 1;
+ } else {
+ *cPtr++ = 0;
+ }
}
- }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32f_binary_slicer_32i_u_avx(int* cVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_binary_slicer_32i_u_avx(int* cVector,
+ const float* aVector,
+ unsigned int num_points)
{
- int* cPtr = cVector;
- const float* aPtr = aVector;
- unsigned int number = 0;
-
- unsigned int quarter_points = num_points / 8;
- __m256 a_val, res_f, binary_f;
- __m256i binary_i;
- __m256 zero_val, one_val;
- zero_val = _mm256_set1_ps (0.0f);
- one_val = _mm256_set1_ps (1.0f);
+ int* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
- for(number = 0; number < quarter_points; number++){
- a_val = _mm256_loadu_ps(aPtr);
+ unsigned int quarter_points = num_points / 8;
+ __m256 a_val, res_f, binary_f;
+ __m256i binary_i;
+ __m256 zero_val, one_val;
+ zero_val = _mm256_set1_ps(0.0f);
+ one_val = _mm256_set1_ps(1.0f);
- res_f = _mm256_cmp_ps (a_val, zero_val, _CMP_GE_OS);
- binary_f = _mm256_and_ps (res_f, one_val);
- binary_i = _mm256_cvtps_epi32(binary_f);
+ for (number = 0; number < quarter_points; number++) {
+ a_val = _mm256_loadu_ps(aPtr);
- _mm256_storeu_si256((__m256i*)cPtr, binary_i);
+ res_f = _mm256_cmp_ps(a_val, zero_val, _CMP_GE_OS);
+ binary_f = _mm256_and_ps(res_f, one_val);
+ binary_i = _mm256_cvtps_epi32(binary_f);
- cPtr += 8;
- aPtr += 8;
- }
+ _mm256_storeu_si256((__m256i*)cPtr, binary_i);
- for(number = quarter_points * 8; number < num_points; number++){
- if( *aPtr++ >= 0) {
- *cPtr++ = 1;
+ cPtr += 8;
+ aPtr += 8;
}
- else {
- *cPtr++ = 0;
+
+ for (number = quarter_points * 8; number < num_points; number++) {
+ if (*aPtr++ >= 0) {
+ *cPtr++ = 1;
+ } else {
+ *cPtr++ = 0;
+ }
}
- }
}
#endif /* LV_HAVE_AVX */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_binary_slicer_8i(int8_t* cVector, const float* aVector, unsigned int num_points)
+ * void volk_32f_binary_slicer_8i(int8_t* cVector, const float* aVector, unsigned int
+ num_points)
* \endcode
*
* \b Inputs
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_binary_slicer_8i_generic(int8_t* cVector, const float* aVector,
- unsigned int num_points)
+static inline void volk_32f_binary_slicer_8i_generic(int8_t* cVector,
+ const float* aVector,
+ unsigned int num_points)
{
- int8_t* cPtr = cVector;
- const float* aPtr = aVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++) {
- if(*aPtr++ >= 0) {
- *cPtr++ = 1;
- }
- else {
- *cPtr++ = 0;
+ int8_t* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ if (*aPtr++ >= 0) {
+ *cPtr++ = 1;
+ } else {
+ *cPtr++ = 0;
+ }
}
- }
}
#endif /* LV_HAVE_GENERIC */
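A corresponding usage sketch for the int8_t dispatcher documented above (function and parameter names are illustrative):

```c
#include <stdint.h>
#include <volk/volk.h>

/* Produce hard 0/1 decisions from soft float values. */
static void slicer_8i_example(const float* soft, int8_t* hard, unsigned int n)
{
    volk_32f_binary_slicer_8i(hard, soft, n);
}
```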
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector, const float* aVector,
- unsigned int num_points)
+static inline void volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector,
+ const float* aVector,
+ unsigned int num_points)
{
- int8_t* cPtr = cVector;
- const float* aPtr = aVector;
- unsigned int number = 0;
+ int8_t* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
- for(number = 0; number < num_points; number++){
- *cPtr++ = (*aPtr++ >= 0);
- }
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++ >= 0);
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_32f_binary_slicer_8i_a_avx2(int8_t* cVector, const float* aVector,
- unsigned int num_points)
+static inline void volk_32f_binary_slicer_8i_a_avx2(int8_t* cVector,
+ const float* aVector,
+ unsigned int num_points)
{
- int8_t* cPtr = cVector;
- const float* aPtr = aVector;
- unsigned int number = 0;
- unsigned int n32points = num_points / 32;
-
- const __m256 zero_val = _mm256_set1_ps(0.0f);
- __m256 a0_val, a1_val, a2_val, a3_val;
- __m256 res0_f, res1_f, res2_f, res3_f;
- __m256i res0_i, res1_i, res2_i, res3_i;
- __m256i byte_shuffle = _mm256_set_epi8( 15, 14, 13, 12, 7, 6, 5, 4,
- 11, 10, 9, 8, 3, 2, 1, 0,
- 15, 14, 13, 12, 7, 6, 5, 4,
- 11, 10, 9, 8, 3, 2, 1, 0);
-
- for(number = 0; number < n32points; number++) {
- a0_val = _mm256_load_ps(aPtr);
- a1_val = _mm256_load_ps(aPtr+8);
- a2_val = _mm256_load_ps(aPtr+16);
- a3_val = _mm256_load_ps(aPtr+24);
-
- // compare >= 0; return float
- res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
- res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
- res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
- res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);
-
- // convert to 32i and >> 31
- res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
- res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
- res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
- res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);
-
- // pack in to 16-bit results
- res0_i = _mm256_packs_epi32(res0_i, res1_i);
- res2_i = _mm256_packs_epi32(res2_i, res3_i);
- // pack in to 8-bit results
- // res0: (after packs_epi32)
- // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
- // res2:
- // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
- res0_i = _mm256_packs_epi16(res0_i, res2_i);
- // shuffle the lanes
- // res0: (after packs_epi16)
- // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3
- // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7
- // 0, 2, 1, 3 -> 11 01 10 00 (0xd8)
- res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);
-
- // shuffle bytes within lanes
- // res0: (after shuffle_epi8)
- // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
- // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
- res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);
-
- _mm256_store_si256((__m256i*)cPtr, res0_i);
- aPtr += 32;
- cPtr += 32;
- }
-
- for(number = n32points * 32; number < num_points; number++) {
- if( *aPtr++ >= 0) {
- *cPtr++ = 1;
+ int8_t* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+ unsigned int n32points = num_points / 32;
+
+ const __m256 zero_val = _mm256_set1_ps(0.0f);
+ __m256 a0_val, a1_val, a2_val, a3_val;
+ __m256 res0_f, res1_f, res2_f, res3_f;
+ __m256i res0_i, res1_i, res2_i, res3_i;
+ __m256i byte_shuffle = _mm256_set_epi8(15,
+ 14,
+ 13,
+ 12,
+ 7,
+ 6,
+ 5,
+ 4,
+ 11,
+ 10,
+ 9,
+ 8,
+ 3,
+ 2,
+ 1,
+ 0,
+ 15,
+ 14,
+ 13,
+ 12,
+ 7,
+ 6,
+ 5,
+ 4,
+ 11,
+ 10,
+ 9,
+ 8,
+ 3,
+ 2,
+ 1,
+ 0);
+
+ for (number = 0; number < n32points; number++) {
+ a0_val = _mm256_load_ps(aPtr);
+ a1_val = _mm256_load_ps(aPtr + 8);
+ a2_val = _mm256_load_ps(aPtr + 16);
+ a3_val = _mm256_load_ps(aPtr + 24);
+
+ // compare >= 0; return float
+ res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
+ res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
+ res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
+ res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);
+
+ // convert to 32i and >> 31
+ res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
+ res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
+ res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
+ res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);
+
+ // pack into 16-bit results
+ res0_i = _mm256_packs_epi32(res0_i, res1_i);
+ res2_i = _mm256_packs_epi32(res2_i, res3_i);
+ // pack into 8-bit results
+ // res0: (after packs_epi32)
+ // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
+ // res2:
+ // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
+ res0_i = _mm256_packs_epi16(res0_i, res2_i);
+ // shuffle the lanes
+ // res0: (after packs_epi16)
+ // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3
+ // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7
+ // 0, 2, 1, 3 -> 11 01 10 00 (0xd8)
+ res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);
+
+ // shuffle bytes within lanes
+ // res0: (after shuffle_epi8)
+ // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
+ // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
+ res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);
+
+ _mm256_store_si256((__m256i*)cPtr, res0_i);
+ aPtr += 32;
+ cPtr += 32;
}
- else {
- *cPtr++ = 0;
+
+ for (number = n32points * 32; number < num_points; number++) {
+ if (*aPtr++ >= 0) {
+ *cPtr++ = 1;
+ } else {
+ *cPtr++ = 0;
+ }
}
- }
}
#endif
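A brief sketch of how the 0xd8 immediate used above encodes the quadword order noted in the comments: each destination quadword of `_mm256_permute4x64_epi64` takes a 2-bit source index, packed from low bits to high. The helper macro below is hypothetical and for illustration only.

```c
/* Hypothetical helper: build a permute4x64 control from per-destination source indices. */
#define PERM4x64(q0, q1, q2, q3) (((q3) << 6) | ((q2) << 4) | ((q1) << 2) | (q0))
/* PERM4x64(0, 2, 1, 3) == 0xd8, i.e. destination quadwords take source quadwords 0, 2, 1, 3. */
```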
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_32f_binary_slicer_8i_u_avx2(int8_t* cVector, const float* aVector,
- unsigned int num_points)
+static inline void volk_32f_binary_slicer_8i_u_avx2(int8_t* cVector,
+ const float* aVector,
+ unsigned int num_points)
{
- int8_t* cPtr = cVector;
- const float* aPtr = aVector;
- unsigned int number = 0;
- unsigned int n32points = num_points / 32;
-
- const __m256 zero_val = _mm256_set1_ps(0.0f);
- __m256 a0_val, a1_val, a2_val, a3_val;
- __m256 res0_f, res1_f, res2_f, res3_f;
- __m256i res0_i, res1_i, res2_i, res3_i;
- __m256i byte_shuffle = _mm256_set_epi8( 15, 14, 13, 12, 7, 6, 5, 4,
- 11, 10, 9, 8, 3, 2, 1, 0,
- 15, 14, 13, 12, 7, 6, 5, 4,
- 11, 10, 9, 8, 3, 2, 1, 0);
-
- for(number = 0; number < n32points; number++) {
- a0_val = _mm256_loadu_ps(aPtr);
- a1_val = _mm256_loadu_ps(aPtr+8);
- a2_val = _mm256_loadu_ps(aPtr+16);
- a3_val = _mm256_loadu_ps(aPtr+24);
-
- // compare >= 0; return float
- res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
- res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
- res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
- res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);
-
- // convert to 32i and >> 31
- res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
- res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
- res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
- res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);
-
- // pack in to 16-bit results
- res0_i = _mm256_packs_epi32(res0_i, res1_i);
- res2_i = _mm256_packs_epi32(res2_i, res3_i);
- // pack in to 8-bit results
- // res0: (after packs_epi32)
- // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
- // res2:
- // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
- res0_i = _mm256_packs_epi16(res0_i, res2_i);
- // shuffle the lanes
- // res0: (after packs_epi16)
- // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3
- // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7
- // 0, 2, 1, 3 -> 11 01 10 00 (0xd8)
- res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);
-
- // shuffle bytes within lanes
- // res0: (after shuffle_epi8)
- // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
- // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
- res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);
-
- _mm256_storeu_si256((__m256i*)cPtr, res0_i);
- aPtr += 32;
- cPtr += 32;
- }
-
- for(number = n32points * 32; number < num_points; number++) {
- if( *aPtr++ >= 0) {
- *cPtr++ = 1;
+ int8_t* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+ unsigned int n32points = num_points / 32;
+
+ const __m256 zero_val = _mm256_set1_ps(0.0f);
+ __m256 a0_val, a1_val, a2_val, a3_val;
+ __m256 res0_f, res1_f, res2_f, res3_f;
+ __m256i res0_i, res1_i, res2_i, res3_i;
+ __m256i byte_shuffle = _mm256_set_epi8(15,
+ 14,
+ 13,
+ 12,
+ 7,
+ 6,
+ 5,
+ 4,
+ 11,
+ 10,
+ 9,
+ 8,
+ 3,
+ 2,
+ 1,
+ 0,
+ 15,
+ 14,
+ 13,
+ 12,
+ 7,
+ 6,
+ 5,
+ 4,
+ 11,
+ 10,
+ 9,
+ 8,
+ 3,
+ 2,
+ 1,
+ 0);
+
+ for (number = 0; number < n32points; number++) {
+ a0_val = _mm256_loadu_ps(aPtr);
+ a1_val = _mm256_loadu_ps(aPtr + 8);
+ a2_val = _mm256_loadu_ps(aPtr + 16);
+ a3_val = _mm256_loadu_ps(aPtr + 24);
+
+ // compare >= 0; return float
+ res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
+ res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
+ res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
+ res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);
+
+ // convert to 32i and >> 31
+ res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
+ res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
+ res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
+ res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);
+
+ // pack into 16-bit results
+ res0_i = _mm256_packs_epi32(res0_i, res1_i);
+ res2_i = _mm256_packs_epi32(res2_i, res3_i);
+ // pack into 8-bit results
+ // res0: (after packs_epi32)
+ // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
+ // res2:
+ // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
+ res0_i = _mm256_packs_epi16(res0_i, res2_i);
+ // shuffle the lanes
+ // res0: (after packs_epi16)
+ // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3
+ // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7
+ // 0, 2, 1, 3 -> 11 01 10 00 (0xd8)
+ res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);
+
+ // shuffle bytes within lanes
+ // res0: (after shuffle_epi8)
+ // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
+ // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
+ res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);
+
+ _mm256_storeu_si256((__m256i*)cPtr, res0_i);
+ aPtr += 32;
+ cPtr += 32;
}
- else {
- *cPtr++ = 0;
+
+ for (number = n32points * 32; number < num_points; number++) {
+ if (*aPtr++ >= 0) {
+ *cPtr++ = 1;
+ } else {
+ *cPtr++ = 0;
+ }
}
- }
}
#endif
-
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void
-volk_32f_binary_slicer_8i_a_sse2(int8_t* cVector, const float* aVector,
- unsigned int num_points)
+static inline void volk_32f_binary_slicer_8i_a_sse2(int8_t* cVector,
+ const float* aVector,
+ unsigned int num_points)
{
- int8_t* cPtr = cVector;
- const float* aPtr = aVector;
- unsigned int number = 0;
-
- unsigned int n16points = num_points / 16;
- __m128 a0_val, a1_val, a2_val, a3_val;
- __m128 res0_f, res1_f, res2_f, res3_f;
- __m128i res0_i, res1_i, res2_i, res3_i;
- __m128 zero_val;
- zero_val = _mm_set1_ps(0.0f);
-
- for(number = 0; number < n16points; number++) {
- a0_val = _mm_load_ps(aPtr);
- a1_val = _mm_load_ps(aPtr+4);
- a2_val = _mm_load_ps(aPtr+8);
- a3_val = _mm_load_ps(aPtr+12);
-
- // compare >= 0; return float
- res0_f = _mm_cmpge_ps(a0_val, zero_val);
- res1_f = _mm_cmpge_ps(a1_val, zero_val);
- res2_f = _mm_cmpge_ps(a2_val, zero_val);
- res3_f = _mm_cmpge_ps(a3_val, zero_val);
-
- // convert to 32i and >> 31
- res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
- res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
- res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
- res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);
-
- // pack into 16-bit results
- res0_i = _mm_packs_epi32(res0_i, res1_i);
- res2_i = _mm_packs_epi32(res2_i, res3_i);
-
- // pack into 8-bit results
- res0_i = _mm_packs_epi16(res0_i, res2_i);
-
- _mm_store_si128((__m128i*)cPtr, res0_i);
-
- cPtr += 16;
- aPtr += 16;
- }
-
- for(number = n16points * 16; number < num_points; number++) {
- if( *aPtr++ >= 0) {
- *cPtr++ = 1;
+ int8_t* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ unsigned int n16points = num_points / 16;
+ __m128 a0_val, a1_val, a2_val, a3_val;
+ __m128 res0_f, res1_f, res2_f, res3_f;
+ __m128i res0_i, res1_i, res2_i, res3_i;
+ __m128 zero_val;
+ zero_val = _mm_set1_ps(0.0f);
+
+ for (number = 0; number < n16points; number++) {
+ a0_val = _mm_load_ps(aPtr);
+ a1_val = _mm_load_ps(aPtr + 4);
+ a2_val = _mm_load_ps(aPtr + 8);
+ a3_val = _mm_load_ps(aPtr + 12);
+
+ // compare >= 0; return float
+ res0_f = _mm_cmpge_ps(a0_val, zero_val);
+ res1_f = _mm_cmpge_ps(a1_val, zero_val);
+ res2_f = _mm_cmpge_ps(a2_val, zero_val);
+ res3_f = _mm_cmpge_ps(a3_val, zero_val);
+
+ // convert to 32i and >> 31
+ res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
+ res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
+ res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
+ res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);
+
+ // pack into 16-bit results
+ res0_i = _mm_packs_epi32(res0_i, res1_i);
+ res2_i = _mm_packs_epi32(res2_i, res3_i);
+
+ // pack into 8-bit results
+ res0_i = _mm_packs_epi16(res0_i, res2_i);
+
+ _mm_store_si128((__m128i*)cPtr, res0_i);
+
+ cPtr += 16;
+ aPtr += 16;
}
- else {
- *cPtr++ = 0;
+
+ for (number = n16points * 16; number < num_points; number++) {
+ if (*aPtr++ >= 0) {
+ *cPtr++ = 1;
+ } else {
+ *cPtr++ = 0;
+ }
}
- }
}
#endif /* LV_HAVE_SSE2 */
-
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void
-volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector, const float* aVector,
- unsigned int num_points)
+static inline void volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector,
+ const float* aVector,
+ unsigned int num_points)
{
- int8_t* cPtr = cVector;
- const float* aPtr = aVector;
- unsigned int number = 0;
-
- unsigned int n16points = num_points / 16;
- __m128 a0_val, a1_val, a2_val, a3_val;
- __m128 res0_f, res1_f, res2_f, res3_f;
- __m128i res0_i, res1_i, res2_i, res3_i;
- __m128 zero_val;
- zero_val = _mm_set1_ps (0.0f);
-
- for(number = 0; number < n16points; number++) {
- a0_val = _mm_loadu_ps(aPtr);
- a1_val = _mm_loadu_ps(aPtr+4);
- a2_val = _mm_loadu_ps(aPtr+8);
- a3_val = _mm_loadu_ps(aPtr+12);
-
- // compare >= 0; return float
- res0_f = _mm_cmpge_ps(a0_val, zero_val);
- res1_f = _mm_cmpge_ps(a1_val, zero_val);
- res2_f = _mm_cmpge_ps(a2_val, zero_val);
- res3_f = _mm_cmpge_ps(a3_val, zero_val);
-
- // convert to 32i and >> 31
- res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
- res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
- res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
- res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);
-
- // pack into 16-bit results
- res0_i = _mm_packs_epi32(res0_i, res1_i);
- res2_i = _mm_packs_epi32(res2_i, res3_i);
-
- // pack into 8-bit results
- res0_i = _mm_packs_epi16(res0_i, res2_i);
-
- _mm_storeu_si128((__m128i*)cPtr, res0_i);
-
- cPtr += 16;
- aPtr += 16;
- }
-
- for(number = n16points * 16; number < num_points; number++) {
- if( *aPtr++ >= 0) {
- *cPtr++ = 1;
+ int8_t* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ unsigned int n16points = num_points / 16;
+ __m128 a0_val, a1_val, a2_val, a3_val;
+ __m128 res0_f, res1_f, res2_f, res3_f;
+ __m128i res0_i, res1_i, res2_i, res3_i;
+ __m128 zero_val;
+ zero_val = _mm_set1_ps(0.0f);
+
+ for (number = 0; number < n16points; number++) {
+ a0_val = _mm_loadu_ps(aPtr);
+ a1_val = _mm_loadu_ps(aPtr + 4);
+ a2_val = _mm_loadu_ps(aPtr + 8);
+ a3_val = _mm_loadu_ps(aPtr + 12);
+
+ // compare >= 0; return float
+ res0_f = _mm_cmpge_ps(a0_val, zero_val);
+ res1_f = _mm_cmpge_ps(a1_val, zero_val);
+ res2_f = _mm_cmpge_ps(a2_val, zero_val);
+ res3_f = _mm_cmpge_ps(a3_val, zero_val);
+
+ // convert to 32i and >> 31
+ res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
+ res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
+ res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
+ res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);
+
+ // pack into 16-bit results
+ res0_i = _mm_packs_epi32(res0_i, res1_i);
+ res2_i = _mm_packs_epi32(res2_i, res3_i);
+
+ // pack into 8-bit results
+ res0_i = _mm_packs_epi16(res0_i, res2_i);
+
+ _mm_storeu_si128((__m128i*)cPtr, res0_i);
+
+ cPtr += 16;
+ aPtr += 16;
}
- else {
- *cPtr++ = 0;
+
+ for (number = n16points * 16; number < num_points; number++) {
+ if (*aPtr++ >= 0) {
+ *cPtr++ = 1;
+ } else {
+ *cPtr++ = 0;
+ }
}
- }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_32f_binary_slicer_8i_neon(int8_t* cVector, const float* aVector,
- unsigned int num_points)
+static inline void volk_32f_binary_slicer_8i_neon(int8_t* cVector,
+ const float* aVector,
+ unsigned int num_points)
{
- int8_t* cPtr = cVector;
- const float* aPtr = aVector;
- unsigned int number = 0;
- unsigned int n16points = num_points / 16;
-
- float32x4x2_t input_val0, input_val1;
- float32x4_t zero_val;
- uint32x4x2_t res0_u32, res1_u32;
- uint16x4x2_t res0_u16x4, res1_u16x4;
- uint16x8x2_t res_u16x8;
- uint8x8x2_t res_u8;
- uint8x8_t one;
-
- zero_val = vdupq_n_f32(0.0);
- one = vdup_n_u8(0x01);
-
- // TODO: this is a good candidate for asm because the vcombines
- // can be eliminated simply by picking dst registers that are
- // adjacent.
- for(number = 0; number < n16points; number++) {
- input_val0 = vld2q_f32(aPtr);
- input_val1 = vld2q_f32(aPtr+8);
-
- // test against 0; return uint32
- res0_u32.val[0] = vcgeq_f32(input_val0.val[0], zero_val);
- res0_u32.val[1] = vcgeq_f32(input_val0.val[1], zero_val);
- res1_u32.val[0] = vcgeq_f32(input_val1.val[0], zero_val);
- res1_u32.val[1] = vcgeq_f32(input_val1.val[1], zero_val);
-
- // narrow uint32 -> uint16 followed by combine to 8-element vectors
- res0_u16x4.val[0] = vmovn_u32(res0_u32.val[0]);
- res0_u16x4.val[1] = vmovn_u32(res0_u32.val[1]);
- res1_u16x4.val[0] = vmovn_u32(res1_u32.val[0]);
- res1_u16x4.val[1] = vmovn_u32(res1_u32.val[1]);
-
- res_u16x8.val[0] = vcombine_u16(res0_u16x4.val[0], res1_u16x4.val[0]);
- res_u16x8.val[1] = vcombine_u16(res0_u16x4.val[1], res1_u16x4.val[1]);
-
- // narrow uint16x8 -> uint8x8
- res_u8.val[0] = vmovn_u16(res_u16x8.val[0]);
- res_u8.val[1] = vmovn_u16(res_u16x8.val[1]);
- // we *could* load twice as much data and do another vcombine here
- // to get a uint8x16x2 vector, still only do 2 vandqs and a single store
- // but that turns out to be ~16% slower than this version on zc702
- // it's possible register contention in GCC scheduler slows it down
- // and a hand-written asm with quad-word u8 registers is much faster.
-
- res_u8.val[0] = vand_u8(one, res_u8.val[0]);
- res_u8.val[1] = vand_u8(one, res_u8.val[1]);
-
- vst2_u8((unsigned char*)cPtr, res_u8);
- cPtr += 16;
- aPtr += 16;
-
- }
-
- for(number = n16points * 16; number < num_points; number++) {
- if(*aPtr++ >= 0) {
- *cPtr++ = 1;
+ int8_t* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+ unsigned int n16points = num_points / 16;
+
+ float32x4x2_t input_val0, input_val1;
+ float32x4_t zero_val;
+ uint32x4x2_t res0_u32, res1_u32;
+ uint16x4x2_t res0_u16x4, res1_u16x4;
+ uint16x8x2_t res_u16x8;
+ uint8x8x2_t res_u8;
+ uint8x8_t one;
+
+ zero_val = vdupq_n_f32(0.0);
+ one = vdup_n_u8(0x01);
+
+ // TODO: this is a good candidate for asm because the vcombines
+ // can be eliminated simply by picking dst registers that are
+ // adjacent.
+ for (number = 0; number < n16points; number++) {
+ input_val0 = vld2q_f32(aPtr);
+ input_val1 = vld2q_f32(aPtr + 8);
+
+ // test against 0; return uint32
+ res0_u32.val[0] = vcgeq_f32(input_val0.val[0], zero_val);
+ res0_u32.val[1] = vcgeq_f32(input_val0.val[1], zero_val);
+ res1_u32.val[0] = vcgeq_f32(input_val1.val[0], zero_val);
+ res1_u32.val[1] = vcgeq_f32(input_val1.val[1], zero_val);
+
+ // narrow uint32 -> uint16 followed by combine to 8-element vectors
+ res0_u16x4.val[0] = vmovn_u32(res0_u32.val[0]);
+ res0_u16x4.val[1] = vmovn_u32(res0_u32.val[1]);
+ res1_u16x4.val[0] = vmovn_u32(res1_u32.val[0]);
+ res1_u16x4.val[1] = vmovn_u32(res1_u32.val[1]);
+
+ res_u16x8.val[0] = vcombine_u16(res0_u16x4.val[0], res1_u16x4.val[0]);
+ res_u16x8.val[1] = vcombine_u16(res0_u16x4.val[1], res1_u16x4.val[1]);
+
+ // narrow uint16x8 -> uint8x8
+ res_u8.val[0] = vmovn_u16(res_u16x8.val[0]);
+ res_u8.val[1] = vmovn_u16(res_u16x8.val[1]);
+ // we *could* load twice as much data and do another vcombine here
+ // to get a uint8x16x2 vector, and still do only 2 vandqs and a single store,
+ // but that turns out to be ~16% slower than this version on zc702.
+ // It's possible register contention in the GCC scheduler slows it down;
+ // a hand-written asm version with quad-word u8 registers is much faster.
+
+ res_u8.val[0] = vand_u8(one, res_u8.val[0]);
+ res_u8.val[1] = vand_u8(one, res_u8.val[1]);
+
+ vst2_u8((unsigned char*)cPtr, res_u8);
+ cPtr += 16;
+ aPtr += 16;
}
- else {
- *cPtr++ = 0;
+
+ for (number = n16points * 16; number < num_points; number++) {
+ if (*aPtr++ >= 0) {
+ *cPtr++ = 1;
+ } else {
+ *cPtr++ = 0;
+ }
}
- }
}
#endif /* LV_HAVE_NEON */
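One note on the NEON kernel above, as I read the intrinsics:

```c
/* Illustrative only: vld2q_f32 de-interleaves, so val[0] = {x0, x2, x4, x6}
 * and val[1] = {x1, x3, x5, x7}; vst2_u8 interleaves the two result vectors
 * back on store, so the output order matches the input order even though the
 * even- and odd-indexed samples travel through separate registers. */
```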
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_convert_64f(double* outputVector, const float* inputVector, unsigned int num_points)
- * \endcode
+ * void volk_32f_convert_64f(double* outputVector, const float* inputVector, unsigned int
+ * num_points)
+ * \endcode
*
* \b Inputs
* \li inputVector: The vector of floats to convert to doubles.
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void volk_32f_convert_64f_u_avx(double* outputVector, const float* inputVector, unsigned int num_points){
- unsigned int number = 0;
+static inline void volk_32f_convert_64f_u_avx(double* outputVector,
+ const float* inputVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ const unsigned int quarterPoints = num_points / 4;
- const float* inputVectorPtr = (const float*)inputVector;
- double* outputVectorPtr = outputVector;
- __m256d ret;
- __m128 inputVal;
+ const float* inputVectorPtr = (const float*)inputVector;
+ double* outputVectorPtr = outputVector;
+ __m256d ret;
+ __m128 inputVal;
- for(;number < quarterPoints; number++){
- inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+ for (; number < quarterPoints; number++) {
+ inputVal = _mm_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 4;
- ret = _mm256_cvtps_pd(inputVal);
- _mm256_storeu_pd(outputVectorPtr, ret);
+ ret = _mm256_cvtps_pd(inputVal);
+ _mm256_storeu_pd(outputVectorPtr, ret);
- outputVectorPtr += 4;
- }
+ outputVectorPtr += 4;
+ }
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- outputVector[number] = (double)(inputVector[number]);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ outputVector[number] = (double)(inputVector[number]);
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void volk_32f_convert_64f_u_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
- unsigned int number = 0;
+static inline void volk_32f_convert_64f_u_sse2(double* outputVector,
+ const float* inputVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ const unsigned int quarterPoints = num_points / 4;
- const float* inputVectorPtr = (const float*)inputVector;
- double* outputVectorPtr = outputVector;
- __m128d ret;
- __m128 inputVal;
+ const float* inputVectorPtr = (const float*)inputVector;
+ double* outputVectorPtr = outputVector;
+ __m128d ret;
+ __m128 inputVal;
- for(;number < quarterPoints; number++){
- inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+ for (; number < quarterPoints; number++) {
+ inputVal = _mm_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 4;
- ret = _mm_cvtps_pd(inputVal);
+ ret = _mm_cvtps_pd(inputVal);
- _mm_storeu_pd(outputVectorPtr, ret);
- outputVectorPtr += 2;
+ _mm_storeu_pd(outputVectorPtr, ret);
+ outputVectorPtr += 2;
- inputVal = _mm_movehl_ps(inputVal, inputVal);
+ inputVal = _mm_movehl_ps(inputVal, inputVal);
- ret = _mm_cvtps_pd(inputVal);
+ ret = _mm_cvtps_pd(inputVal);
- _mm_storeu_pd(outputVectorPtr, ret);
- outputVectorPtr += 2;
- }
+ _mm_storeu_pd(outputVectorPtr, ret);
+ outputVectorPtr += 2;
+ }
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- outputVector[number] = (double)(inputVector[number]);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ outputVector[number] = (double)(inputVector[number]);
+ }
}
#endif /* LV_HAVE_SSE2 */
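A short note on the two-step conversion above (my reading of the intrinsics, not from the original sources):

```c
/* _mm_cvtps_pd widens only the two low floats of its source, so each loaded
 * group of four is converted in halves: floats 0-1 first, then
 * _mm_movehl_ps(v, v) copies floats 2-3 into the low half for the second
 * conversion and store. */
```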
#ifdef LV_HAVE_GENERIC
-static inline void volk_32f_convert_64f_generic(double* outputVector, const float* inputVector, unsigned int num_points){
- double* outputVectorPtr = outputVector;
- const float* inputVectorPtr = inputVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((double)(*inputVectorPtr++));
- }
+static inline void volk_32f_convert_64f_generic(double* outputVector,
+ const float* inputVector,
+ unsigned int num_points)
+{
+ double* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *outputVectorPtr++ = ((double)(*inputVectorPtr++));
+ }
}
#endif /* LV_HAVE_GENERIC */
-
-
#endif /* INCLUDED_volk_32f_convert_64f_u_H */
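A usage sketch for the conversion dispatcher (pointer names illustrative; the `*_u` kernels above accept any alignment, while the `*_a` kernels below expect VOLK-aligned buffers):

```c
#include <volk/volk.h>

/* Widen n floats to doubles through the dispatcher. */
static void convert_example(const float* in, double* out, unsigned int n)
{
    volk_32f_convert_64f(out, in, n);
}
```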
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void volk_32f_convert_64f_a_avx(double* outputVector, const float* inputVector, unsigned int num_points){
- unsigned int number = 0;
+static inline void volk_32f_convert_64f_a_avx(double* outputVector,
+ const float* inputVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ const unsigned int quarterPoints = num_points / 4;
- const float* inputVectorPtr = (const float*)inputVector;
- double* outputVectorPtr = outputVector;
- __m256d ret;
- __m128 inputVal;
+ const float* inputVectorPtr = (const float*)inputVector;
+ double* outputVectorPtr = outputVector;
+ __m256d ret;
+ __m128 inputVal;
- for(;number < quarterPoints; number++){
- inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+ for (; number < quarterPoints; number++) {
+ inputVal = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
- ret = _mm256_cvtps_pd(inputVal);
- _mm256_store_pd(outputVectorPtr, ret);
+ ret = _mm256_cvtps_pd(inputVal);
+ _mm256_store_pd(outputVectorPtr, ret);
- outputVectorPtr += 4;
- }
+ outputVectorPtr += 4;
+ }
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- outputVector[number] = (double)(inputVector[number]);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ outputVector[number] = (double)(inputVector[number]);
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void volk_32f_convert_64f_a_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
- unsigned int number = 0;
+static inline void volk_32f_convert_64f_a_sse2(double* outputVector,
+ const float* inputVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ const unsigned int quarterPoints = num_points / 4;
- const float* inputVectorPtr = (const float*)inputVector;
- double* outputVectorPtr = outputVector;
- __m128d ret;
- __m128 inputVal;
+ const float* inputVectorPtr = (const float*)inputVector;
+ double* outputVectorPtr = outputVector;
+ __m128d ret;
+ __m128 inputVal;
- for(;number < quarterPoints; number++){
- inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+ for (; number < quarterPoints; number++) {
+ inputVal = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
- ret = _mm_cvtps_pd(inputVal);
+ ret = _mm_cvtps_pd(inputVal);
- _mm_store_pd(outputVectorPtr, ret);
- outputVectorPtr += 2;
+ _mm_store_pd(outputVectorPtr, ret);
+ outputVectorPtr += 2;
- inputVal = _mm_movehl_ps(inputVal, inputVal);
+ inputVal = _mm_movehl_ps(inputVal, inputVal);
- ret = _mm_cvtps_pd(inputVal);
+ ret = _mm_cvtps_pd(inputVal);
- _mm_store_pd(outputVectorPtr, ret);
- outputVectorPtr += 2;
- }
+ _mm_store_pd(outputVectorPtr, ret);
+ outputVectorPtr += 2;
+ }
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- outputVector[number] = (double)(inputVector[number]);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ outputVector[number] = (double)(inputVector[number]);
+ }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
-static inline void volk_32f_convert_64f_a_generic(double* outputVector, const float* inputVector, unsigned int num_points){
- double* outputVectorPtr = outputVector;
- const float* inputVectorPtr = inputVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((double)(*inputVectorPtr++));
- }
+static inline void volk_32f_convert_64f_a_generic(double* outputVector,
+ const float* inputVector,
+ unsigned int num_points)
+{
+ double* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *outputVectorPtr++ = ((double)(*inputVectorPtr++));
+ }
}
#endif /* LV_HAVE_GENERIC */
-
-
#endif /* INCLUDED_volk_32f_convert_64f_a_H */
* \endcode
*/
-#include <stdio.h>
-#include <math.h>
#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
#ifndef INCLUDED_volk_32f_cos_32f_a_H
#define INCLUDED_volk_32f_cos_32f_a_H
#include <immintrin.h>
static inline void
- volk_32f_cos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+volk_32f_cos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int eighthPoints = num_points / 8;
- unsigned int i = 0;
-
- __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
- __m256 sine, cosine;
- __m256i q, ones, twos, fours;
-
- m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
- pio4A = _mm256_set1_ps(0.7853981554508209228515625);
- pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
- pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
- ffours = _mm256_set1_ps(4.0);
- ftwos = _mm256_set1_ps(2.0);
- fones = _mm256_set1_ps(1.0);
- fzeroes = _mm256_setzero_ps();
- __m256i zeroes = _mm256_set1_epi32(0);
- ones = _mm256_set1_epi32(1);
- __m256i allones = _mm256_set1_epi32(0xffffffff);
- twos = _mm256_set1_epi32(2);
- fours = _mm256_set1_epi32(4);
-
- cp1 = _mm256_set1_ps(1.0);
- cp2 = _mm256_set1_ps(0.08333333333333333);
- cp3 = _mm256_set1_ps(0.002777777777777778);
- cp4 = _mm256_set1_ps(4.96031746031746e-05);
- cp5 = _mm256_set1_ps(5.511463844797178e-07);
- union bit256 condition1;
- union bit256 condition3;
-
- for(;number < eighthPoints; number++){
-
- aVal = _mm256_load_ps(aPtr);
- // s = fabs(aVal)
- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
- // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
- // r = q + q&1, q indicates quadrant, r gives
- r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
-
- s = _mm256_fnmadd_ps(r,pio4A,s);
- s = _mm256_fnmadd_ps(r,pio4B,s);
- s = _mm256_fnmadd_ps(r,pio4C,s);
-
- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
- s = _mm256_mul_ps(s, s);
- // Evaluate Taylor series
- s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s);
-
- for(i = 0; i < 3; i++)
- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
- s = _mm256_div_ps(s, ftwos);
-
- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
- cosine = _mm256_sub_ps(fones, s);
-
- // if(((q+1)&2) != 0) { cosine=sine;}
- condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
- condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
-
- // if(((q+2)&4) != 0) { cosine = -cosine;}
- condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
- condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
-
- cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec));
- _mm256_store_ps(bPtr, cosine);
- aPtr += 8;
- bPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *bPtr++ = cos(*aPtr++);
- }
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ unsigned int i = 0;
+
+ __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
+ fones, fzeroes;
+ __m256 sine, cosine;
+ __m256i q, ones, twos, fours;
+
+ m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
+ pio4A = _mm256_set1_ps(0.7853981554508209228515625);
+ pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
+ pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
+ ffours = _mm256_set1_ps(4.0);
+ ftwos = _mm256_set1_ps(2.0);
+ fones = _mm256_set1_ps(1.0);
+ fzeroes = _mm256_setzero_ps();
+ __m256i zeroes = _mm256_set1_epi32(0);
+ ones = _mm256_set1_epi32(1);
+ __m256i allones = _mm256_set1_epi32(0xffffffff);
+ twos = _mm256_set1_epi32(2);
+ fours = _mm256_set1_epi32(4);
+
+ cp1 = _mm256_set1_ps(1.0);
+ cp2 = _mm256_set1_ps(0.08333333333333333);
+ cp3 = _mm256_set1_ps(0.002777777777777778);
+ cp4 = _mm256_set1_ps(4.96031746031746e-05);
+ cp5 = _mm256_set1_ps(5.511463844797178e-07);
+ union bit256 condition1;
+ union bit256 condition3;
+
+ for (; number < eighthPoints; number++) {
+
+ aVal = _mm256_load_ps(aPtr);
+ // s = fabs(aVal)
+ s = _mm256_sub_ps(aVal,
+ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+ // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
+ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+ // r = q + q&1, q indicates quadrant, r*pi/4 is the nearest multiple of pi/2
+ r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
+
+ s = _mm256_fnmadd_ps(r, pio4A, s);
+ s = _mm256_fnmadd_ps(r, pio4B, s);
+ s = _mm256_fnmadd_ps(r, pio4C, s);
+
+ s = _mm256_div_ps(
+ s,
+ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm256_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm256_mul_ps(
+ _mm256_fmadd_ps(
+ _mm256_fmsub_ps(
+ _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
+ s,
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++)
+ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ s = _mm256_div_ps(s, ftwos);
+
+ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+ cosine = _mm256_sub_ps(fones, s);
+
+ // if(((q+1)&2) != 0) { cosine=sine;}
+ condition1.int_vec =
+ _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
+ condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
+
+ // if(((q+2)&4) != 0) { cosine = -cosine;}
+ condition3.int_vec = _mm256_cmpeq_epi32(
+ _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
+ condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
+
+ cosine = _mm256_add_ps(
+ cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
+ cosine = _mm256_sub_ps(cosine,
+ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
+ condition3.float_vec));
+ _mm256_store_ps(bPtr, cosine);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = cos(*aPtr++);
+ }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
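For reference, a scalar restatement of the reduction and series the cos kernels above and below vectorize. The constants are copied from the kernel; the sketch is written in double for clarity and is illustrative only.

```c
#include <math.h>

static float cos_scalar_model(float in)
{
    const double m4pi = 1.273239544735162542821171882678754627704620361328125; /* 4/pi */
    const double pio4A = 0.7853981554508209228515625; /* pi/4 split into three parts */
    const double pio4B = 0.794662735614792836713604629039764404296875e-8;
    const double pio4C = 0.306161699786838294306516483068750264552437361480769e-16;

    double s = fabs((double)in);
    int q = (int)floor(s * m4pi);     /* octant index */
    double r = (double)(q + (q & 1)); /* r*pi/4 is the nearest multiple of pi/2 */
    s -= r * pio4A;                   /* three-part subtraction keeps the reduction accurate */
    s -= r * pio4B;
    s -= r * pio4C;

    s /= 8.0; /* scale down so the short series below is enough (undone later) */
    s *= s;
    /* series for 2*(1 - cos(z)) in powers of z^2, matching cp1..cp5 in the kernel */
    s = ((((s / 1814400.0 - 1.0 / 20160.0) * s + 1.0 / 360.0) * s - 1.0 / 12.0) * s + 1.0) * s;
    for (int i = 0; i < 3; i++) /* each step doubles the angle: 2(1-cos z) -> 2(1-cos 2z) */
        s = s * (4.0 - s);
    s /= 2.0; /* s = 1 - cos(reduced argument) */

    double sine = sqrt((2.0 - s) * s); /* |sin| of the reduced argument */
    double cosine = 1.0 - s;
    if (((q + 1) & 2) != 0)
        cosine = sine; /* quadrants where the cosine comes from the sine of the reduced angle */
    if (((q + 2) & 4) != 0)
        cosine = -cosine; /* quadrants where the sign flips */
    return (float)cosine;
}
```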
#include <immintrin.h>
static inline void
- volk_32f_cos_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
+volk_32f_cos_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int eighthPoints = num_points / 8;
- unsigned int i = 0;
-
- __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
- __m256 sine, cosine;
- __m256i q, ones, twos, fours;
-
- m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
- pio4A = _mm256_set1_ps(0.7853981554508209228515625);
- pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
- pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
- ffours = _mm256_set1_ps(4.0);
- ftwos = _mm256_set1_ps(2.0);
- fones = _mm256_set1_ps(1.0);
- fzeroes = _mm256_setzero_ps();
- __m256i zeroes = _mm256_set1_epi32(0);
- ones = _mm256_set1_epi32(1);
- __m256i allones = _mm256_set1_epi32(0xffffffff);
- twos = _mm256_set1_epi32(2);
- fours = _mm256_set1_epi32(4);
-
- cp1 = _mm256_set1_ps(1.0);
- cp2 = _mm256_set1_ps(0.08333333333333333);
- cp3 = _mm256_set1_ps(0.002777777777777778);
- cp4 = _mm256_set1_ps(4.96031746031746e-05);
- cp5 = _mm256_set1_ps(5.511463844797178e-07);
- union bit256 condition1;
- union bit256 condition3;
-
- for(;number < eighthPoints; number++){
-
- aVal = _mm256_load_ps(aPtr);
- // s = fabs(aVal)
- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
- // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
- // r = q + q&1, q indicates quadrant, r gives
- r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
-
- s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4A));
- s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4B));
- s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4C));
-
- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
- s = _mm256_mul_ps(s, s);
- // Evaluate Taylor series
- s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-
- for(i = 0; i < 3; i++)
- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
- s = _mm256_div_ps(s, ftwos);
-
- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
- cosine = _mm256_sub_ps(fones, s);
-
- // if(((q+1)&2) != 0) { cosine=sine;}
- condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
- condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
-
- // if(((q+2)&4) != 0) { cosine = -cosine;}
- condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
- condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
-
- cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec));
- _mm256_store_ps(bPtr, cosine);
- aPtr += 8;
- bPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *bPtr++ = cos(*aPtr++);
- }
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ unsigned int i = 0;
+
+ __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
+ fones, fzeroes;
+ __m256 sine, cosine;
+ __m256i q, ones, twos, fours;
+
+ m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
+ pio4A = _mm256_set1_ps(0.7853981554508209228515625);
+ pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
+ pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
+ ffours = _mm256_set1_ps(4.0);
+ ftwos = _mm256_set1_ps(2.0);
+ fones = _mm256_set1_ps(1.0);
+ fzeroes = _mm256_setzero_ps();
+ __m256i zeroes = _mm256_set1_epi32(0);
+ ones = _mm256_set1_epi32(1);
+ __m256i allones = _mm256_set1_epi32(0xffffffff);
+ twos = _mm256_set1_epi32(2);
+ fours = _mm256_set1_epi32(4);
+
+ cp1 = _mm256_set1_ps(1.0);
+ cp2 = _mm256_set1_ps(0.08333333333333333);
+ cp3 = _mm256_set1_ps(0.002777777777777778);
+ cp4 = _mm256_set1_ps(4.96031746031746e-05);
+ cp5 = _mm256_set1_ps(5.511463844797178e-07);
+ union bit256 condition1;
+ union bit256 condition3;
+
+ for (; number < eighthPoints; number++) {
+
+ aVal = _mm256_load_ps(aPtr);
+ // s = fabs(aVal)
+ s = _mm256_sub_ps(aVal,
+ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+ // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
+ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+ // r = q + q&1, q indicates quadrant, r*pi/4 is the nearest multiple of pi/2
+ r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
+
+ s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
+ s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
+ s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
+
+ s = _mm256_div_ps(
+ s,
+ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm256_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm256_mul_ps(
+ _mm256_add_ps(
+ _mm256_mul_ps(
+ _mm256_sub_ps(
+ _mm256_mul_ps(
+ _mm256_add_ps(
+ _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
+ s),
+ cp3),
+ s),
+ cp2),
+ s),
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++)
+ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ s = _mm256_div_ps(s, ftwos);
+
+ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+ cosine = _mm256_sub_ps(fones, s);
+
+ // if(((q+1)&2) != 0) { cosine=sine;}
+ condition1.int_vec =
+ _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
+ condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
+
+ // if(((q+2)&4) != 0) { cosine = -cosine;}
+ condition3.int_vec = _mm256_cmpeq_epi32(
+ _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
+ condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
+
+ cosine = _mm256_add_ps(
+ cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
+ cosine = _mm256_sub_ps(cosine,
+ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
+ condition3.float_vec));
+ _mm256_store_ps(bPtr, cosine);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = cos(*aPtr++);
+ }
}
#endif /* LV_HAVE_AVX2 for aligned */
#include <smmintrin.h>
static inline void
- volk_32f_cos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+volk_32f_cos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int quarterPoints = num_points / 4;
- unsigned int i = 0;
-
- __m128 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
- __m128 sine, cosine;
- __m128i q, ones, twos, fours;
-
- m4pi = _mm_set1_ps(1.273239544735162542821171882678754627704620361328125);
- pio4A = _mm_set1_ps(0.7853981554508209228515625);
- pio4B = _mm_set1_ps(0.794662735614792836713604629039764404296875e-8);
- pio4C = _mm_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
- ffours = _mm_set1_ps(4.0);
- ftwos = _mm_set1_ps(2.0);
- fones = _mm_set1_ps(1.0);
- fzeroes = _mm_setzero_ps();
- __m128i zeroes = _mm_set1_epi32(0);
- ones = _mm_set1_epi32(1);
- __m128i allones = _mm_set1_epi32(0xffffffff);
- twos = _mm_set1_epi32(2);
- fours = _mm_set1_epi32(4);
-
- cp1 = _mm_set1_ps(1.0);
- cp2 = _mm_set1_ps(0.08333333333333333);
- cp3 = _mm_set1_ps(0.002777777777777778);
- cp4 = _mm_set1_ps(4.96031746031746e-05);
- cp5 = _mm_set1_ps(5.511463844797178e-07);
- union bit128 condition1;
- union bit128 condition3;
-
- for(;number < quarterPoints; number++){
-
- aVal = _mm_load_ps(aPtr);
- // s = fabs(aVal)
- s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
- // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
- q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
- // r = q + q&1, q indicates quadrant, r gives
- r = _mm_cvtepi32_ps(_mm_add_epi32(q, _mm_and_si128(q, ones)));
-
- s = _mm_sub_ps(s, _mm_mul_ps(r, pio4A));
- s = _mm_sub_ps(s, _mm_mul_ps(r, pio4B));
- s = _mm_sub_ps(s, _mm_mul_ps(r, pio4C));
-
- s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
- s = _mm_mul_ps(s, s);
- // Evaluate Taylor series
- s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-
- for(i = 0; i < 3; i++)
- s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
- s = _mm_div_ps(s, ftwos);
-
- sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
- cosine = _mm_sub_ps(fones, s);
-
- // if(((q+1)&2) != 0) { cosine=sine;}
- condition1.int_vec = _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, ones), twos), zeroes);
- condition1.int_vec = _mm_xor_si128(allones, condition1.int_vec);
-
- // if(((q+2)&4) != 0) { cosine = -cosine;}
- condition3.int_vec = _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, twos), fours), zeroes);
- condition3.int_vec = _mm_xor_si128(allones, condition3.int_vec);
-
- cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1.float_vec));
- cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3.float_vec));
- _mm_store_ps(bPtr, cosine);
- aPtr += 4;
- bPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *bPtr++ = cosf(*aPtr++);
- }
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ unsigned int i = 0;
+
+ __m128 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
+ fones, fzeroes;
+ __m128 sine, cosine;
+ __m128i q, ones, twos, fours;
+
+ m4pi = _mm_set1_ps(1.273239544735162542821171882678754627704620361328125);
+ pio4A = _mm_set1_ps(0.7853981554508209228515625);
+ pio4B = _mm_set1_ps(0.794662735614792836713604629039764404296875e-8);
+ pio4C = _mm_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
+ ffours = _mm_set1_ps(4.0);
+ ftwos = _mm_set1_ps(2.0);
+ fones = _mm_set1_ps(1.0);
+ fzeroes = _mm_setzero_ps();
+ __m128i zeroes = _mm_set1_epi32(0);
+ ones = _mm_set1_epi32(1);
+ __m128i allones = _mm_set1_epi32(0xffffffff);
+ twos = _mm_set1_epi32(2);
+ fours = _mm_set1_epi32(4);
+
+ cp1 = _mm_set1_ps(1.0);
+ cp2 = _mm_set1_ps(0.08333333333333333);
+ cp3 = _mm_set1_ps(0.002777777777777778);
+ cp4 = _mm_set1_ps(4.96031746031746e-05);
+ cp5 = _mm_set1_ps(5.511463844797178e-07);
+ union bit128 condition1;
+ union bit128 condition3;
+
+ for (; number < quarterPoints; number++) {
+
+ aVal = _mm_load_ps(aPtr);
+ // s = fabs(aVal)
+ s = _mm_sub_ps(aVal,
+ _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
+ // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
+ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
+ // r = q + q&1, q indicates quadrant, r*pi/4 is the nearest multiple of pi/2
+ r = _mm_cvtepi32_ps(_mm_add_epi32(q, _mm_and_si128(q, ones)));
+
+ s = _mm_sub_ps(s, _mm_mul_ps(r, pio4A));
+ s = _mm_sub_ps(s, _mm_mul_ps(r, pio4B));
+ s = _mm_sub_ps(s, _mm_mul_ps(r, pio4C));
+
+ s = _mm_div_ps(
+ s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm_mul_ps(
+ _mm_add_ps(
+ _mm_mul_ps(
+ _mm_sub_ps(
+ _mm_mul_ps(
+ _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
+ cp3),
+ s),
+ cp2),
+ s),
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++)
+ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+ s = _mm_div_ps(s, ftwos);
+
+ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+ cosine = _mm_sub_ps(fones, s);
+
+ // if(((q+1)&2) != 0) { cosine=sine;}
+ condition1.int_vec =
+ _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, ones), twos), zeroes);
+ condition1.int_vec = _mm_xor_si128(allones, condition1.int_vec);
+
+ // if(((q+2)&4) != 0) { cosine = -cosine;}
+ condition3.int_vec =
+ _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, twos), fours), zeroes);
+ condition3.int_vec = _mm_xor_si128(allones, condition3.int_vec);
+
+ cosine = _mm_add_ps(cosine,
+ _mm_and_ps(_mm_sub_ps(sine, cosine), condition1.float_vec));
+ cosine = _mm_sub_ps(
+ cosine,
+ _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3.float_vec));
+ _mm_store_ps(bPtr, cosine);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *bPtr++ = cosf(*aPtr++);
+ }
}
#endif /* LV_HAVE_SSE4_1 for aligned */
#endif /* INCLUDED_volk_32f_cos_32f_a_H */
-
#ifndef INCLUDED_volk_32f_cos_32f_u_H
#define INCLUDED_volk_32f_cos_32f_u_H
#include <immintrin.h>
static inline void
- volk_32f_cos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+volk_32f_cos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int eighthPoints = num_points / 8;
- unsigned int i = 0;
-
- __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
- __m256 sine, cosine;
- __m256i q, ones, twos, fours;
-
- m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
- pio4A = _mm256_set1_ps(0.7853981554508209228515625);
- pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
- pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
- ffours = _mm256_set1_ps(4.0);
- ftwos = _mm256_set1_ps(2.0);
- fones = _mm256_set1_ps(1.0);
- fzeroes = _mm256_setzero_ps();
- __m256i zeroes = _mm256_set1_epi32(0);
- ones = _mm256_set1_epi32(1);
- __m256i allones = _mm256_set1_epi32(0xffffffff);
- twos = _mm256_set1_epi32(2);
- fours = _mm256_set1_epi32(4);
-
- cp1 = _mm256_set1_ps(1.0);
- cp2 = _mm256_set1_ps(0.08333333333333333);
- cp3 = _mm256_set1_ps(0.002777777777777778);
- cp4 = _mm256_set1_ps(4.96031746031746e-05);
- cp5 = _mm256_set1_ps(5.511463844797178e-07);
- union bit256 condition1;
- union bit256 condition3;
-
- for(;number < eighthPoints; number++){
-
- aVal = _mm256_loadu_ps(aPtr);
- // s = fabs(aVal)
- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
- // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
- // r = q + q&1, q indicates quadrant, r gives
- r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
-
- s = _mm256_fnmadd_ps(r,pio4A,s);
- s = _mm256_fnmadd_ps(r,pio4B,s);
- s = _mm256_fnmadd_ps(r,pio4C,s);
-
- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
- s = _mm256_mul_ps(s, s);
- // Evaluate Taylor series
- s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s);
-
- for(i = 0; i < 3; i++)
- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
- s = _mm256_div_ps(s, ftwos);
-
- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
- cosine = _mm256_sub_ps(fones, s);
-
- // if(((q+1)&2) != 0) { cosine=sine;}
- condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
- condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
-
- // if(((q+2)&4) != 0) { cosine = -cosine;}
- condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
- condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
-
- cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec));
- _mm256_storeu_ps(bPtr, cosine);
- aPtr += 8;
- bPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *bPtr++ = cos(*aPtr++);
- }
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ unsigned int i = 0;
+
+ __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
+ fones, fzeroes;
+ __m256 sine, cosine;
+ __m256i q, ones, twos, fours;
+
+ m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
+ pio4A = _mm256_set1_ps(0.7853981554508209228515625);
+ pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
+ pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
+ ffours = _mm256_set1_ps(4.0);
+ ftwos = _mm256_set1_ps(2.0);
+ fones = _mm256_set1_ps(1.0);
+ fzeroes = _mm256_setzero_ps();
+ __m256i zeroes = _mm256_set1_epi32(0);
+ ones = _mm256_set1_epi32(1);
+ __m256i allones = _mm256_set1_epi32(0xffffffff);
+ twos = _mm256_set1_epi32(2);
+ fours = _mm256_set1_epi32(4);
+
+ cp1 = _mm256_set1_ps(1.0);
+ cp2 = _mm256_set1_ps(0.08333333333333333);
+ cp3 = _mm256_set1_ps(0.002777777777777778);
+ cp4 = _mm256_set1_ps(4.96031746031746e-05);
+ cp5 = _mm256_set1_ps(5.511463844797178e-07);
+ union bit256 condition1;
+ union bit256 condition3;
+
+ for (; number < eighthPoints; number++) {
+
+ aVal = _mm256_loadu_ps(aPtr);
+ // s = fabs(aVal)
+ s = _mm256_sub_ps(aVal,
+ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+ // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
+ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+ // r = q + q&1; q gives the quadrant, r*pi/4 is the nearest multiple of pi/2
+ r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
+
+ s = _mm256_fnmadd_ps(r, pio4A, s);
+ s = _mm256_fnmadd_ps(r, pio4B, s);
+ s = _mm256_fnmadd_ps(r, pio4C, s);
+
+ s = _mm256_div_ps(
+ s,
+ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm256_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm256_mul_ps(
+ _mm256_fmadd_ps(
+ _mm256_fmsub_ps(
+ _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
+ s,
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++)
+ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ s = _mm256_div_ps(s, ftwos);
+
+ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+ cosine = _mm256_sub_ps(fones, s);
+
+ // if(((q+1)&2) != 0) { cosine=sine;}
+ condition1.int_vec =
+ _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
+ condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
+
+ // if(((q+2)&4) != 0) { cosine = -cosine;}
+ condition3.int_vec = _mm256_cmpeq_epi32(
+ _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
+ condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
+
+ cosine = _mm256_add_ps(
+ cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
+ cosine = _mm256_sub_ps(cosine,
+ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
+ condition3.float_vec));
+ _mm256_storeu_ps(bPtr, cosine);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = cos(*aPtr++);
+ }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
#include <immintrin.h>
static inline void
- volk_32f_cos_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
+volk_32f_cos_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int eighthPoints = num_points / 8;
- unsigned int i = 0;
-
- __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
- __m256 sine, cosine;
- __m256i q, ones, twos, fours;
-
- m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
- pio4A = _mm256_set1_ps(0.7853981554508209228515625);
- pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
- pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
- ffours = _mm256_set1_ps(4.0);
- ftwos = _mm256_set1_ps(2.0);
- fones = _mm256_set1_ps(1.0);
- fzeroes = _mm256_setzero_ps();
- __m256i zeroes = _mm256_set1_epi32(0);
- ones = _mm256_set1_epi32(1);
- __m256i allones = _mm256_set1_epi32(0xffffffff);
- twos = _mm256_set1_epi32(2);
- fours = _mm256_set1_epi32(4);
-
- cp1 = _mm256_set1_ps(1.0);
- cp2 = _mm256_set1_ps(0.08333333333333333);
- cp3 = _mm256_set1_ps(0.002777777777777778);
- cp4 = _mm256_set1_ps(4.96031746031746e-05);
- cp5 = _mm256_set1_ps(5.511463844797178e-07);
- union bit256 condition1;
- union bit256 condition3;
-
- for(;number < eighthPoints; number++){
-
- aVal = _mm256_loadu_ps(aPtr);
- // s = fabs(aVal)
- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
- // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
- // r = q + q&1, q indicates quadrant, r gives
- r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
-
- s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4A));
- s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4B));
- s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4C));
-
- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
- s = _mm256_mul_ps(s, s);
- // Evaluate Taylor series
- s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-
- for(i = 0; i < 3; i++)
- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
- s = _mm256_div_ps(s, ftwos);
-
- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
- cosine = _mm256_sub_ps(fones, s);
-
- // if(((q+1)&2) != 0) { cosine=sine;}
- condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
- condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
-
- // if(((q+2)&4) != 0) { cosine = -cosine;}
- condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
- condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
-
- cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec));
- _mm256_storeu_ps(bPtr, cosine);
- aPtr += 8;
- bPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *bPtr++ = cos(*aPtr++);
- }
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ unsigned int i = 0;
+
+ __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
+ fones, fzeroes;
+ __m256 sine, cosine;
+ __m256i q, ones, twos, fours;
+
+ m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
+ pio4A = _mm256_set1_ps(0.7853981554508209228515625);
+ pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
+ pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
+ ffours = _mm256_set1_ps(4.0);
+ ftwos = _mm256_set1_ps(2.0);
+ fones = _mm256_set1_ps(1.0);
+ fzeroes = _mm256_setzero_ps();
+ __m256i zeroes = _mm256_set1_epi32(0);
+ ones = _mm256_set1_epi32(1);
+ __m256i allones = _mm256_set1_epi32(0xffffffff);
+ twos = _mm256_set1_epi32(2);
+ fours = _mm256_set1_epi32(4);
+
+ cp1 = _mm256_set1_ps(1.0);
+ cp2 = _mm256_set1_ps(0.08333333333333333);
+ cp3 = _mm256_set1_ps(0.002777777777777778);
+ cp4 = _mm256_set1_ps(4.96031746031746e-05);
+ cp5 = _mm256_set1_ps(5.511463844797178e-07);
+ union bit256 condition1;
+ union bit256 condition3;
+
+ for (; number < eighthPoints; number++) {
+
+ aVal = _mm256_loadu_ps(aPtr);
+ // s = fabs(aVal)
+ s = _mm256_sub_ps(aVal,
+ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+ // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
+ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+ // r = q + q&1; q gives the quadrant, r*pi/4 is the nearest multiple of pi/2
+ r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
+
+ s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
+ s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
+ s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
+
+ s = _mm256_div_ps(
+ s,
+ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm256_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm256_mul_ps(
+ _mm256_add_ps(
+ _mm256_mul_ps(
+ _mm256_sub_ps(
+ _mm256_mul_ps(
+ _mm256_add_ps(
+ _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
+ s),
+ cp3),
+ s),
+ cp2),
+ s),
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++)
+ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ s = _mm256_div_ps(s, ftwos);
+
+ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+ cosine = _mm256_sub_ps(fones, s);
+
+ // if(((q+1)&2) != 0) { cosine=sine;}
+ condition1.int_vec =
+ _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
+ condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
+
+ // if(((q+2)&4) != 0) { cosine = -cosine;}
+ condition3.int_vec = _mm256_cmpeq_epi32(
+ _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
+ condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
+
+ cosine = _mm256_add_ps(
+ cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
+ cosine = _mm256_sub_ps(cosine,
+ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
+ condition3.float_vec));
+ _mm256_storeu_ps(bPtr, cosine);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = cos(*aPtr++);
+ }
}
#endif /* LV_HAVE_AVX2 for unaligned */
static inline void
volk_32f_cos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int quarterPoints = num_points / 4;
- unsigned int i = 0;
-
- __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
- __m128 sine, cosine, condition1, condition3;
- __m128i q, r, ones, twos, fours;
-
- m4pi = _mm_set1_ps(1.273239545);
- pio4A = _mm_set1_ps(0.78515625);
- pio4B = _mm_set1_ps(0.241876e-3);
- ffours = _mm_set1_ps(4.0);
- ftwos = _mm_set1_ps(2.0);
- fones = _mm_set1_ps(1.0);
- fzeroes = _mm_setzero_ps();
- ones = _mm_set1_epi32(1);
- twos = _mm_set1_epi32(2);
- fours = _mm_set1_epi32(4);
-
- cp1 = _mm_set1_ps(1.0);
- cp2 = _mm_set1_ps(0.83333333e-1);
- cp3 = _mm_set1_ps(0.2777778e-2);
- cp4 = _mm_set1_ps(0.49603e-4);
- cp5 = _mm_set1_ps(0.551e-6);
-
- for(;number < quarterPoints; number++){
- aVal = _mm_loadu_ps(aPtr);
- s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
- q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
- r = _mm_add_epi32(q, _mm_and_si128(q, ones));
-
- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
-
- s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
- s = _mm_mul_ps(s, s);
- // Evaluate Taylor series
- s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-
- for(i = 0; i < 3; i++){
- s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
- }
- s = _mm_div_ps(s, ftwos);
-
- sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
- cosine = _mm_sub_ps(fones, s);
-
- condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
- condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
-
- cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
- cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
- _mm_storeu_ps(bPtr, cosine);
- aPtr += 4;
- bPtr += 4;
- }
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ unsigned int i = 0;
+
+ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+ fzeroes;
+ __m128 sine, cosine, condition1, condition3;
+ __m128i q, r, ones, twos, fours;
+
+ m4pi = _mm_set1_ps(1.273239545);
+ pio4A = _mm_set1_ps(0.78515625);
+ pio4B = _mm_set1_ps(0.241876e-3);
+ ffours = _mm_set1_ps(4.0);
+ ftwos = _mm_set1_ps(2.0);
+ fones = _mm_set1_ps(1.0);
+ fzeroes = _mm_setzero_ps();
+ ones = _mm_set1_epi32(1);
+ twos = _mm_set1_epi32(2);
+ fours = _mm_set1_epi32(4);
+
+ cp1 = _mm_set1_ps(1.0);
+ cp2 = _mm_set1_ps(0.83333333e-1);
+ cp3 = _mm_set1_ps(0.2777778e-2);
+ cp4 = _mm_set1_ps(0.49603e-4);
+ cp5 = _mm_set1_ps(0.551e-6);
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_loadu_ps(aPtr);
+ s = _mm_sub_ps(aVal,
+ _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
+ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
+ r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+ s = _mm_div_ps(
+ s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm_mul_ps(
+ _mm_add_ps(
+ _mm_mul_ps(
+ _mm_sub_ps(
+ _mm_mul_ps(
+ _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
+ cp3),
+ s),
+ cp2),
+ s),
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++) {
+ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+ }
+ s = _mm_div_ps(s, ftwos);
+
+ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+ cosine = _mm_sub_ps(fones, s);
+
+ condition1 = _mm_cmpneq_ps(
+ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
+
+ condition3 = _mm_cmpneq_ps(
+ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
+
+ cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
+ cosine = _mm_sub_ps(
+ cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
+ _mm_storeu_ps(bPtr, cosine);
+ aPtr += 4;
+ bPtr += 4;
+ }
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *bPtr++ = cosf(*aPtr++);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *bPtr++ = cosf(*aPtr++);
+ }
}
#endif /* LV_HAVE_SSE4_1 for unaligned */
* Shibata, Naoki, "Efficient evaluation methods of elementary functions
* suitable for SIMD computation," in Springer-Verlag 2010
*/
-static inline void
-volk_32f_cos_32f_generic_fast(float* bVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_cos_32f_generic_fast(float* bVector,
+ const float* aVector,
+ unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- float m4pi = 1.273239544735162542821171882678754627704620361328125;
- float pio4A = 0.7853981554508209228515625;
- float pio4B = 0.794662735614792836713604629039764404296875e-8;
- float pio4C = 0.306161699786838294306516483068750264552437361480769e-16;
- int N = 3; // order of argument reduction
-
- unsigned int number;
- for(number = 0; number < num_points; number++){
- float s = fabs(*aPtr);
- int q = (int)(s * m4pi);
- int r = q + (q&1);
- s -= r * pio4A;
- s -= r * pio4B;
- s -= r * pio4C;
-
- s = s * 0.125; // 2^-N (<--3)
- s = s*s;
- s = ((((s/1814400. - 1.0/20160.0)*s + 1.0/360.0)*s - 1.0/12.0)*s + 1.0)*s;
-
- int i;
- for(i=0; i < N; ++i) {
- s = (4.0-s)*s;
- }
- s = s/2.0;
-
- float sine = sqrt((2.0-s)*s);
- float cosine = 1-s;
-
- if (((q+1) & 2) != 0) {
- s = cosine;
- cosine = sine;
- sine = s;
- }
- if (((q+2) & 4) != 0) {
- cosine = -cosine;
- }
- *bPtr = cosine;
- bPtr++;
- aPtr++;
- }
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ float m4pi = 1.273239544735162542821171882678754627704620361328125;
+ float pio4A = 0.7853981554508209228515625;
+ float pio4B = 0.794662735614792836713604629039764404296875e-8;
+ float pio4C = 0.306161699786838294306516483068750264552437361480769e-16;
+ int N = 3; // order of argument reduction
+
+ unsigned int number;
+ for (number = 0; number < num_points; number++) {
+ float s = fabs(*aPtr);
+ int q = (int)(s * m4pi);
+ int r = q + (q & 1);
+ s -= r * pio4A;
+ s -= r * pio4B;
+ s -= r * pio4C;
+
+ s = s * 0.125; // scale by 2^-N with N = 3 halving steps
+ s = s * s;
+ s = ((((s / 1814400. - 1.0 / 20160.0) * s + 1.0 / 360.0) * s - 1.0 / 12.0) * s +
+ 1.0) *
+ s;
+
+ int i;
+ for (i = 0; i < N; ++i) {
+ s = (4.0 - s) * s;
+ }
+ s = s / 2.0;
+
+ float sine = sqrt((2.0 - s) * s);
+ float cosine = 1 - s;
+
+ if (((q + 1) & 2) != 0) {
+ s = cosine;
+ cosine = sine;
+ sine = s;
+ }
+ if (((q + 2) & 4) != 0) {
+ cosine = -cosine;
+ }
+ *bPtr = cosine;
+ bPtr++;
+ aPtr++;
+ }
}
#endif /* LV_HAVE_GENERIC */
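Editor's note (annotation, not part of the patch): every cosine kernel above follows the
same recipe, easiest to read in volk_32f_cos_32f_generic_fast. Writing
$s_\theta := 2\,(1-\cos\theta)$, the double-angle identity gives the recurrence behind the
(4 - s)*s loop:

    s_{2\theta} = 2\,(1-\cos 2\theta) = 4 - 4\cos^2\theta = s_\theta\,(4 - s_\theta)

and the polynomial evaluated before that loop is the truncated series

    2\,(1-\cos u) = u^2 - \frac{u^4}{12} + \frac{u^6}{360} - \frac{u^8}{20160}
                    + \frac{u^{10}}{1814400} - \cdots

So the kernels (i) subtract r*pi/4, the nearest multiple of pi/2, with pi/4 split into
pio4A + pio4B + pio4C (Cody-Waite style, so the reduction stays accurate for larger
arguments), (ii) halve the reduced angle N = 3 times (the division by 8 = 2^N),
(iii) evaluate the series at the small angle, (iv) apply s <- s(4 - s) N times to undo the
halving, and (v) after the final division by two recover cos x' = 1 - s and
|sin x'| = sqrt((2 - s) s); the quadrant counter q then selects sine vs. cosine and the
sign.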
static inline void
volk_32f_cos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
- unsigned int number = 0;
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
- for(; number < num_points; number++){
- *bPtr++ = cosf(*aPtr++);
- }
+ for (; number < num_points; number++) {
+ *bPtr++ = cosf(*aPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
#include <volk/volk_neon_intrinsics.h>
static inline void
-volk_32f_cos_32f_neon(float* bVector, const float* aVector,
- unsigned int num_points)
+volk_32f_cos_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
{
unsigned int number = 0;
unsigned int quarter_points = num_points / 4;
float* bVectorPtr = bVector;
const float* aVectorPtr = aVector;
-
+
float32x4_t b_vec;
float32x4_t a_vec;
-
- for(number = 0; number < quarter_points; number++) {
+
+ for (number = 0; number < quarter_points; number++) {
a_vec = vld1q_f32(aVectorPtr);
// Prefetch next one, speeds things up
- __VOLK_PREFETCH(aVectorPtr+4);
+ __VOLK_PREFETCH(aVectorPtr + 4);
b_vec = _vcosq_f32(a_vec);
vst1q_f32(bVectorPtr, b_vec);
// move pointers ahead
- bVectorPtr+=4;
- aVectorPtr+=4;
+ bVectorPtr += 4;
+ aVectorPtr += 4;
}
-
+
// Deal with the rest
- for(number = quarter_points * 4; number < num_points; number++) {
+ for (number = quarter_points * 4; number < num_points; number++) {
*bVectorPtr++ = cosf(*aVectorPtr++);
}
}
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_expfast_32f(float* bVector, const float* aVector, unsigned int num_points)
- * \endcode
+ * void volk_32f_expfast_32f(float* bVector, const float* aVector, unsigned int
+ * num_points)
+ * \endcode
*
* \b Inputs
* \li aVector: Input vector of floats.
* \endcode
*/
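Editor's note (annotation, not part of the patch): a minimal caller sketch for the
dispatcher documented above, assuming the standard VOLK allocation helpers; the kernels
trade accuracy for speed, so expf() is only approximated:

#include <volk/volk.h>
#include <math.h>
#include <stdio.h>

int main(void)
{
    const unsigned int N = 8;
    float* in = (float*)volk_malloc(N * sizeof(float), volk_get_alignment());
    float* out = (float*)volk_malloc(N * sizeof(float), volk_get_alignment());

    for (unsigned int i = 0; i < N; i++)
        in[i] = 0.5f * (float)i; /* 0.0, 0.5, ..., 3.5 */

    volk_32f_expfast_32f(out, in, N); /* fast, approximate exponential */

    for (unsigned int i = 0; i < N; i++)
        printf("expfast(%4.1f) = %9.4f   expf = %9.4f\n", in[i], out[i], expf(in[i]));

    volk_free(in);
    volk_free(out);
    return 0;
}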
-#include <stdio.h>
-#include <math.h>
#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
#define Mln2 0.6931471805f
#define A 8388608.0f
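Editor's note (annotation, not part of the patch): Mln2 is ln 2 and A = 2^23 is one unit
in the last place of a float's exponent field, which marks the kernels below as the
classic bit-level (Schraudolph-style) exponential approximation. Rounding

    n = \frac{2^{23}}{\ln 2}\, x + (B - C)

to an integer and reinterpreting the 32-bit pattern n as an IEEE-754 float yields roughly
e^x, because that float's value is approximately 2^{(n - 127\cdot 2^{23})/2^{23}}: the
exponent field picks the right power of two and the mantissa bits interpolate linearly in
between. B (defined alongside these macros, outside this hunk) folds in the exponent
bias, and C is a small correction constant that balances the worst-case error; that is
why each kernel needs only one multiply-add, one float-to-int conversion and one bit cast
per element.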
#include <immintrin.h>
-static inline void
- volk_32f_expfast_32f_a_avx_fma(float* bVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_expfast_32f_a_avx_fma(float* bVector,
+ const float* aVector,
+ unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- __m256 aVal, bVal, a, b;
- __m256i exp;
- a = _mm256_set1_ps(A/Mln2);
- b = _mm256_set1_ps(B-C);
-
- for(;number < eighthPoints; number++){
- aVal = _mm256_load_ps(aPtr);
- exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a,aVal, b));
- bVal = _mm256_castsi256_ps(exp);
-
- _mm256_store_ps(bPtr, bVal);
- aPtr += 8;
- bPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *bPtr++ = expf(*aPtr++);
- }
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ __m256 aVal, bVal, a, b;
+ __m256i exp;
+ a = _mm256_set1_ps(A / Mln2);
+ b = _mm256_set1_ps(B - C);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
+ bVal = _mm256_castsi256_ps(exp);
+
+ _mm256_store_ps(bPtr, bVal);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = expf(*aPtr++);
+ }
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned */
#include <immintrin.h>
static inline void
- volk_32f_expfast_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
+volk_32f_expfast_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- __m256 aVal, bVal, a, b;
- __m256i exp;
- a = _mm256_set1_ps(A/Mln2);
- b = _mm256_set1_ps(B-C);
-
- for(;number < eighthPoints; number++){
- aVal = _mm256_load_ps(aPtr);
- exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a,aVal), b));
- bVal = _mm256_castsi256_ps(exp);
-
- _mm256_store_ps(bPtr, bVal);
- aPtr += 8;
- bPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *bPtr++ = expf(*aPtr++);
- }
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ __m256 aVal, bVal, a, b;
+ __m256i exp;
+ a = _mm256_set1_ps(A / Mln2);
+ b = _mm256_set1_ps(B - C);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
+ bVal = _mm256_castsi256_ps(exp);
+
+ _mm256_store_ps(bPtr, bVal);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = expf(*aPtr++);
+ }
}
#endif /* LV_HAVE_AVX for aligned */
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
-static inline void
-volk_32f_expfast_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_expfast_32f_a_sse4_1(float* bVector,
+ const float* aVector,
+ unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- __m128 aVal, bVal, a, b;
- __m128i exp;
- a = _mm_set1_ps(A/Mln2);
- b = _mm_set1_ps(B-C);
-
- for(;number < quarterPoints; number++){
- aVal = _mm_load_ps(aPtr);
- exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a,aVal), b));
- bVal = _mm_castsi128_ps(exp);
-
- _mm_store_ps(bPtr, bVal);
- aPtr += 4;
- bPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *bPtr++ = expf(*aPtr++);
- }
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 aVal, bVal, a, b;
+ __m128i exp;
+ a = _mm_set1_ps(A / Mln2);
+ b = _mm_set1_ps(B - C);
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
+ exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
+ bVal = _mm_castsi128_ps(exp);
+
+ _mm_store_ps(bPtr, bVal);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *bPtr++ = expf(*aPtr++);
+ }
}
#endif /* LV_HAVE_SSE4_1 for aligned */
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>
-static inline void
-volk_32f_expfast_32f_u_avx_fma(float* bVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_expfast_32f_u_avx_fma(float* bVector,
+ const float* aVector,
+ unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- __m256 aVal, bVal, a, b;
- __m256i exp;
- a = _mm256_set1_ps(A/Mln2);
- b = _mm256_set1_ps(B-C);
-
- for(;number < eighthPoints; number++){
- aVal = _mm256_loadu_ps(aPtr);
- exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a,aVal, b));
- bVal = _mm256_castsi256_ps(exp);
-
- _mm256_storeu_ps(bPtr, bVal);
- aPtr += 8;
- bPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *bPtr++ = expf(*aPtr++);
- }
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ __m256 aVal, bVal, a, b;
+ __m256i exp;
+ a = _mm256_set1_ps(A / Mln2);
+ b = _mm256_set1_ps(B - C);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
+ bVal = _mm256_castsi256_ps(exp);
+
+ _mm256_storeu_ps(bPtr, bVal);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = expf(*aPtr++);
+ }
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA for unaligned */
static inline void
volk_32f_expfast_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- __m256 aVal, bVal, a, b;
- __m256i exp;
- a = _mm256_set1_ps(A/Mln2);
- b = _mm256_set1_ps(B-C);
-
- for(;number < eighthPoints; number++){
- aVal = _mm256_loadu_ps(aPtr);
- exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a,aVal), b));
- bVal = _mm256_castsi256_ps(exp);
-
- _mm256_storeu_ps(bPtr, bVal);
- aPtr += 8;
- bPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *bPtr++ = expf(*aPtr++);
- }
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ __m256 aVal, bVal, a, b;
+ __m256i exp;
+ a = _mm256_set1_ps(A / Mln2);
+ b = _mm256_set1_ps(B - C);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
+ bVal = _mm256_castsi256_ps(exp);
+
+ _mm256_storeu_ps(bPtr, bVal);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = expf(*aPtr++);
+ }
}
#endif /* LV_HAVE_AVX for unaligned */
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
-static inline void
-volk_32f_expfast_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_expfast_32f_u_sse4_1(float* bVector,
+ const float* aVector,
+ unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- __m128 aVal, bVal, a, b;
- __m128i exp;
- a = _mm_set1_ps(A/Mln2);
- b = _mm_set1_ps(B-C);
-
- for(;number < quarterPoints; number++){
- aVal = _mm_loadu_ps(aPtr);
- exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a,aVal), b));
- bVal = _mm_castsi128_ps(exp);
-
- _mm_storeu_ps(bPtr, bVal);
- aPtr += 4;
- bPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *bPtr++ = expf(*aPtr++);
- }
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 aVal, bVal, a, b;
+ __m128i exp;
+ a = _mm_set1_ps(A / Mln2);
+ b = _mm_set1_ps(B - C);
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_loadu_ps(aPtr);
+ exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
+ bVal = _mm_castsi128_ps(exp);
+
+ _mm_storeu_ps(bPtr, bVal);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *bPtr++ = expf(*aPtr++);
+ }
}
#endif /* LV_HAVE_SSE4_1 for unaligned */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_expfast_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_expfast_32f_generic(float* bVector,
+ const float* aVector,
+ unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
- unsigned int number = 0;
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
- for(number = 0; number < num_points; number++){
- *bPtr++ = expf(*aPtr++);
- }
+ for (number = 0; number < num_points; number++) {
+ *bPtr++ = expf(*aPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifndef INCLUDED_volk_32f_index_max_16u_a_H
#define INCLUDED_volk_32f_index_max_16u_a_H
-#include <volk/volk_common.h>
-#include <volk/volk_common.h>
#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_AVX
#include <immintrin.h>
static inline void
-volk_32f_index_max_16u_a_avx(uint16_t* target, const float* src0,
- uint32_t num_points)
+volk_32f_index_max_16u_a_avx(uint16_t* target, const float* src0, uint32_t num_points)
{
- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
-
- uint32_t number = 0;
- const uint32_t eighthPoints = num_points / 8;
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
- float* inputPtr = (float*)src0;
+ uint32_t number = 0;
+ const uint32_t eighthPoints = num_points / 8;
- __m256 indexIncrementValues = _mm256_set1_ps(8);
- __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
+ float* inputPtr = (float*)src0;
- float max = src0[0];
- float index = 0;
- __m256 maxValues = _mm256_set1_ps(max);
- __m256 maxValuesIndex = _mm256_setzero_ps();
- __m256 compareResults;
- __m256 currentValues;
+ __m256 indexIncrementValues = _mm256_set1_ps(8);
+ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
- __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
- __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
+ float max = src0[0];
+ float index = 0;
+ __m256 maxValues = _mm256_set1_ps(max);
+ __m256 maxValuesIndex = _mm256_setzero_ps();
+ __m256 compareResults;
+ __m256 currentValues;
- for(;number < eighthPoints; number++){
+ __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
+ __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
- currentValues = _mm256_load_ps(inputPtr); inputPtr += 8;
- currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
+ for (; number < eighthPoints; number++) {
- compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
+ currentValues = _mm256_load_ps(inputPtr);
+ inputPtr += 8;
+ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
- maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
- maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
- }
+ compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
- // Calculate the largest value from the remaining 4 points
- _mm256_store_ps(maxValuesBuffer, maxValues);
- _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
+ maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
+ maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
+ }
- for(number = 0; number < 8; number++){
- if(maxValuesBuffer[number] > max){
- index = maxIndexesBuffer[number];
- max = maxValuesBuffer[number];
- } else if(maxValuesBuffer[number] == max){
- if (index > maxIndexesBuffer[number])
- index = maxIndexesBuffer[number];
+ // Calculate the largest value from the remaining 8 points
+ _mm256_store_ps(maxValuesBuffer, maxValues);
+ _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for (number = 0; number < 8; number++) {
+ if (maxValuesBuffer[number] > max) {
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ } else if (maxValuesBuffer[number] == max) {
+ if (index > maxIndexesBuffer[number])
+ index = maxIndexesBuffer[number];
+ }
}
- }
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- if(src0[number] > max){
- index = number;
- max = src0[number];
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ if (src0[number] > max) {
+ index = number;
+ max = src0[number];
+ }
}
- }
- target[0] = (uint16_t)index;
+ target[0] = (uint16_t)index;
}
#endif /*LV_HAVE_AVX*/
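Editor's note (annotation, not part of the patch): the index bookkeeping in the SIMD
argmax kernels works because currentIndexes starts below zero and is bumped before every
compare. In the 8-wide AVX kernel, lane j starts at j - 8 and gains 8 per iteration, so
during iteration number it holds

    index(lane j) = 8 * number + j      (number = 0 -> 0..7, number = 1 -> 8..15, ...)

which is exactly the global position of the element loaded into that lane. The blend then
keeps, per lane, the index of that lane's running maximum, and the scalar loop over
maxValuesBuffer/maxIndexesBuffer reduces the eight per-lane winners to one result,
preferring the smaller index on ties. The SSE variants do the same with a stride of 4.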
#include <smmintrin.h>
static inline void
-volk_32f_index_max_16u_a_sse4_1(uint16_t* target, const float* src0,
- uint32_t num_points)
+volk_32f_index_max_16u_a_sse4_1(uint16_t* target, const float* src0, uint32_t num_points)
{
- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
-
- uint32_t number = 0;
- const uint32_t quarterPoints = num_points / 4;
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
- float* inputPtr = (float*)src0;
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 4;
- __m128 indexIncrementValues = _mm_set1_ps(4);
- __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
+ float* inputPtr = (float*)src0;
- float max = src0[0];
- float index = 0;
- __m128 maxValues = _mm_set1_ps(max);
- __m128 maxValuesIndex = _mm_setzero_ps();
- __m128 compareResults;
- __m128 currentValues;
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+ float max = src0[0];
+ float index = 0;
+ __m128 maxValues = _mm_set1_ps(max);
+ __m128 maxValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
- for(;number < quarterPoints; number++){
+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
- currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+ for (; number < quarterPoints; number++) {
- compareResults = _mm_cmpgt_ps(currentValues, maxValues);
+ currentValues = _mm_load_ps(inputPtr);
+ inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
- maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
- maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
- }
+ compareResults = _mm_cmpgt_ps(currentValues, maxValues);
- // Calculate the largest value from the remaining 4 points
- _mm_store_ps(maxValuesBuffer, maxValues);
- _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+ maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
+ maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
+ }
- for(number = 0; number < 4; number++){
- if(maxValuesBuffer[number] > max){
- index = maxIndexesBuffer[number];
- max = maxValuesBuffer[number];
- } else if(maxValuesBuffer[number] == max){
- if (index > maxIndexesBuffer[number])
- index = maxIndexesBuffer[number];
+ // Calculate the largest value from the remaining 4 points
+ _mm_store_ps(maxValuesBuffer, maxValues);
+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for (number = 0; number < 4; number++) {
+ if (maxValuesBuffer[number] > max) {
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ } else if (maxValuesBuffer[number] == max) {
+ if (index > maxIndexesBuffer[number])
+ index = maxIndexesBuffer[number];
+ }
}
- }
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- if(src0[number] > max){
- index = number;
- max = src0[number];
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ if (src0[number] > max) {
+ index = number;
+ max = src0[number];
+ }
}
- }
- target[0] = (uint16_t)index;
+ target[0] = (uint16_t)index;
}
#endif /*LV_HAVE_SSE4_1*/
#include <xmmintrin.h>
static inline void
-volk_32f_index_max_16u_a_sse(uint16_t* target, const float* src0,
- uint32_t num_points)
+volk_32f_index_max_16u_a_sse(uint16_t* target, const float* src0, uint32_t num_points)
{
- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
-
- uint32_t number = 0;
- const uint32_t quarterPoints = num_points / 4;
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
- float* inputPtr = (float*)src0;
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 4;
- __m128 indexIncrementValues = _mm_set1_ps(4);
- __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
+ float* inputPtr = (float*)src0;
- float max = src0[0];
- float index = 0;
- __m128 maxValues = _mm_set1_ps(max);
- __m128 maxValuesIndex = _mm_setzero_ps();
- __m128 compareResults;
- __m128 currentValues;
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+ float max = src0[0];
+ float index = 0;
+ __m128 maxValues = _mm_set1_ps(max);
+ __m128 maxValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
- for(;number < quarterPoints; number++){
+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
- currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+ for (; number < quarterPoints; number++) {
- compareResults = _mm_cmpgt_ps(currentValues, maxValues);
+ currentValues = _mm_load_ps(inputPtr);
+ inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
- maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
- _mm_andnot_ps(compareResults, maxValuesIndex));
- maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
- _mm_andnot_ps(compareResults, maxValues));
- }
+ compareResults = _mm_cmpgt_ps(currentValues, maxValues);
- // Calculate the largest value from the remaining 4 points
- _mm_store_ps(maxValuesBuffer, maxValues);
- _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
+ _mm_andnot_ps(compareResults, maxValuesIndex));
+ maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
+ _mm_andnot_ps(compareResults, maxValues));
+ }
- for(number = 0; number < 4; number++){
- if(maxValuesBuffer[number] > max){
- index = maxIndexesBuffer[number];
- max = maxValuesBuffer[number];
- } else if(maxValuesBuffer[number] == max){
- if (index > maxIndexesBuffer[number])
- index = maxIndexesBuffer[number];
+ // Calculate the largest value from the remaining 4 points
+ _mm_store_ps(maxValuesBuffer, maxValues);
+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for (number = 0; number < 4; number++) {
+ if (maxValuesBuffer[number] > max) {
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ } else if (maxValuesBuffer[number] == max) {
+ if (index > maxIndexesBuffer[number])
+ index = maxIndexesBuffer[number];
+ }
}
- }
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- if(src0[number] > max){
- index = number;
- max = src0[number];
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ if (src0[number] > max) {
+ index = number;
+ max = src0[number];
+ }
}
- }
- target[0] = (uint16_t)index;
+ target[0] = (uint16_t)index;
}
#endif /*LV_HAVE_SSE*/
#ifdef LV_HAVE_GENERIC
static inline void
-volk_32f_index_max_16u_generic(uint16_t* target, const float* src0,
- uint32_t num_points)
+volk_32f_index_max_16u_generic(uint16_t* target, const float* src0, uint32_t num_points)
{
- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
- float max = src0[0];
- uint16_t index = 0;
+ float max = src0[0];
+ uint16_t index = 0;
- uint32_t i = 1;
+ uint32_t i = 1;
- for(; i < num_points; ++i) {
- if(src0[i] > max) {
- index = i;
- max = src0[i];
+ for (; i < num_points; ++i) {
+ if (src0[i] > max) {
+ index = i;
+ max = src0[i];
+ }
}
- }
- target[0] = index;
+ target[0] = index;
}
#endif /*LV_HAVE_GENERIC*/
#endif /*INCLUDED_volk_32f_index_max_16u_a_H*/
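Editor's note (annotation, not part of the patch): per the kernels above, ties resolve to
the smallest index, i.e. the first occurrence of the maximum wins. A tiny sketch against
the dispatcher, assuming it keeps the usual volk_<kernel> naming; an unaligned stack
buffer is fine because the dispatcher falls back to the _u variants:

#include <volk/volk.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* 3.0f appears twice; the kernels report the first occurrence (index 2). */
    float buf[8] = { 0.5f, 1.0f, 3.0f, 2.0f, 3.0f, -1.0f, 0.0f, 2.5f };
    uint16_t idx = 0;

    volk_32f_index_max_16u(&idx, buf, 8);
    printf("argmax index: %u (value %f)\n", (unsigned)idx, buf[idx]);
    return 0;
}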
-
#ifndef INCLUDED_volk_32f_index_max_16u_u_H
#define INCLUDED_volk_32f_index_max_16u_u_H
-#include <volk/volk_common.h>
-#include <volk/volk_common.h>
#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_AVX
#include <immintrin.h>
static inline void
-volk_32f_index_max_16u_u_avx(uint16_t* target, const float* src0,
- uint32_t num_points)
+volk_32f_index_max_16u_u_avx(uint16_t* target, const float* src0, uint32_t num_points)
{
- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
-
- uint32_t number = 0;
- const uint32_t eighthPoints = num_points / 8;
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
- float* inputPtr = (float*)src0;
+ uint32_t number = 0;
+ const uint32_t eighthPoints = num_points / 8;
- __m256 indexIncrementValues = _mm256_set1_ps(8);
- __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
+ float* inputPtr = (float*)src0;
- float max = src0[0];
- float index = 0;
- __m256 maxValues = _mm256_set1_ps(max);
- __m256 maxValuesIndex = _mm256_setzero_ps();
- __m256 compareResults;
- __m256 currentValues;
+ __m256 indexIncrementValues = _mm256_set1_ps(8);
+ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
- __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
- __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
+ float max = src0[0];
+ float index = 0;
+ __m256 maxValues = _mm256_set1_ps(max);
+ __m256 maxValuesIndex = _mm256_setzero_ps();
+ __m256 compareResults;
+ __m256 currentValues;
- for(;number < eighthPoints; number++){
+ __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
+ __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
- currentValues = _mm256_loadu_ps(inputPtr); inputPtr += 8;
- currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
+ for (; number < eighthPoints; number++) {
- compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
+ currentValues = _mm256_loadu_ps(inputPtr);
+ inputPtr += 8;
+ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
- maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
- maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
- }
+ compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
- // Calculate the largest value from the remaining 4 points
- _mm256_storeu_ps(maxValuesBuffer, maxValues);
- _mm256_storeu_ps(maxIndexesBuffer, maxValuesIndex);
+ maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
+ maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
+ }
- for(number = 0; number < 8; number++){
- if(maxValuesBuffer[number] > max){
- index = maxIndexesBuffer[number];
- max = maxValuesBuffer[number];
- } else if(maxValuesBuffer[number] == max){
- if (index > maxIndexesBuffer[number])
- index = maxIndexesBuffer[number];
+ // Calculate the largest value from the remaining 8 points
+ _mm256_storeu_ps(maxValuesBuffer, maxValues);
+ _mm256_storeu_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for (number = 0; number < 8; number++) {
+ if (maxValuesBuffer[number] > max) {
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ } else if (maxValuesBuffer[number] == max) {
+ if (index > maxIndexesBuffer[number])
+ index = maxIndexesBuffer[number];
+ }
}
- }
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- if(src0[number] > max){
- index = number;
- max = src0[number];
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ if (src0[number] > max) {
+ index = number;
+ max = src0[number];
+ }
}
- }
- target[0] = (uint16_t)index;
+ target[0] = (uint16_t)index;
}
#endif /*LV_HAVE_AVX*/
*
* \b Overview
*
- * Returns Argmax_i x[i]. Finds and returns the index which contains the first maximum value in the given vector.
+ * Returns Argmax_i x[i]. Finds and returns the index which contains the first maximum
+ * value in the given vector.
*
* <b>Dispatcher Prototype</b>
* \code
#ifndef INCLUDED_volk_32f_index_max_32u_a_H
#define INCLUDED_volk_32f_index_max_32u_a_H
-#include <volk/volk_common.h>
-#include <volk/volk_common.h>
#include <inttypes.h>
#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_SSE4_1
-#include<smmintrin.h>
+#include <smmintrin.h>
static inline void
volk_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
{
- if(num_points > 0){
- uint32_t number = 0;
- const uint32_t quarterPoints = num_points / 4;
+ if (num_points > 0) {
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 4;
- float* inputPtr = (float*)src0;
+ float* inputPtr = (float*)src0;
- __m128 indexIncrementValues = _mm_set1_ps(4);
- __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
- float max = src0[0];
- float index = 0;
- __m128 maxValues = _mm_set1_ps(max);
- __m128 maxValuesIndex = _mm_setzero_ps();
- __m128 compareResults;
- __m128 currentValues;
+ float max = src0[0];
+ float index = 0;
+ __m128 maxValues = _mm_set1_ps(max);
+ __m128 maxValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
- for(;number < quarterPoints; number++){
+ for (; number < quarterPoints; number++) {
- currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+ currentValues = _mm_load_ps(inputPtr);
+ inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
- compareResults = _mm_cmpgt_ps(currentValues, maxValues);
+ compareResults = _mm_cmpgt_ps(currentValues, maxValues);
- maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
- maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
- }
+ maxValuesIndex =
+ _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
+ maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
+ }
- // Calculate the largest value from the remaining 4 points
- _mm_store_ps(maxValuesBuffer, maxValues);
- _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
-
- for(number = 0; number < 4; number++){
- if(maxValuesBuffer[number] > max){
- index = maxIndexesBuffer[number];
- max = maxValuesBuffer[number];
- } else if(maxValuesBuffer[number] == max){
- if (index > maxIndexesBuffer[number])
- index = maxIndexesBuffer[number];
- }
- }
+ // Calculate the largest value from the remaining 4 points
+ _mm_store_ps(maxValuesBuffer, maxValues);
+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for (number = 0; number < 4; number++) {
+ if (maxValuesBuffer[number] > max) {
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ } else if (maxValuesBuffer[number] == max) {
+ if (index > maxIndexesBuffer[number])
+ index = maxIndexesBuffer[number];
+ }
+ }
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- if(src0[number] > max){
- index = number;
- max = src0[number];
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ if (src0[number] > max) {
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (uint32_t)index;
}
- target[0] = (uint32_t)index;
- }
}
#endif /*LV_HAVE_SSE4_1*/
#ifdef LV_HAVE_SSE
-#include<xmmintrin.h>
+#include <xmmintrin.h>
static inline void
volk_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points)
{
- if(num_points > 0){
- uint32_t number = 0;
- const uint32_t quarterPoints = num_points / 4;
+ if (num_points > 0) {
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 4;
- float* inputPtr = (float*)src0;
+ float* inputPtr = (float*)src0;
- __m128 indexIncrementValues = _mm_set1_ps(4);
- __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
- float max = src0[0];
- float index = 0;
- __m128 maxValues = _mm_set1_ps(max);
- __m128 maxValuesIndex = _mm_setzero_ps();
- __m128 compareResults;
- __m128 currentValues;
+ float max = src0[0];
+ float index = 0;
+ __m128 maxValues = _mm_set1_ps(max);
+ __m128 maxValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
- for(;number < quarterPoints; number++){
+ for (; number < quarterPoints; number++) {
- currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+ currentValues = _mm_load_ps(inputPtr);
+ inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
- compareResults = _mm_cmpgt_ps(currentValues, maxValues);
+ compareResults = _mm_cmpgt_ps(currentValues, maxValues);
- maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
- _mm_andnot_ps(compareResults, maxValuesIndex));
+ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
+ _mm_andnot_ps(compareResults, maxValuesIndex));
- maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
- _mm_andnot_ps(compareResults, maxValues));
- }
+ maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
+ _mm_andnot_ps(compareResults, maxValues));
+ }
- // Calculate the largest value from the remaining 4 points
- _mm_store_ps(maxValuesBuffer, maxValues);
- _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
-
- for(number = 0; number < 4; number++){
- if(maxValuesBuffer[number] > max){
- index = maxIndexesBuffer[number];
- max = maxValuesBuffer[number];
- } else if(maxValuesBuffer[number] == max){
- if (index > maxIndexesBuffer[number])
- index = maxIndexesBuffer[number];
- }
- }
+ // Calculate the largest value from the remaining 4 points
+ _mm_store_ps(maxValuesBuffer, maxValues);
+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for (number = 0; number < 4; number++) {
+ if (maxValuesBuffer[number] > max) {
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ } else if (maxValuesBuffer[number] == max) {
+ if (index > maxIndexesBuffer[number])
+ index = maxIndexesBuffer[number];
+ }
+ }
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- if(src0[number] > max){
- index = number;
- max = src0[number];
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ if (src0[number] > max) {
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (uint32_t)index;
}
- target[0] = (uint32_t)index;
- }
}
#endif /*LV_HAVE_SSE*/
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void volk_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points)
+static inline void
+volk_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points)
{
- if(num_points > 0)
- {
- uint32_t number = 0;
- const uint32_t quarterPoints = num_points / 8;
-
- float* inputPtr = (float*)src0;
-
- __m256 indexIncrementValues = _mm256_set1_ps(8);
- __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
-
- float max = src0[0];
- float index = 0;
- __m256 maxValues = _mm256_set1_ps(max);
- __m256 maxValuesIndex = _mm256_setzero_ps();
- __m256 compareResults;
- __m256 currentValues;
-
- __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
- __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
-
- for(;number < quarterPoints; number++)
- {
- currentValues = _mm256_load_ps(inputPtr); inputPtr += 8;
- currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
- compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
- maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
- maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
- }
-
- // Calculate the largest value from the remaining 8 points
- _mm256_store_ps(maxValuesBuffer, maxValues);
- _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
-
- for(number = 0; number < 8; number++)
- {
- if(maxValuesBuffer[number] > max)
- {
- index = maxIndexesBuffer[number];
- max = maxValuesBuffer[number];
- }
- else if(maxValuesBuffer[number] == max){
- if (index > maxIndexesBuffer[number])
- index = maxIndexesBuffer[number];
- }
- }
-
- number = quarterPoints * 8;
- for(;number < num_points; number++)
- {
- if(src0[number] > max)
- {
- index = number;
- max = src0[number];
- }
- }
- target[0] = (uint32_t)index;
+ if (num_points > 0) {
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 8;
+
+ float* inputPtr = (float*)src0;
+
+ __m256 indexIncrementValues = _mm256_set1_ps(8);
+ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
+
+ float max = src0[0];
+ float index = 0;
+ __m256 maxValues = _mm256_set1_ps(max);
+ __m256 maxValuesIndex = _mm256_setzero_ps();
+ __m256 compareResults;
+ __m256 currentValues;
+
+ __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
+ __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
+
+ for (; number < quarterPoints; number++) {
+ currentValues = _mm256_load_ps(inputPtr);
+ inputPtr += 8;
+ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
+ compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
+ maxValuesIndex =
+ _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
+ maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
+ }
+
+ // Calculate the largest value from the remaining 8 points
+ _mm256_store_ps(maxValuesBuffer, maxValues);
+ _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for (number = 0; number < 8; number++) {
+ if (maxValuesBuffer[number] > max) {
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ } else if (maxValuesBuffer[number] == max) {
+ if (index > maxIndexesBuffer[number])
+ index = maxIndexesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 8;
+ for (; number < num_points; number++) {
+ if (src0[number] > max) {
+ index = number;
+ max = src0[number];
+ }
}
+ target[0] = (uint32_t)index;
+ }
}
#endif /*LV_HAVE_AVX*/
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void volk_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points)
+static inline void
+volk_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points)
{
- if(num_points > 0)
- {
- uint32_t number = 0;
- const uint32_t quarterPoints = num_points / 4;
-
- float* inputPtr = (float*)src0;
- float32x4_t indexIncrementValues = vdupq_n_f32(4);
- __VOLK_ATTR_ALIGNED(16) float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f };
- float32x4_t currentIndexes = vld1q_f32(currentIndexes_float);
-
- float max = src0[0];
- float index = 0;
- float32x4_t maxValues = vdupq_n_f32(max);
- uint32x4_t maxValuesIndex = vmovq_n_u32(0);
- uint32x4_t compareResults;
- uint32x4_t currentIndexes_u;
- float32x4_t currentValues;
-
- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
-
- for(;number < quarterPoints; number++)
- {
- currentValues = vld1q_f32(inputPtr); inputPtr += 4;
- currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues);
- currentIndexes_u = vcvtq_u32_f32(currentIndexes);
- compareResults = vcleq_f32(currentValues, maxValues);
- maxValuesIndex = vorrq_u32( vandq_u32( compareResults, maxValuesIndex ), vbicq_u32(currentIndexes_u, compareResults) );
- maxValues = vmaxq_f32(currentValues, maxValues);
- }
-
- // Calculate the largest value from the remaining 4 points
- vst1q_f32(maxValuesBuffer, maxValues);
- vst1q_f32(maxIndexesBuffer, vcvtq_f32_u32(maxValuesIndex));
- for(number = 0; number < 4; number++)
- {
- if(maxValuesBuffer[number] > max)
- {
- index = maxIndexesBuffer[number];
- max = maxValuesBuffer[number];
- }
- else if(maxValues[number] == max){
- if (index > maxIndexesBuffer[number])
- index = maxIndexesBuffer[number];
- }
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++)
- {
- if(src0[number] > max)
- {
- index = number;
- max = src0[number];
- }
- }
- target[0] = (uint32_t)index;
+ if (num_points > 0) {
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)src0;
+ float32x4_t indexIncrementValues = vdupq_n_f32(4);
+ __VOLK_ATTR_ALIGNED(16)
+ float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f };
+ float32x4_t currentIndexes = vld1q_f32(currentIndexes_float);
+
+ float max = src0[0];
+ float index = 0;
+ float32x4_t maxValues = vdupq_n_f32(max);
+ uint32x4_t maxValuesIndex = vmovq_n_u32(0);
+ uint32x4_t compareResults;
+ uint32x4_t currentIndexes_u;
+ float32x4_t currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+ currentValues = vld1q_f32(inputPtr);
+ inputPtr += 4;
+ currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues);
+ currentIndexes_u = vcvtq_u32_f32(currentIndexes);
+ compareResults = vcleq_f32(currentValues, maxValues);
+ maxValuesIndex = vorrq_u32(vandq_u32(compareResults, maxValuesIndex),
+ vbicq_u32(currentIndexes_u, compareResults));
+ maxValues = vmaxq_f32(currentValues, maxValues);
+ }
+
+ // Calculate the largest value from the remaining 4 points
+ vst1q_f32(maxValuesBuffer, maxValues);
+ vst1q_f32(maxIndexesBuffer, vcvtq_f32_u32(maxValuesIndex));
+ for (number = 0; number < 4; number++) {
+ if (maxValuesBuffer[number] > max) {
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ } else if (maxValues[number] == max) {
+ if (index > maxIndexesBuffer[number])
+ index = maxIndexesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ if (src0[number] > max) {
+ index = number;
+ max = src0[number];
+ }
}
+ target[0] = (uint32_t)index;
+ }
}
#endif /*LV_HAVE_NEON*/
static inline void
volk_32f_index_max_32u_generic(uint32_t* target, const float* src0, uint32_t num_points)
{
- if(num_points > 0){
- float max = src0[0];
- uint32_t index = 0;
+ if (num_points > 0) {
+ float max = src0[0];
+ uint32_t index = 0;
- uint32_t i = 1;
+ uint32_t i = 1;
- for(; i < num_points; ++i) {
- if(src0[i] > max){
- index = i;
- max = src0[i];
- }
+ for (; i < num_points; ++i) {
+ if (src0[i] > max) {
+ index = i;
+ max = src0[i];
+ }
+ }
+ target[0] = index;
}
- target[0] = index;
- }
}
#endif /*LV_HAVE_GENERIC*/
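/*
 * A minimal caller sketch for the kernel above, for illustration only: it assumes
 * this header is included and uses the generic entry point defined above; the
 * sample buffer and its contents are made up.
 */
#include <stdint.h>
#include <stdio.h>

static void index_max_example_sketch(void)
{
    const float samples[8] = { 0.1f, -2.0f, 3.5f, 3.5f, 1.0f, 0.0f, -1.0f, 2.0f };
    uint32_t max_index = 0;
    volk_32f_index_max_32u_generic(&max_index, samples, 8);
    /* target receives the index of the first occurrence of the maximum: 2 here */
    printf("max at index %u (value %f)\n", max_index, samples[max_index]);
}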
#ifndef INCLUDED_volk_32f_index_max_32u_u_H
#define INCLUDED_volk_32f_index_max_32u_u_H
-#include <volk/volk_common.h>
-#include <volk/volk_common.h>
#include <inttypes.h>
#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void volk_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points)
+static inline void
+volk_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points)
{
- if(num_points > 0)
- {
- uint32_t number = 0;
- const uint32_t quarterPoints = num_points / 8;
-
- float* inputPtr = (float*)src0;
-
- __m256 indexIncrementValues = _mm256_set1_ps(8);
- __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
-
- float max = src0[0];
- float index = 0;
- __m256 maxValues = _mm256_set1_ps(max);
- __m256 maxValuesIndex = _mm256_setzero_ps();
- __m256 compareResults;
- __m256 currentValues;
-
- __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
- __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
-
- for(;number < quarterPoints; number++)
- {
- currentValues = _mm256_loadu_ps(inputPtr); inputPtr += 8;
- currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
- compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
- maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
- maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
- }
-
- // Calculate the largest value from the remaining 8 points
- _mm256_store_ps(maxValuesBuffer, maxValues);
- _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
-
- for(number = 0; number < 8; number++)
- {
- if(maxValuesBuffer[number] > max)
- {
- index = maxIndexesBuffer[number];
- max = maxValuesBuffer[number];
- }
- else if(maxValuesBuffer[number] == max){
- if (index > maxIndexesBuffer[number])
- index = maxIndexesBuffer[number];
- }
- }
-
- number = quarterPoints * 8;
- for(;number < num_points; number++)
- {
- if(src0[number] > max)
- {
- index = number;
- max = src0[number];
- }
- }
- target[0] = (uint32_t)index;
+ if (num_points > 0) {
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 8;
+
+ float* inputPtr = (float*)src0;
+
+ __m256 indexIncrementValues = _mm256_set1_ps(8);
+ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
+
+ float max = src0[0];
+ float index = 0;
+ __m256 maxValues = _mm256_set1_ps(max);
+ __m256 maxValuesIndex = _mm256_setzero_ps();
+ __m256 compareResults;
+ __m256 currentValues;
+
+ __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
+ __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
+
+ for (; number < quarterPoints; number++) {
+ currentValues = _mm256_loadu_ps(inputPtr);
+ inputPtr += 8;
+ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
+ compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
+ maxValuesIndex =
+ _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
+ maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
}
+
+ // Calculate the largest value from the remaining 8 points
+ _mm256_store_ps(maxValuesBuffer, maxValues);
+ _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for (number = 0; number < 8; number++) {
+ if (maxValuesBuffer[number] > max) {
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ } else if (maxValuesBuffer[number] == max) {
+ if (index > maxIndexesBuffer[number])
+ index = maxIndexesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 8;
+ for (; number < num_points; number++) {
+ if (src0[number] > max) {
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (uint32_t)index;
+ }
}
#endif /*LV_HAVE_AVX*/
#ifdef LV_HAVE_SSE4_1
-#include<smmintrin.h>
+#include <smmintrin.h>
-static inline void volk_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
+static inline void
+volk_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
{
- if(num_points > 0)
- {
- uint32_t number = 0;
- const uint32_t quarterPoints = num_points / 4;
-
- float* inputPtr = (float*)src0;
-
- __m128 indexIncrementValues = _mm_set1_ps(4);
- __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
-
- float max = src0[0];
- float index = 0;
- __m128 maxValues = _mm_set1_ps(max);
- __m128 maxValuesIndex = _mm_setzero_ps();
- __m128 compareResults;
- __m128 currentValues;
-
- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
-
- for(;number < quarterPoints; number++)
- {
- currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4;
- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
- compareResults = _mm_cmpgt_ps(currentValues, maxValues);
- maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
- maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
- }
-
- // Calculate the largest value from the remaining 4 points
- _mm_store_ps(maxValuesBuffer, maxValues);
- _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
-
- for(number = 0; number < 4; number++)
- {
- if(maxValuesBuffer[number] > max)
- {
- index = maxIndexesBuffer[number];
- max = maxValuesBuffer[number];
- }
- else if(maxValuesBuffer[number] == max){
- if (index > maxIndexesBuffer[number])
- index = maxIndexesBuffer[number];
- }
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++)
- {
- if(src0[number] > max)
- {
- index = number;
- max = src0[number];
- }
- }
- target[0] = (uint32_t)index;
+ if (num_points > 0) {
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)src0;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
+
+ float max = src0[0];
+ float index = 0;
+ __m128 maxValues = _mm_set1_ps(max);
+ __m128 maxValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+ currentValues = _mm_loadu_ps(inputPtr);
+ inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+ compareResults = _mm_cmpgt_ps(currentValues, maxValues);
+ maxValuesIndex =
+ _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
+ maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
}
+
+ // Calculate the largest value from the remaining 4 points
+ _mm_store_ps(maxValuesBuffer, maxValues);
+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for (number = 0; number < 4; number++) {
+ if (maxValuesBuffer[number] > max) {
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ } else if (maxValuesBuffer[number] == max) {
+ if (index > maxIndexesBuffer[number])
+ index = maxIndexesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ if (src0[number] > max) {
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (uint32_t)index;
+ }
}
#endif /*LV_HAVE_SSE4_1*/
#ifdef LV_HAVE_SSE
-#include<xmmintrin.h>
+#include <xmmintrin.h>
-static inline void volk_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points)
+static inline void
+volk_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points)
{
- if(num_points > 0)
- {
- uint32_t number = 0;
- const uint32_t quarterPoints = num_points / 4;
-
- float* inputPtr = (float*)src0;
-
- __m128 indexIncrementValues = _mm_set1_ps(4);
- __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
-
- float max = src0[0];
- float index = 0;
- __m128 maxValues = _mm_set1_ps(max);
- __m128 maxValuesIndex = _mm_setzero_ps();
- __m128 compareResults;
- __m128 currentValues;
-
- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
-
- for(;number < quarterPoints; number++)
- {
- currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4;
- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
- compareResults = _mm_cmpgt_ps(currentValues, maxValues);
- maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
- _mm_andnot_ps(compareResults, maxValuesIndex));
- maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
- _mm_andnot_ps(compareResults, maxValues));
- }
-
- // Calculate the largest value from the remaining 4 points
- _mm_store_ps(maxValuesBuffer, maxValues);
- _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
-
- for(number = 0; number < 4; number++)
- {
- if(maxValuesBuffer[number] > max)
- {
- index = maxIndexesBuffer[number];
- max = maxValuesBuffer[number];
- }
- else if(maxValuesBuffer[number] == max){
- if (index > maxIndexesBuffer[number])
- index = maxIndexesBuffer[number];
- }
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++)
- {
- if(src0[number] > max)
- {
- index = number;
- max = src0[number];
- }
- }
- target[0] = (uint32_t)index;
+ if (num_points > 0) {
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)src0;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
+
+ float max = src0[0];
+ float index = 0;
+ __m128 maxValues = _mm_set1_ps(max);
+ __m128 maxValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+ currentValues = _mm_loadu_ps(inputPtr);
+ inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+ compareResults = _mm_cmpgt_ps(currentValues, maxValues);
+ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
+ _mm_andnot_ps(compareResults, maxValuesIndex));
+ maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
+ _mm_andnot_ps(compareResults, maxValues));
}
+
+ // Calculate the largest value from the remaining 4 points
+ _mm_store_ps(maxValuesBuffer, maxValues);
+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for (number = 0; number < 4; number++) {
+ if (maxValuesBuffer[number] > max) {
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ } else if (maxValuesBuffer[number] == max) {
+ if (index > maxIndexesBuffer[number])
+ index = maxIndexesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ if (src0[number] > max) {
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (uint32_t)index;
+ }
}
#endif /*LV_HAVE_SSE*/
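/*
 * The plain-SSE kernel above cannot use _mm_blendv_ps (an SSE4.1 intrinsic), so it
 * falls back to the classic branchless select: (mask & a) | (~mask & b) keeps a in
 * lanes where the comparison mask is all ones and b where it is all zeros. A scalar
 * sketch of the same idiom, with made-up names, for illustration only:
 */
#include <stdint.h>

static inline uint32_t select_bits_sketch(uint32_t mask, uint32_t if_set, uint32_t if_clear)
{
    /* mask is expected to be either 0xFFFFFFFF or 0x00000000, as produced by a
     * SIMD compare; the expression then reduces to if_set or if_clear. */
    return (mask & if_set) | (~mask & if_clear);
}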
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_invsqrt_32f(float* cVector, const float* aVector, unsigned int num_points)
- * \endcode
+ * void volk_32f_invsqrt_32f(float* cVector, const float* aVector, unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li aVector: the input vector of floats.
#define INCLUDED_volk_32f_invsqrt_32f_a_H
#include <inttypes.h>
-#include <stdio.h>
#include <math.h>
+#include <stdio.h>
#include <string.h>
-static inline float
-Q_rsqrt(float number)
+static inline float Q_rsqrt(float number)
{
- float x2;
- const float threehalfs = 1.5F;
- union f32_to_i32 {
- int32_t i;
- float f;
- } u;
-
- x2 = number * 0.5F;
- u.f = number;
- u.i = 0x5f3759df - ( u.i >> 1 ); // what the fuck?
- u.f = u.f * ( threehalfs - ( x2 * u.f * u.f ) ); // 1st iteration
- //u.f = u.f * ( threehalfs - ( x2 * u.f * u.f ) ); // 2nd iteration, this can be removed
-
- return u.f;
+ float x2;
+ const float threehalfs = 1.5F;
+ union f32_to_i32 {
+ int32_t i;
+ float f;
+ } u;
+
+ x2 = number * 0.5F;
+ u.f = number;
+ u.i = 0x5f3759df - (u.i >> 1); // what the fuck?
+ u.f = u.f * (threehalfs - (x2 * u.f * u.f)); // 1st iteration
+ // u.f = u.f * ( threehalfs - ( x2 * u.f * u.f ) ); // 2nd iteration, this can be
+ // removed
+
+ return u.f;
}
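/*
 * Q_rsqrt is the well-known fast inverse square root: reinterpreting the float's bits
 * as an integer gives a crude initial guess, and one Newton-Raphson step
 * y = y * (1.5 - 0.5 * x * y * y) refines it. A small sketch comparing the result
 * against 1.0f / sqrtf(x); the test values are arbitrary and only illustrate the
 * accuracy of a single iteration (relative error on the order of 1e-3).
 */
#include <math.h>
#include <stdio.h>

static void q_rsqrt_accuracy_sketch(void)
{
    for (float x = 0.5f; x < 8.0f; x += 0.5f) {
        const float approx = Q_rsqrt(x); /* defined above */
        const float exact = 1.0f / sqrtf(x);
        printf("x=%4.1f approx=%f exact=%f rel.err=%e\n",
               x, approx, exact, (approx - exact) / exact);
    }
}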
#ifdef LV_HAVE_AVX
static inline void
volk_32f_invsqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- float* cPtr = cVector;
- const float* aPtr = aVector;
- __m256 aVal, cVal;
- for (; number < eighthPoints; number++) {
- aVal = _mm256_load_ps(aPtr);
- cVal = _mm256_rsqrt_ps(aVal);
- _mm256_store_ps(cPtr, cVal);
- aPtr += 8;
- cPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++)
- *cPtr++ = Q_rsqrt(*aPtr++);
-
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ __m256 aVal, cVal;
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ cVal = _mm256_rsqrt_ps(aVal);
+ _mm256_store_ps(cPtr, cVal);
+ aPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++)
+ *cPtr++ = Q_rsqrt(*aPtr++);
}
#endif /* LV_HAVE_AVX */
static inline void
volk_32f_invsqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- float* cPtr = cVector;
- const float* aPtr = aVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
- __m128 aVal, cVal;
- for(;number < quarterPoints; number++){
+ __m128 aVal, cVal;
+ for (; number < quarterPoints; number++) {
- aVal = _mm_load_ps(aPtr);
+ aVal = _mm_load_ps(aPtr);
- cVal = _mm_rsqrt_ps(aVal);
+ cVal = _mm_rsqrt_ps(aVal);
- _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 4;
- cPtr += 4;
- }
+ aPtr += 4;
+ cPtr += 4;
+ }
- number = quarterPoints * 4;
- for(;number < num_points; number++) {
- *cPtr++ = Q_rsqrt(*aPtr++);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = Q_rsqrt(*aPtr++);
+ }
}
#endif /* LV_HAVE_SSE */
static inline void
volk_32f_invsqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points)
{
- unsigned int number;
- const unsigned int quarter_points = num_points / 4;
-
- float* cPtr = cVector;
- const float* aPtr = aVector;
- float32x4_t a_val, c_val;
- for (number = 0; number < quarter_points; ++number) {
- a_val = vld1q_f32(aPtr);
- c_val = vrsqrteq_f32(a_val);
- vst1q_f32(cPtr, c_val);
- aPtr += 4;
- cPtr += 4;
- }
-
- for(number=quarter_points * 4;number < num_points; number++)
- *cPtr++ = Q_rsqrt(*aPtr++);
+ unsigned int number;
+ const unsigned int quarter_points = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ float32x4_t a_val, c_val;
+ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld1q_f32(aPtr);
+ c_val = vrsqrteq_f32(a_val);
+ vst1q_f32(cPtr, c_val);
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++)
+ *cPtr++ = Q_rsqrt(*aPtr++);
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_invsqrt_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
+static inline void volk_32f_invsqrt_32f_generic(float* cVector,
+ const float* aVector,
+ unsigned int num_points)
{
- float* cPtr = cVector;
- const float* aPtr = aVector;
- unsigned int number = 0;
- for(number = 0; number < num_points; number++) {
- *cPtr++ = Q_rsqrt(*aPtr++);
- }
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = Q_rsqrt(*aPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
static inline void
volk_32f_invsqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- float* cPtr = cVector;
- const float* aPtr = aVector;
- __m256 aVal, cVal;
- for (; number < eighthPoints; number++) {
- aVal = _mm256_loadu_ps(aPtr);
- cVal = _mm256_rsqrt_ps(aVal);
- _mm256_storeu_ps(cPtr, cVal);
- aPtr += 8;
- cPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++)
- *cPtr++ = Q_rsqrt(*aPtr++);
-
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ __m256 aVal, cVal;
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ cVal = _mm256_rsqrt_ps(aVal);
+ _mm256_storeu_ps(cPtr, cVal);
+ aPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++)
+ *cPtr++ = Q_rsqrt(*aPtr++);
}
#endif /* LV_HAVE_AVX */
#ifndef INCLUDED_volk_32f_log2_32f_a_H
#define INCLUDED_volk_32f_log2_32f_a_H
-#include <stdio.h>
-#include <stdlib.h>
#include <inttypes.h>
#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
#define LOG_POLY_DEGREE 6
// +-Inf -> +-127.0f in order to match the behaviour of the SIMD kernels
-static inline float log2f_non_ieee(float f) {
- float const result = log2f(f);
- return isinf(result) ? copysignf(127.0f, result) : result;
+static inline float log2f_non_ieee(float f)
+{
+ float const result = log2f(f);
+ return isinf(result) ? copysignf(127.0f, result) : result;
}
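/*
 * The helper above keeps the scalar tail consistent with the SIMD kernels, which
 * saturate infinities: log2f(0.0f) is -inf under IEEE semantics, while the
 * bit-twiddling kernels yield roughly -127 (the debiased exponent). A purely
 * illustrative check of that clamping:
 */
#include <assert.h>
#include <math.h>

static void log2_non_ieee_clamp_sketch(void)
{
    assert(log2f_non_ieee(0.0f) == -127.0f);             /* -inf is clamped to -127 */
    assert(fabsf(log2f_non_ieee(8.0f) - 3.0f) < 1e-6f);  /* finite values pass through */
}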
#ifdef LV_HAVE_GENERIC
static inline void
volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
- unsigned int number = 0;
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
- for(number = 0; number < num_points; number++)
- *bPtr++ = log2f_non_ieee(*aPtr++);
+ for (number = 0; number < num_points; number++)
+ *bPtr++ = log2f_non_ieee(*aPtr++);
}
#endif /* LV_HAVE_GENERIC */
#include <immintrin.h>
#define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
-#define POLY1_FMAAVX2(x, c0, c1) _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
-#define POLY2_FMAAVX2(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
-#define POLY3_FMAAVX2(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
-#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
-#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
-
-static inline void
-volk_32f_log2_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+#define POLY1_FMAAVX2(x, c0, c1) \
+ _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
+#define POLY2_FMAAVX2(x, c0, c1, c2) \
+ _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
+#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
+ _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
+#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
+ _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
+#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
+ _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
+
+static inline void volk_32f_log2_32f_a_avx2_fma(float* bVector,
+ const float* aVector,
+ unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- __m256 aVal, bVal, mantissa, frac, leadingOne;
- __m256i bias, exp;
+ __m256 aVal, bVal, mantissa, frac, leadingOne;
+ __m256i bias, exp;
- for(;number < eighthPoints; number++){
+ for (; number < eighthPoints; number++) {
- aVal = _mm256_load_ps(aPtr);
- bias = _mm256_set1_epi32(127);
- leadingOne = _mm256_set1_ps(1.0f);
- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
- bVal = _mm256_cvtepi32_ps(exp);
+ aVal = _mm256_load_ps(aPtr);
+ bias = _mm256_set1_epi32(127);
+ leadingOne = _mm256_set1_ps(1.0f);
+ exp = _mm256_sub_epi32(
+ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
+ _mm256_set1_epi32(0x7f800000)),
+ 23),
+ bias);
+ bVal = _mm256_cvtepi32_ps(exp);
- // Now to extract mantissa
- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
+ // Now to extract mantissa
+ frac = _mm256_or_ps(
+ leadingOne,
+ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
#if LOG_POLY_DEGREE == 6
- mantissa = POLY5_FMAAVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
+ mantissa = POLY5_FMAAVX2(frac,
+ 3.1157899f,
+ -3.3241990f,
+ 2.5988452f,
+ -1.2315303f,
+ 3.1821337e-1f,
+ -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
- mantissa = POLY4_FMAAVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+ mantissa = POLY4_FMAAVX2(frac,
+ 2.8882704548164776201f,
+ -2.52074962577807006663f,
+ 1.48116647521213171641f,
+ -0.465725644288844778798f,
+ 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
- mantissa = POLY3_FMAAVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+ mantissa = POLY3_FMAAVX2(frac,
+ 2.61761038894603480148f,
+ -1.75647175389045657003f,
+ 0.688243882994381274313f,
+ -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
- mantissa = POLY2_FMAAVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+ mantissa = POLY2_FMAAVX2(frac,
+ 2.28330284476918490682f,
+ -1.04913055217340124191f,
+ 0.204446009836232697516f);
#else
#error
#endif
- bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
- _mm256_store_ps(bPtr, bVal);
+ bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
+ _mm256_store_ps(bPtr, bVal);
- aPtr += 8;
- bPtr += 8;
- }
+ aPtr += 8;
+ bPtr += 8;
+ }
- number = eighthPoints * 8;
- volk_32f_log2_32f_generic(bPtr, aPtr, num_points-number);
+ number = eighthPoints * 8;
+ volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
#include <immintrin.h>
#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
-#define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
-#define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
-#define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
-#define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
-#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
+#define POLY1_AVX2(x, c0, c1) \
+ _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
+#define POLY2_AVX2(x, c0, c1, c2) \
+ _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
+#define POLY3_AVX2(x, c0, c1, c2, c3) \
+ _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
+#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
+ _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
+#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
+ _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
static inline void
volk_32f_log2_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- __m256 aVal, bVal, mantissa, frac, leadingOne;
- __m256i bias, exp;
+ __m256 aVal, bVal, mantissa, frac, leadingOne;
+ __m256i bias, exp;
- for(;number < eighthPoints; number++){
+ for (; number < eighthPoints; number++) {
- aVal = _mm256_load_ps(aPtr);
- bias = _mm256_set1_epi32(127);
- leadingOne = _mm256_set1_ps(1.0f);
- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
- bVal = _mm256_cvtepi32_ps(exp);
+ aVal = _mm256_load_ps(aPtr);
+ bias = _mm256_set1_epi32(127);
+ leadingOne = _mm256_set1_ps(1.0f);
+ exp = _mm256_sub_epi32(
+ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
+ _mm256_set1_epi32(0x7f800000)),
+ 23),
+ bias);
+ bVal = _mm256_cvtepi32_ps(exp);
- // Now to extract mantissa
- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
+ // Now to extract mantissa
+ frac = _mm256_or_ps(
+ leadingOne,
+ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
#if LOG_POLY_DEGREE == 6
- mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
+ mantissa = POLY5_AVX2(frac,
+ 3.1157899f,
+ -3.3241990f,
+ 2.5988452f,
+ -1.2315303f,
+ 3.1821337e-1f,
+ -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
- mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+ mantissa = POLY4_AVX2(frac,
+ 2.8882704548164776201f,
+ -2.52074962577807006663f,
+ 1.48116647521213171641f,
+ -0.465725644288844778798f,
+ 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
- mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+ mantissa = POLY3_AVX2(frac,
+ 2.61761038894603480148f,
+ -1.75647175389045657003f,
+ 0.688243882994381274313f,
+ -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
- mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+ mantissa = POLY2_AVX2(frac,
+ 2.28330284476918490682f,
+ -1.04913055217340124191f,
+ 0.204446009836232697516f);
#else
#error
#endif
- bVal = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
- _mm256_store_ps(bPtr, bVal);
+ bVal =
+ _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
+ _mm256_store_ps(bPtr, bVal);
- aPtr += 8;
- bPtr += 8;
- }
+ aPtr += 8;
+ bPtr += 8;
+ }
- number = eighthPoints * 8;
- volk_32f_log2_32f_generic(bPtr, aPtr, num_points-number);
+ number = eighthPoints * 8;
+ volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}
#endif /* LV_HAVE_AVX2 for aligned */
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
-#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
-#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
-#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
+#define POLY3(x, c0, c1, c2, c3) \
+ _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
+#define POLY4(x, c0, c1, c2, c3, c4) \
+ _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) \
+ _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
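/*
 * The POLY0..POLY5 macros expand to a Horner-scheme evaluation of the minimax
 * polynomial: POLY5(x, c0, ..., c5) computes
 * c0 + x*(c1 + x*(c2 + x*(c3 + x*(c4 + x*c5)))), one multiply-add per degree.
 * A scalar restatement, for illustration only:
 */
static inline float poly5_horner_sketch(float x, const float c[6])
{
    float acc = c[5];
    for (int i = 4; i >= 0; --i)
        acc = acc * x + c[i]; /* same nesting the macros produce */
    return acc;
}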
static inline void
volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- __m128 aVal, bVal, mantissa, frac, leadingOne;
- __m128i bias, exp;
+ __m128 aVal, bVal, mantissa, frac, leadingOne;
+ __m128i bias, exp;
- for(;number < quarterPoints; number++){
+ for (; number < quarterPoints; number++) {
- aVal = _mm_load_ps(aPtr);
- bias = _mm_set1_epi32(127);
- leadingOne = _mm_set1_ps(1.0f);
- exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
- bVal = _mm_cvtepi32_ps(exp);
+ aVal = _mm_load_ps(aPtr);
+ bias = _mm_set1_epi32(127);
+ leadingOne = _mm_set1_ps(1.0f);
+ exp = _mm_sub_epi32(
+ _mm_srli_epi32(
+ _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
+ bias);
+ bVal = _mm_cvtepi32_ps(exp);
- // Now to extract mantissa
- frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
+ // Now to extract mantissa
+ frac = _mm_or_ps(leadingOne,
+ _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
#if LOG_POLY_DEGREE == 6
- mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
+ mantissa = POLY5(frac,
+ 3.1157899f,
+ -3.3241990f,
+ 2.5988452f,
+ -1.2315303f,
+ 3.1821337e-1f,
+ -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
- mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+ mantissa = POLY4(frac,
+ 2.8882704548164776201f,
+ -2.52074962577807006663f,
+ 1.48116647521213171641f,
+ -0.465725644288844778798f,
+ 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
- mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+ mantissa = POLY3(frac,
+ 2.61761038894603480148f,
+ -1.75647175389045657003f,
+ 0.688243882994381274313f,
+ -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
- mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+ mantissa = POLY2(frac,
+ 2.28330284476918490682f,
+ -1.04913055217340124191f,
+ 0.204446009836232697516f);
#else
#error
#endif
- bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
- _mm_store_ps(bPtr, bVal);
+ bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
+ _mm_store_ps(bPtr, bVal);
- aPtr += 4;
- bPtr += 4;
- }
+ aPtr += 4;
+ bPtr += 4;
+ }
- number = quarterPoints * 4;
- volk_32f_log2_32f_generic(bPtr, aPtr, num_points-number);
+ number = quarterPoints * 4;
+ volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}
#endif /* LV_HAVE_SSE4_1 for aligned */
#include <arm_neon.h>
/* these macros allow us to embed logs in other kernels */
-#define VLOG2Q_NEON_PREAMBLE() \
- int32x4_t one = vdupq_n_s32(0x000800000); \
- /* minimax polynomial */ \
- float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \
- float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \
- float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \
- float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \
- float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \
- float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \
- float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \
- int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \
- int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \
- int32x4_t exp_bias = vdupq_n_s32(127);
-
-
-#define VLOG2Q_NEON_F32(log2_approx, aval) \
- int32x4_t exponent_i = vandq_s32(aval, exp_mask); \
- int32x4_t significand_i = vandq_s32(aval, sig_mask); \
- exponent_i = vshrq_n_s32(exponent_i, 23); \
- \
- /* extract the exponent and significand \
- we can treat this as fixed point to save ~9% on the \
- conversion + float add */ \
- significand_i = vorrq_s32(one, significand_i); \
- float32x4_t significand_f = vcvtq_n_f32_s32(significand_i,23); \
- /* debias the exponent and convert to float */ \
- exponent_i = vsubq_s32(exponent_i, exp_bias); \
- float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \
- \
- /* put the significand through a polynomial fit of log2(x) [1,2] \
- add the result to the exponent */ \
- log2_approx = vaddq_f32(exponent_f, p0); /* p0 */ \
- float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */ \
- log2_approx = vaddq_f32(log2_approx, tmp1); \
- float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */ \
- tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */ \
- log2_approx = vaddq_f32(log2_approx, tmp1); \
- \
- float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */ \
- tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */ \
- log2_approx = vaddq_f32(log2_approx, tmp1); \
- float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */ \
- tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */ \
- log2_approx = vaddq_f32(log2_approx, tmp1); \
- float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */ \
- tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */ \
- log2_approx = vaddq_f32(log2_approx, tmp1); \
- float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */ \
- tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */ \
- log2_approx = vaddq_f32(log2_approx, tmp1);
+#define VLOG2Q_NEON_PREAMBLE() \
+ int32x4_t one = vdupq_n_s32(0x000800000); \
+ /* minimax polynomial */ \
+ float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \
+ float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \
+ float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \
+ float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \
+ float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \
+ float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \
+ float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \
+ int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \
+ int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \
+ int32x4_t exp_bias = vdupq_n_s32(127);
+
+
+#define VLOG2Q_NEON_F32(log2_approx, aval) \
+ int32x4_t exponent_i = vandq_s32(aval, exp_mask); \
+ int32x4_t significand_i = vandq_s32(aval, sig_mask); \
+ exponent_i = vshrq_n_s32(exponent_i, 23); \
+ \
+ /* extract the exponent and significand \
+ we can treat this as fixed point to save ~9% on the \
+ conversion + float add */ \
+ significand_i = vorrq_s32(one, significand_i); \
+ float32x4_t significand_f = vcvtq_n_f32_s32(significand_i, 23); \
+ /* debias the exponent and convert to float */ \
+ exponent_i = vsubq_s32(exponent_i, exp_bias); \
+ float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \
+ \
+ /* put the significand through a polynomial fit of log2(x) [1,2] \
+ add the result to the exponent */ \
+ log2_approx = vaddq_f32(exponent_f, p0); /* p0 */ \
+ float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1); \
+ float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */ \
+ tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1); \
+ \
+ float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */ \
+ tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1); \
+ float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */ \
+ tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1); \
+ float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */ \
+ tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1); \
+ float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */ \
+ tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1);
static inline void
volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
- unsigned int number;
- const unsigned int quarterPoints = num_points / 4;
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number;
+ const unsigned int quarterPoints = num_points / 4;
- int32x4_t aval;
- float32x4_t log2_approx;
+ int32x4_t aval;
+ float32x4_t log2_approx;
- VLOG2Q_NEON_PREAMBLE()
- // lms
- //p0 = vdupq_n_f32(-1.649132280361871);
- //p1 = vdupq_n_f32(1.995047138579499);
- //p2 = vdupq_n_f32(-0.336914839219728);
+ VLOG2Q_NEON_PREAMBLE()
+ // lms
+ // p0 = vdupq_n_f32(-1.649132280361871);
+ // p1 = vdupq_n_f32(1.995047138579499);
+ // p2 = vdupq_n_f32(-0.336914839219728);
- // keep in mind a single precision float is represented as
- // (-1)^sign * 2^exp * 1.significand, so the log2 is
- // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23)
- for(number = 0; number < quarterPoints; ++number){
- // load float in to an int register without conversion
- aval = vld1q_s32((int*)aPtr);
+ // keep in mind a single precision float is represented as
+ // (-1)^sign * 2^exp * 1.significand, so the log2 is
+    // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23))
+ for (number = 0; number < quarterPoints; ++number) {
+ // load float in to an int register without conversion
+ aval = vld1q_s32((int*)aPtr);
- VLOG2Q_NEON_F32(log2_approx, aval)
+ VLOG2Q_NEON_F32(log2_approx, aval)
- vst1q_f32(bPtr, log2_approx);
+ vst1q_f32(bPtr, log2_approx);
- aPtr += 4;
- bPtr += 4;
- }
+ aPtr += 4;
+ bPtr += 4;
+ }
- number = quarterPoints * 4;
- volk_32f_log2_32f_generic(bPtr, aPtr, num_points-number);
+ number = quarterPoints * 4;
+ volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}
#endif /* LV_HAVE_NEON */
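/*
 * As the comment in the NEON kernel notes, a normalized single-precision float is
 * (-1)^sign * 2^exp * 1.mantissa, so log2(x) = (exp - 127) + log2(1 + mantissa / 2^23);
 * the kernels extract the exponent with bit masks and approximate the mantissa term
 * with a polynomial. A scalar sketch of the exact decomposition (no polynomial),
 * with a hypothetical helper name, for illustration only:
 */
#include <math.h>
#include <stdint.h>
#include <string.h>

static inline float log2_via_bits_sketch(float x)
{
    /* valid for finite, positive, normalized x */
    uint32_t bits;
    memcpy(&bits, &x, sizeof(bits)); /* type-pun without aliasing violations */
    const int32_t exponent = (int32_t)((bits & 0x7f800000u) >> 23) - 127;
    const float significand = 1.0f + (float)(bits & 0x007fffffu) / 8388608.0f; /* 2^23 */
    return (float)exponent + log2f(significand);
}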
static inline void
volk_32f_log2_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- float const result = log2f(*aPtr++);
- *bPtr++ = isinf(result) ? -127.0f : result;
- }
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ float const result = log2f(*aPtr++);
+ *bPtr++ = isinf(result) ? -127.0f : result;
+ }
}
#endif /* LV_HAVE_GENERIC */
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
-#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
-#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
-#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
+#define POLY3(x, c0, c1, c2, c3) \
+ _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
+#define POLY4(x, c0, c1, c2, c3, c4) \
+ _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) \
+ _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
static inline void
volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- __m128 aVal, bVal, mantissa, frac, leadingOne;
- __m128i bias, exp;
+ __m128 aVal, bVal, mantissa, frac, leadingOne;
+ __m128i bias, exp;
- for(;number < quarterPoints; number++){
+ for (; number < quarterPoints; number++) {
- aVal = _mm_loadu_ps(aPtr);
- bias = _mm_set1_epi32(127);
- leadingOne = _mm_set1_ps(1.0f);
- exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
- bVal = _mm_cvtepi32_ps(exp);
+ aVal = _mm_loadu_ps(aPtr);
+ bias = _mm_set1_epi32(127);
+ leadingOne = _mm_set1_ps(1.0f);
+ exp = _mm_sub_epi32(
+ _mm_srli_epi32(
+ _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
+ bias);
+ bVal = _mm_cvtepi32_ps(exp);
- // Now to extract mantissa
- frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
+ // Now to extract mantissa
+ frac = _mm_or_ps(leadingOne,
+ _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
#if LOG_POLY_DEGREE == 6
- mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
+ mantissa = POLY5(frac,
+ 3.1157899f,
+ -3.3241990f,
+ 2.5988452f,
+ -1.2315303f,
+ 3.1821337e-1f,
+ -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
- mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+ mantissa = POLY4(frac,
+ 2.8882704548164776201f,
+ -2.52074962577807006663f,
+ 1.48116647521213171641f,
+ -0.465725644288844778798f,
+ 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
- mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+ mantissa = POLY3(frac,
+ 2.61761038894603480148f,
+ -1.75647175389045657003f,
+ 0.688243882994381274313f,
+ -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
- mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+ mantissa = POLY2(frac,
+ 2.28330284476918490682f,
+ -1.04913055217340124191f,
+ 0.204446009836232697516f);
#else
#error
#endif
- bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
- _mm_storeu_ps(bPtr, bVal);
+ bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
+ _mm_storeu_ps(bPtr, bVal);
- aPtr += 4;
- bPtr += 4;
- }
+ aPtr += 4;
+ bPtr += 4;
+ }
- number = quarterPoints * 4;
- volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points-number);
+ number = quarterPoints * 4;
+ volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
}
#endif /* LV_HAVE_SSE4_1 for unaligned */
#include <immintrin.h>
#define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
-#define POLY1_FMAAVX2(x, c0, c1) _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
-#define POLY2_FMAAVX2(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
-#define POLY3_FMAAVX2(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
-#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
-#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
-
-static inline void
-volk_32f_log2_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+#define POLY1_FMAAVX2(x, c0, c1) \
+ _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
+#define POLY2_FMAAVX2(x, c0, c1, c2) \
+ _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
+#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
+ _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
+#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
+ _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
+#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
+ _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
+
+static inline void volk_32f_log2_32f_u_avx2_fma(float* bVector,
+ const float* aVector,
+ unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- __m256 aVal, bVal, mantissa, frac, leadingOne;
- __m256i bias, exp;
+ __m256 aVal, bVal, mantissa, frac, leadingOne;
+ __m256i bias, exp;
- for(;number < eighthPoints; number++){
+ for (; number < eighthPoints; number++) {
- aVal = _mm256_loadu_ps(aPtr);
- bias = _mm256_set1_epi32(127);
- leadingOne = _mm256_set1_ps(1.0f);
- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
- bVal = _mm256_cvtepi32_ps(exp);
+ aVal = _mm256_loadu_ps(aPtr);
+ bias = _mm256_set1_epi32(127);
+ leadingOne = _mm256_set1_ps(1.0f);
+ exp = _mm256_sub_epi32(
+ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
+ _mm256_set1_epi32(0x7f800000)),
+ 23),
+ bias);
+ bVal = _mm256_cvtepi32_ps(exp);
- // Now to extract mantissa
- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
+ // Now to extract mantissa
+ frac = _mm256_or_ps(
+ leadingOne,
+ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
#if LOG_POLY_DEGREE == 6
- mantissa = POLY5_FMAAVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
+ mantissa = POLY5_FMAAVX2(frac,
+ 3.1157899f,
+ -3.3241990f,
+ 2.5988452f,
+ -1.2315303f,
+ 3.1821337e-1f,
+ -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
- mantissa = POLY4_FMAAVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+ mantissa = POLY4_FMAAVX2(frac,
+ 2.8882704548164776201f,
+ -2.52074962577807006663f,
+ 1.48116647521213171641f,
+ -0.465725644288844778798f,
+ 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
- mantissa = POLY3_FMAAVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+ mantissa = POLY3_FMAAVX2(frac,
+ 2.61761038894603480148f,
+ -1.75647175389045657003f,
+ 0.688243882994381274313f,
+ -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
- mantissa = POLY2_FMAAVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+ mantissa = POLY2_FMAAVX2(frac,
+ 2.28330284476918490682f,
+ -1.04913055217340124191f,
+ 0.204446009836232697516f);
#else
#error
#endif
- bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
- _mm256_storeu_ps(bPtr, bVal);
+ bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
+ _mm256_storeu_ps(bPtr, bVal);
- aPtr += 8;
- bPtr += 8;
- }
+ aPtr += 8;
+ bPtr += 8;
+ }
- number = eighthPoints * 8;
- volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points-number);
+ number = eighthPoints * 8;
+ volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
#include <immintrin.h>
#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
-#define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
-#define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
-#define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
-#define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
-#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
+#define POLY1_AVX2(x, c0, c1) \
+ _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
+#define POLY2_AVX2(x, c0, c1, c2) \
+ _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
+#define POLY3_AVX2(x, c0, c1, c2, c3) \
+ _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
+#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
+ _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
+#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
+ _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
static inline void
volk_32f_log2_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- __m256 aVal, bVal, mantissa, frac, leadingOne;
- __m256i bias, exp;
+ __m256 aVal, bVal, mantissa, frac, leadingOne;
+ __m256i bias, exp;
- for(;number < eighthPoints; number++){
+ for (; number < eighthPoints; number++) {
- aVal = _mm256_loadu_ps(aPtr);
- bias = _mm256_set1_epi32(127);
- leadingOne = _mm256_set1_ps(1.0f);
- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
- bVal = _mm256_cvtepi32_ps(exp);
+ aVal = _mm256_loadu_ps(aPtr);
+ bias = _mm256_set1_epi32(127);
+ leadingOne = _mm256_set1_ps(1.0f);
+ exp = _mm256_sub_epi32(
+ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
+ _mm256_set1_epi32(0x7f800000)),
+ 23),
+ bias);
+ bVal = _mm256_cvtepi32_ps(exp);
- // Now to extract mantissa
- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
+ // Now to extract mantissa
+ frac = _mm256_or_ps(
+ leadingOne,
+ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
#if LOG_POLY_DEGREE == 6
- mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
+ mantissa = POLY5_AVX2(frac,
+ 3.1157899f,
+ -3.3241990f,
+ 2.5988452f,
+ -1.2315303f,
+ 3.1821337e-1f,
+ -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
- mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+ mantissa = POLY4_AVX2(frac,
+ 2.8882704548164776201f,
+ -2.52074962577807006663f,
+ 1.48116647521213171641f,
+ -0.465725644288844778798f,
+ 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
- mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+ mantissa = POLY3_AVX2(frac,
+ 2.61761038894603480148f,
+ -1.75647175389045657003f,
+ 0.688243882994381274313f,
+ -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
- mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+ mantissa = POLY2_AVX2(frac,
+ 2.28330284476918490682f,
+ -1.04913055217340124191f,
+ 0.204446009836232697516f);
#else
#error
#endif
- bVal = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
- _mm256_storeu_ps(bPtr, bVal);
+ bVal =
+ _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
+ _mm256_storeu_ps(bPtr, bVal);
- aPtr += 8;
- bPtr += 8;
- }
+ aPtr += 8;
+ bPtr += 8;
+ }
- number = eighthPoints * 8;
- volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points-number);
+ number = eighthPoints * 8;
+ volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
}
#endif /* LV_HAVE_AVX2 for unaligned */
* Boston, MA 02110-1301, USA.
*/
-#include <stdio.h>
-#include <math.h>
#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
#ifndef INCLUDED_volk_32f_null_32f_a_H
#define INCLUDED_volk_32f_null_32f_a_H
static inline void
volk_32f_null_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
- unsigned int number;
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number;
- for(number = 0; number < num_points; number++){
- *bPtr++ = *aPtr++;
- }
+ for (number = 0; number < num_points; number++) {
+ *bPtr++ = *aPtr++;
+ }
}
#endif /* LV_HAVE_GENERIC */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_s32f_32f_fm_detect_32f(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points)
- * \endcode
+ * void volk_32f_s32f_32f_fm_detect_32f(float* outputVector, const float* inputVector,
+ *                                      const float bound, float* saveValue,
+ *                                      unsigned int num_points)
+ * \endcode
*
* \b Inputs
- * \li inputVector: The input vector containing phase data (must be on the interval (-bound, bound]).
- * \li bound: The interval that the input phase data is in, which is used to modulo the differentiation.
- * \li saveValue: A pointer to a float which contains the phase value of the sample before the first input sample.
- * \li num_points The number of data points.
+ * \li inputVector: The input vector containing phase data (must be on the interval
+ *     (-bound, bound]).
+ * \li bound: The interval that the input phase data is in, which is used to modulo
+ *     the differentiation.
+ * \li saveValue: A pointer to a float which contains the phase value of the sample
+ *     before the first input sample.
+ * \li num_points: The number of data points.
*
* \b Outputs
* \li outputVector: The vector where the results will be stored.
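 *
 * A minimal scalar sketch of the documented behaviour (names illustrative, not part
 * of the library): each output is the difference between consecutive phase samples,
 * with the first difference taken against *saveValue, wrapped back into
 * (-bound, bound] by adding or subtracting 2*bound, and the last input stored back
 * into *saveValue.
 * \code
 * static void fm_detect_scalar_sketch(float* out, const float* in, float bound,
 *                                     float* save_value, unsigned int num_points)
 * {
 *     float prev = *save_value;
 *     for (unsigned int n = 0; n < num_points; n++) {
 *         float d = in[n] - prev;
 *         if (d > bound)
 *             d -= 2.0f * bound;
 *         if (d < -bound)
 *             d += 2.0f * bound;
 *         out[n] = d;
 *         prev = in[n];
 *     }
 *     if (num_points > 0)
 *         *save_value = in[num_points - 1];
 * }
 * \endcode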
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void volk_32f_s32f_32f_fm_detect_32f_a_avx(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
- if (num_points < 1) {
- return;
- }
- unsigned int number = 1;
- unsigned int j = 0;
- // num_points-1 keeps Fedora 7's gcc from crashing...
- // num_points won't work. :(
- const unsigned int eighthPoints = (num_points-1) / 8;
-
- float* outPtr = outputVector;
- const float* inPtr = inputVector;
- __m256 upperBound = _mm256_set1_ps(bound);
- __m256 lowerBound = _mm256_set1_ps(-bound);
- __m256 next3old1;
- __m256 next4;
- __m256 boundAdjust;
- __m256 posBoundAdjust = _mm256_set1_ps(-2*bound); // Subtract when we're above.
- __m256 negBoundAdjust = _mm256_set1_ps(2*bound); // Add when we're below.
- // Do the first 8 by hand since we're going in from the saveValue:
- *outPtr = *inPtr - *saveValue;
- if (*outPtr > bound) *outPtr -= 2*bound;
- if (*outPtr < -bound) *outPtr += 2*bound;
- inPtr++;
- outPtr++;
- for (j = 1; j < ( (8 < num_points) ? 8 : num_points); j++) {
- *outPtr = *(inPtr) - *(inPtr-1);
- if (*outPtr > bound) *outPtr -= 2*bound;
- if (*outPtr < -bound) *outPtr += 2*bound;
- inPtr++;
- outPtr++;
- }
-
- for (; number < eighthPoints; number++) {
- // Load data
- next3old1 = _mm256_loadu_ps((float*) (inPtr-1));
- next4 = _mm256_load_ps(inPtr);
- inPtr += 8;
- // Subtract and store:
- next3old1 = _mm256_sub_ps(next4, next3old1);
- // Bound:
- boundAdjust = _mm256_cmp_ps(next3old1, upperBound, _CMP_GT_OS);
- boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust);
- next4 = _mm256_cmp_ps(next3old1, lowerBound, _CMP_LT_OS);
- next4 = _mm256_and_ps(next4, negBoundAdjust);
- boundAdjust = _mm256_or_ps(next4, boundAdjust);
- // Make sure we're in the bounding interval:
- next3old1 = _mm256_add_ps(next3old1, boundAdjust);
- _mm256_store_ps(outPtr,next3old1); // Store the results back into the output
- outPtr += 8;
- }
-
- for (number = (8 > (eighthPoints*8) ? 8 : (8 * eighthPoints)); number < num_points; number++) {
- *outPtr = *(inPtr) - *(inPtr-1);
- if (*outPtr > bound) *outPtr -= 2*bound;
- if (*outPtr < -bound) *outPtr += 2*bound;
+static inline void volk_32f_s32f_32f_fm_detect_32f_a_avx(float* outputVector,
+ const float* inputVector,
+ const float bound,
+ float* saveValue,
+ unsigned int num_points)
+{
+ if (num_points < 1) {
+ return;
+ }
+ unsigned int number = 1;
+ unsigned int j = 0;
+ // num_points-1 keeps Fedora 7's gcc from crashing...
+ // num_points won't work. :(
+ const unsigned int eighthPoints = (num_points - 1) / 8;
+
+ float* outPtr = outputVector;
+ const float* inPtr = inputVector;
+ __m256 upperBound = _mm256_set1_ps(bound);
+ __m256 lowerBound = _mm256_set1_ps(-bound);
+ __m256 next3old1;
+ __m256 next4;
+ __m256 boundAdjust;
+ __m256 posBoundAdjust = _mm256_set1_ps(-2 * bound); // Subtract when we're above.
+ __m256 negBoundAdjust = _mm256_set1_ps(2 * bound); // Add when we're below.
+ // Do the first 8 by hand since we're going in from the saveValue:
+ *outPtr = *inPtr - *saveValue;
+ if (*outPtr > bound)
+ *outPtr -= 2 * bound;
+ if (*outPtr < -bound)
+ *outPtr += 2 * bound;
inPtr++;
outPtr++;
- }
-
- *saveValue = inputVector[num_points-1];
+ for (j = 1; j < ((8 < num_points) ? 8 : num_points); j++) {
+ *outPtr = *(inPtr) - *(inPtr - 1);
+ if (*outPtr > bound)
+ *outPtr -= 2 * bound;
+ if (*outPtr < -bound)
+ *outPtr += 2 * bound;
+ inPtr++;
+ outPtr++;
+ }
+
+ for (; number < eighthPoints; number++) {
+ // Load data
+ next3old1 = _mm256_loadu_ps((float*)(inPtr - 1));
+ next4 = _mm256_load_ps(inPtr);
+ inPtr += 8;
+ // Subtract and store:
+ next3old1 = _mm256_sub_ps(next4, next3old1);
+ // Bound:
+ boundAdjust = _mm256_cmp_ps(next3old1, upperBound, _CMP_GT_OS);
+ boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust);
+ next4 = _mm256_cmp_ps(next3old1, lowerBound, _CMP_LT_OS);
+ next4 = _mm256_and_ps(next4, negBoundAdjust);
+ boundAdjust = _mm256_or_ps(next4, boundAdjust);
+ // Make sure we're in the bounding interval:
+ next3old1 = _mm256_add_ps(next3old1, boundAdjust);
+ _mm256_store_ps(outPtr, next3old1); // Store the results back into the output
+ outPtr += 8;
+ }
+
+ for (number = (8 > (eighthPoints * 8) ? 8 : (8 * eighthPoints)); number < num_points;
+ number++) {
+ *outPtr = *(inPtr) - *(inPtr - 1);
+ if (*outPtr > bound)
+ *outPtr -= 2 * bound;
+ if (*outPtr < -bound)
+ *outPtr += 2 * bound;
+ inPtr++;
+ outPtr++;
+ }
+
+ *saveValue = inputVector[num_points - 1];
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
- if (num_points < 1) {
- return;
- }
- unsigned int number = 1;
- unsigned int j = 0;
- // num_points-1 keeps Fedora 7's gcc from crashing...
- // num_points won't work. :(
- const unsigned int quarterPoints = (num_points-1) / 4;
-
- float* outPtr = outputVector;
- const float* inPtr = inputVector;
- __m128 upperBound = _mm_set_ps1(bound);
- __m128 lowerBound = _mm_set_ps1(-bound);
- __m128 next3old1;
- __m128 next4;
- __m128 boundAdjust;
- __m128 posBoundAdjust = _mm_set_ps1(-2*bound); // Subtract when we're above.
- __m128 negBoundAdjust = _mm_set_ps1(2*bound); // Add when we're below.
- // Do the first 4 by hand since we're going in from the saveValue:
- *outPtr = *inPtr - *saveValue;
- if (*outPtr > bound) *outPtr -= 2*bound;
- if (*outPtr < -bound) *outPtr += 2*bound;
- inPtr++;
- outPtr++;
- for (j = 1; j < ( (4 < num_points) ? 4 : num_points); j++) {
- *outPtr = *(inPtr) - *(inPtr-1);
- if (*outPtr > bound) *outPtr -= 2*bound;
- if (*outPtr < -bound) *outPtr += 2*bound;
- inPtr++;
- outPtr++;
- }
-
- for (; number < quarterPoints; number++) {
- // Load data
- next3old1 = _mm_loadu_ps((float*) (inPtr-1));
- next4 = _mm_load_ps(inPtr);
- inPtr += 4;
- // Subtract and store:
- next3old1 = _mm_sub_ps(next4, next3old1);
- // Bound:
- boundAdjust = _mm_cmpgt_ps(next3old1, upperBound);
- boundAdjust = _mm_and_ps(boundAdjust, posBoundAdjust);
- next4 = _mm_cmplt_ps(next3old1, lowerBound);
- next4 = _mm_and_ps(next4, negBoundAdjust);
- boundAdjust = _mm_or_ps(next4, boundAdjust);
- // Make sure we're in the bounding interval:
- next3old1 = _mm_add_ps(next3old1, boundAdjust);
- _mm_store_ps(outPtr,next3old1); // Store the results back into the output
- outPtr += 4;
- }
-
- for (number = (4 > (quarterPoints*4) ? 4 : (4 * quarterPoints)); number < num_points; number++) {
- *outPtr = *(inPtr) - *(inPtr-1);
- if (*outPtr > bound) *outPtr -= 2*bound;
- if (*outPtr < -bound) *outPtr += 2*bound;
+static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector,
+ const float* inputVector,
+ const float bound,
+ float* saveValue,
+ unsigned int num_points)
+{
+ if (num_points < 1) {
+ return;
+ }
+ unsigned int number = 1;
+ unsigned int j = 0;
+ // num_points-1 keeps Fedora 7's gcc from crashing...
+ // num_points won't work. :(
+ const unsigned int quarterPoints = (num_points - 1) / 4;
+
+ float* outPtr = outputVector;
+ const float* inPtr = inputVector;
+ __m128 upperBound = _mm_set_ps1(bound);
+ __m128 lowerBound = _mm_set_ps1(-bound);
+ __m128 next3old1;
+ __m128 next4;
+ __m128 boundAdjust;
+ __m128 posBoundAdjust = _mm_set_ps1(-2 * bound); // Subtract when we're above.
+ __m128 negBoundAdjust = _mm_set_ps1(2 * bound); // Add when we're below.
+ // Do the first 4 by hand since we're going in from the saveValue:
+ *outPtr = *inPtr - *saveValue;
+ if (*outPtr > bound)
+ *outPtr -= 2 * bound;
+ if (*outPtr < -bound)
+ *outPtr += 2 * bound;
inPtr++;
outPtr++;
- }
-
- *saveValue = inputVector[num_points-1];
+ for (j = 1; j < ((4 < num_points) ? 4 : num_points); j++) {
+ *outPtr = *(inPtr) - *(inPtr - 1);
+ if (*outPtr > bound)
+ *outPtr -= 2 * bound;
+ if (*outPtr < -bound)
+ *outPtr += 2 * bound;
+ inPtr++;
+ outPtr++;
+ }
+
+ for (; number < quarterPoints; number++) {
+ // Load data
+ next3old1 = _mm_loadu_ps((float*)(inPtr - 1));
+ next4 = _mm_load_ps(inPtr);
+ inPtr += 4;
+ // Subtract and store:
+ next3old1 = _mm_sub_ps(next4, next3old1);
+ // Bound:
+ boundAdjust = _mm_cmpgt_ps(next3old1, upperBound);
+ boundAdjust = _mm_and_ps(boundAdjust, posBoundAdjust);
+ next4 = _mm_cmplt_ps(next3old1, lowerBound);
+ next4 = _mm_and_ps(next4, negBoundAdjust);
+ boundAdjust = _mm_or_ps(next4, boundAdjust);
+ // Make sure we're in the bounding interval:
+ next3old1 = _mm_add_ps(next3old1, boundAdjust);
+ _mm_store_ps(outPtr, next3old1); // Store the results back into the output
+ outPtr += 4;
+ }
+
+ for (number = (4 > (quarterPoints * 4) ? 4 : (4 * quarterPoints));
+ number < num_points;
+ number++) {
+ *outPtr = *(inPtr) - *(inPtr - 1);
+ if (*outPtr > bound)
+ *outPtr -= 2 * bound;
+ if (*outPtr < -bound)
+ *outPtr += 2 * bound;
+ inPtr++;
+ outPtr++;
+ }
+
+ *saveValue = inputVector[num_points - 1];
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
- if (num_points < 1) {
- return;
- }
- unsigned int number = 0;
- float* outPtr = outputVector;
- const float* inPtr = inputVector;
-
- // Do the first 1 by hand since we're going in from the saveValue:
- *outPtr = *inPtr - *saveValue;
- if (*outPtr > bound) *outPtr -= 2*bound;
- if (*outPtr < -bound) *outPtr += 2*bound;
- inPtr++;
- outPtr++;
-
- for (number = 1; number < num_points; number++) {
- *outPtr = *(inPtr) - *(inPtr-1);
- if (*outPtr > bound) *outPtr -= 2*bound;
- if (*outPtr < -bound) *outPtr += 2*bound;
+static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector,
+ const float* inputVector,
+ const float bound,
+ float* saveValue,
+ unsigned int num_points)
+{
+ if (num_points < 1) {
+ return;
+ }
+ unsigned int number = 0;
+ float* outPtr = outputVector;
+ const float* inPtr = inputVector;
+
+ // Do the first 1 by hand since we're going in from the saveValue:
+ *outPtr = *inPtr - *saveValue;
+ if (*outPtr > bound)
+ *outPtr -= 2 * bound;
+ if (*outPtr < -bound)
+ *outPtr += 2 * bound;
inPtr++;
outPtr++;
- }
- *saveValue = inputVector[num_points-1];
+ for (number = 1; number < num_points; number++) {
+ *outPtr = *(inPtr) - *(inPtr - 1);
+ if (*outPtr > bound)
+ *outPtr -= 2 * bound;
+ if (*outPtr < -bound)
+ *outPtr += 2 * bound;
+ inPtr++;
+ outPtr++;
+ }
+
+ *saveValue = inputVector[num_points - 1];
}
#endif /* LV_HAVE_GENERIC */
-
-
#endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H */
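The three aligned fm_detect variants above (AVX, SSE, and generic) share the contract spelled out in the dispatcher documentation: differentiate a phase stream, wrap each difference back into (-bound, bound], and carry the last input sample across calls through saveValue. A hedged usage sketch calling the generic kernel defined above; the fm_detect_block helper and the choice of bound are illustrative assumptions, not code from the patch:

/* Differentiate one block of phase data; *saved_phase carries state between blocks. */
static void fm_detect_block(float* freq_out,
                            const float* phase_in,
                            float* saved_phase,
                            unsigned int num_points)
{
    const float bound = 3.14159265358979f; /* phase samples assumed in (-pi, pi] */
    volk_32f_s32f_32f_fm_detect_32f_generic(
        freq_out, phase_in, bound, saved_phase, num_points);
}

Before the very first block, *saved_phase would typically be seeded with whatever phase preceded the stream (0.0f if none), per the saveValue description above.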
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void volk_32f_s32f_32f_fm_detect_32f_u_avx(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
- if (num_points < 1) {
- return;
- }
- unsigned int number = 1;
- unsigned int j = 0;
- // num_points-1 keeps Fedora 7's gcc from crashing...
- // num_points won't work. :(
- const unsigned int eighthPoints = (num_points-1) / 8;
-
- float* outPtr = outputVector;
- const float* inPtr = inputVector;
- __m256 upperBound = _mm256_set1_ps(bound);
- __m256 lowerBound = _mm256_set1_ps(-bound);
- __m256 next3old1;
- __m256 next4;
- __m256 boundAdjust;
- __m256 posBoundAdjust = _mm256_set1_ps(-2*bound); // Subtract when we're above.
- __m256 negBoundAdjust = _mm256_set1_ps(2*bound); // Add when we're below.
- // Do the first 8 by hand since we're going in from the saveValue:
- *outPtr = *inPtr - *saveValue;
- if (*outPtr > bound) *outPtr -= 2*bound;
- if (*outPtr < -bound) *outPtr += 2*bound;
- inPtr++;
- outPtr++;
- for (j = 1; j < ( (8 < num_points) ? 8 : num_points); j++) {
- *outPtr = *(inPtr) - *(inPtr-1);
- if (*outPtr > bound) *outPtr -= 2*bound;
- if (*outPtr < -bound) *outPtr += 2*bound;
+static inline void volk_32f_s32f_32f_fm_detect_32f_u_avx(float* outputVector,
+ const float* inputVector,
+ const float bound,
+ float* saveValue,
+ unsigned int num_points)
+{
+ if (num_points < 1) {
+ return;
+ }
+ unsigned int number = 1;
+ unsigned int j = 0;
+ // num_points-1 keeps Fedora 7's gcc from crashing...
+ // num_points won't work. :(
+ const unsigned int eighthPoints = (num_points - 1) / 8;
+
+ float* outPtr = outputVector;
+ const float* inPtr = inputVector;
+ __m256 upperBound = _mm256_set1_ps(bound);
+ __m256 lowerBound = _mm256_set1_ps(-bound);
+ __m256 next3old1;
+ __m256 next4;
+ __m256 boundAdjust;
+ __m256 posBoundAdjust = _mm256_set1_ps(-2 * bound); // Subtract when we're above.
+ __m256 negBoundAdjust = _mm256_set1_ps(2 * bound); // Add when we're below.
+ // Do the first 8 by hand since we're going in from the saveValue:
+ *outPtr = *inPtr - *saveValue;
+ if (*outPtr > bound)
+ *outPtr -= 2 * bound;
+ if (*outPtr < -bound)
+ *outPtr += 2 * bound;
inPtr++;
outPtr++;
- }
-
- for (; number < eighthPoints; number++) {
- // Load data
- next3old1 = _mm256_loadu_ps((float*) (inPtr-1));
- next4 = _mm256_loadu_ps(inPtr);
- inPtr += 8;
- // Subtract and store:
- next3old1 = _mm256_sub_ps(next4, next3old1);
- // Bound:
- boundAdjust = _mm256_cmp_ps(next3old1, upperBound, _CMP_GT_OS);
- boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust);
- next4 = _mm256_cmp_ps(next3old1, lowerBound, _CMP_LT_OS);
- next4 = _mm256_and_ps(next4, negBoundAdjust);
- boundAdjust = _mm256_or_ps(next4, boundAdjust);
- // Make sure we're in the bounding interval:
- next3old1 = _mm256_add_ps(next3old1, boundAdjust);
- _mm256_storeu_ps(outPtr,next3old1); // Store the results back into the output
- outPtr += 8;
- }
-
- for (number = (8 > (eighthPoints*8) ? 8 : (8 * eighthPoints)); number < num_points; number++) {
- *outPtr = *(inPtr) - *(inPtr-1);
- if (*outPtr > bound) *outPtr -= 2*bound;
- if (*outPtr < -bound) *outPtr += 2*bound;
- inPtr++;
- outPtr++;
- }
-
- *saveValue = inputVector[num_points-1];
+ for (j = 1; j < ((8 < num_points) ? 8 : num_points); j++) {
+ *outPtr = *(inPtr) - *(inPtr - 1);
+ if (*outPtr > bound)
+ *outPtr -= 2 * bound;
+ if (*outPtr < -bound)
+ *outPtr += 2 * bound;
+ inPtr++;
+ outPtr++;
+ }
+
+ for (; number < eighthPoints; number++) {
+ // Load data
+ next3old1 = _mm256_loadu_ps((float*)(inPtr - 1));
+ next4 = _mm256_loadu_ps(inPtr);
+ inPtr += 8;
+ // Subtract and store:
+ next3old1 = _mm256_sub_ps(next4, next3old1);
+ // Bound:
+ boundAdjust = _mm256_cmp_ps(next3old1, upperBound, _CMP_GT_OS);
+ boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust);
+ next4 = _mm256_cmp_ps(next3old1, lowerBound, _CMP_LT_OS);
+ next4 = _mm256_and_ps(next4, negBoundAdjust);
+ boundAdjust = _mm256_or_ps(next4, boundAdjust);
+ // Make sure we're in the bounding interval:
+ next3old1 = _mm256_add_ps(next3old1, boundAdjust);
+ _mm256_storeu_ps(outPtr, next3old1); // Store the results back into the output
+ outPtr += 8;
+ }
+
+ for (number = (8 > (eighthPoints * 8) ? 8 : (8 * eighthPoints)); number < num_points;
+ number++) {
+ *outPtr = *(inPtr) - *(inPtr - 1);
+ if (*outPtr > bound)
+ *outPtr -= 2 * bound;
+ if (*outPtr < -bound)
+ *outPtr += 2 * bound;
+ inPtr++;
+ outPtr++;
+ }
+
+ *saveValue = inputVector[num_points - 1];
}
#endif /* LV_HAVE_AVX */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_s32f_calc_spectral_noise_floor_32f(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points)
- * \endcode
+ * void volk_32f_s32f_calc_spectral_noise_floor_32f(float* noiseFloorAmplitude,
+ *     const float* realDataPoints, const float spectralExclusionValue,
+ *     const unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li realDataPoints: The input power spectrum.
- * \li spectralExclusionValue: The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20.
- * \li num_points: The number of data points.
+ * \li spectralExclusionValue: The number of dB above the noise floor that a data point
+ *     must be to be excluded from the noise floor calculation - default value is 20.
+ * \li num_points: The number of data points.
*
* \b Outputs
* \li noiseFloorAmplitude: The noise floor of the input spectrum, in dB.
#ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a_H
#define INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a_H
-#include <volk/volk_common.h>
#include <inttypes.h>
#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_AVX
#include <immintrin.h>
const float spectralExclusionValue,
const unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- const float* dataPointsPtr = realDataPoints;
- __VOLK_ATTR_ALIGNED(32) float avgPointsVector[8];
-
- __m256 dataPointsVal;
- __m256 avgPointsVal = _mm256_setzero_ps();
- // Calculate the sum (for mean) for all points
- for(; number < eighthPoints; number++){
-
- dataPointsVal = _mm256_load_ps(dataPointsPtr);
-
- dataPointsPtr += 8;
-
- avgPointsVal = _mm256_add_ps(avgPointsVal, dataPointsVal);
- }
-
- _mm256_store_ps(avgPointsVector, avgPointsVal);
-
- float sumMean = 0.0;
- sumMean += avgPointsVector[0];
- sumMean += avgPointsVector[1];
- sumMean += avgPointsVector[2];
- sumMean += avgPointsVector[3];
- sumMean += avgPointsVector[4];
- sumMean += avgPointsVector[5];
- sumMean += avgPointsVector[6];
- sumMean += avgPointsVector[7];
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- sumMean += realDataPoints[number];
- }
-
- // calculate the spectral mean
- // +20 because for the comparison below we only want to throw out bins
- // that are significantly higher (and would, thus, affect the mean more
- const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
-
- dataPointsPtr = realDataPoints; // Reset the dataPointsPtr
- __m256 vMeanAmplitudeVector = _mm256_set1_ps(meanAmplitude);
- __m256 vOnesVector = _mm256_set1_ps(1.0);
- __m256 vValidBinCount = _mm256_setzero_ps();
- avgPointsVal = _mm256_setzero_ps();
- __m256 compareMask;
- number = 0;
- // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude
- for(; number < eighthPoints; number++){
-
- dataPointsVal = _mm256_load_ps(dataPointsPtr);
-
- dataPointsPtr += 8;
-
- // Identify which items do not exceed the mean amplitude
- compareMask = _mm256_cmp_ps(dataPointsVal, vMeanAmplitudeVector, _CMP_LE_OQ);
-
- // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude
- avgPointsVal = _mm256_add_ps(avgPointsVal, _mm256_and_ps(compareMask, dataPointsVal));
-
- // Count the number of bins which do not exceed the mean amplitude
- vValidBinCount = _mm256_add_ps(vValidBinCount, _mm256_and_ps(compareMask, vOnesVector));
- }
-
- // Calculate the mean from the remaining data points
- _mm256_store_ps(avgPointsVector, avgPointsVal);
-
- sumMean = 0.0;
- sumMean += avgPointsVector[0];
- sumMean += avgPointsVector[1];
- sumMean += avgPointsVector[2];
- sumMean += avgPointsVector[3];
- sumMean += avgPointsVector[4];
- sumMean += avgPointsVector[5];
- sumMean += avgPointsVector[6];
- sumMean += avgPointsVector[7];
-
- // Calculate the number of valid bins from the remaining count
- __VOLK_ATTR_ALIGNED(32) float validBinCountVector[8];
- _mm256_store_ps(validBinCountVector, vValidBinCount);
-
- float validBinCount = 0;
- validBinCount += validBinCountVector[0];
- validBinCount += validBinCountVector[1];
- validBinCount += validBinCountVector[2];
- validBinCount += validBinCountVector[3];
- validBinCount += validBinCountVector[4];
- validBinCount += validBinCountVector[5];
- validBinCount += validBinCountVector[6];
- validBinCount += validBinCountVector[7];
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- if(realDataPoints[number] <= meanAmplitude){
- sumMean += realDataPoints[number];
- validBinCount += 1.0;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* dataPointsPtr = realDataPoints;
+ __VOLK_ATTR_ALIGNED(32) float avgPointsVector[8];
+
+ __m256 dataPointsVal;
+ __m256 avgPointsVal = _mm256_setzero_ps();
+ // Calculate the sum (for mean) for all points
+ for (; number < eighthPoints; number++) {
+
+ dataPointsVal = _mm256_load_ps(dataPointsPtr);
+
+ dataPointsPtr += 8;
+
+ avgPointsVal = _mm256_add_ps(avgPointsVal, dataPointsVal);
}
- }
- float localNoiseFloorAmplitude = 0;
- if(validBinCount > 0.0){
- localNoiseFloorAmplitude = sumMean / validBinCount;
- }
- else{
- localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal...
- }
+ _mm256_store_ps(avgPointsVector, avgPointsVal);
+
+ float sumMean = 0.0;
+ sumMean += avgPointsVector[0];
+ sumMean += avgPointsVector[1];
+ sumMean += avgPointsVector[2];
+ sumMean += avgPointsVector[3];
+ sumMean += avgPointsVector[4];
+ sumMean += avgPointsVector[5];
+ sumMean += avgPointsVector[6];
+ sumMean += avgPointsVector[7];
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ sumMean += realDataPoints[number];
+ }
+
+ // calculate the spectral mean
+ // +20 because for the comparison below we only want to throw out bins
+ // that are significantly higher (and would, thus, affect the mean more
+ const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
+
+ dataPointsPtr = realDataPoints; // Reset the dataPointsPtr
+ __m256 vMeanAmplitudeVector = _mm256_set1_ps(meanAmplitude);
+ __m256 vOnesVector = _mm256_set1_ps(1.0);
+ __m256 vValidBinCount = _mm256_setzero_ps();
+ avgPointsVal = _mm256_setzero_ps();
+ __m256 compareMask;
+ number = 0;
+ // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude
+ for (; number < eighthPoints; number++) {
+
+ dataPointsVal = _mm256_load_ps(dataPointsPtr);
+
+ dataPointsPtr += 8;
+
+ // Identify which items do not exceed the mean amplitude
+ compareMask = _mm256_cmp_ps(dataPointsVal, vMeanAmplitudeVector, _CMP_LE_OQ);
+
+ // Mask off the items that exceed the mean amplitude and add the avg Points that
+ // do not exceed the mean amplitude
+ avgPointsVal =
+ _mm256_add_ps(avgPointsVal, _mm256_and_ps(compareMask, dataPointsVal));
+
+ // Count the number of bins which do not exceed the mean amplitude
+ vValidBinCount =
+ _mm256_add_ps(vValidBinCount, _mm256_and_ps(compareMask, vOnesVector));
+ }
- *noiseFloorAmplitude = localNoiseFloorAmplitude;
+ // Calculate the mean from the remaining data points
+ _mm256_store_ps(avgPointsVector, avgPointsVal);
+
+ sumMean = 0.0;
+ sumMean += avgPointsVector[0];
+ sumMean += avgPointsVector[1];
+ sumMean += avgPointsVector[2];
+ sumMean += avgPointsVector[3];
+ sumMean += avgPointsVector[4];
+ sumMean += avgPointsVector[5];
+ sumMean += avgPointsVector[6];
+ sumMean += avgPointsVector[7];
+
+ // Calculate the number of valid bins from the remaining count
+ __VOLK_ATTR_ALIGNED(32) float validBinCountVector[8];
+ _mm256_store_ps(validBinCountVector, vValidBinCount);
+
+ float validBinCount = 0;
+ validBinCount += validBinCountVector[0];
+ validBinCount += validBinCountVector[1];
+ validBinCount += validBinCountVector[2];
+ validBinCount += validBinCountVector[3];
+ validBinCount += validBinCountVector[4];
+ validBinCount += validBinCountVector[5];
+ validBinCount += validBinCountVector[6];
+ validBinCount += validBinCountVector[7];
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ if (realDataPoints[number] <= meanAmplitude) {
+ sumMean += realDataPoints[number];
+ validBinCount += 1.0;
+ }
+ }
+
+ float localNoiseFloorAmplitude = 0;
+ if (validBinCount > 0.0) {
+ localNoiseFloorAmplitude = sumMean / validBinCount;
+ } else {
+ localNoiseFloorAmplitude =
+ meanAmplitude; // For the odd case that all the amplitudes are equal...
+ }
+
+ *noiseFloorAmplitude = localNoiseFloorAmplitude;
}
#endif /* LV_HAVE_AVX */
const float spectralExclusionValue,
const unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- const float* dataPointsPtr = realDataPoints;
- __VOLK_ATTR_ALIGNED(16) float avgPointsVector[4];
-
- __m128 dataPointsVal;
- __m128 avgPointsVal = _mm_setzero_ps();
- // Calculate the sum (for mean) for all points
- for(; number < quarterPoints; number++){
-
- dataPointsVal = _mm_load_ps(dataPointsPtr);
-
- dataPointsPtr += 4;
-
- avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal);
- }
-
- _mm_store_ps(avgPointsVector, avgPointsVal);
-
- float sumMean = 0.0;
- sumMean += avgPointsVector[0];
- sumMean += avgPointsVector[1];
- sumMean += avgPointsVector[2];
- sumMean += avgPointsVector[3];
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- sumMean += realDataPoints[number];
- }
-
- // calculate the spectral mean
- // +20 because for the comparison below we only want to throw out bins
- // that are significantly higher (and would, thus, affect the mean more
- const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
-
- dataPointsPtr = realDataPoints; // Reset the dataPointsPtr
- __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude);
- __m128 vOnesVector = _mm_set_ps1(1.0);
- __m128 vValidBinCount = _mm_setzero_ps();
- avgPointsVal = _mm_setzero_ps();
- __m128 compareMask;
- number = 0;
- // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude
- for(; number < quarterPoints; number++){
-
- dataPointsVal = _mm_load_ps(dataPointsPtr);
-
- dataPointsPtr += 4;
-
- // Identify which items do not exceed the mean amplitude
- compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector);
-
- // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude
- avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal));
-
- // Count the number of bins which do not exceed the mean amplitude
- vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector));
- }
-
- // Calculate the mean from the remaining data points
- _mm_store_ps(avgPointsVector, avgPointsVal);
-
- sumMean = 0.0;
- sumMean += avgPointsVector[0];
- sumMean += avgPointsVector[1];
- sumMean += avgPointsVector[2];
- sumMean += avgPointsVector[3];
-
- // Calculate the number of valid bins from the remaining count
- __VOLK_ATTR_ALIGNED(16) float validBinCountVector[4];
- _mm_store_ps(validBinCountVector, vValidBinCount);
-
- float validBinCount = 0;
- validBinCount += validBinCountVector[0];
- validBinCount += validBinCountVector[1];
- validBinCount += validBinCountVector[2];
- validBinCount += validBinCountVector[3];
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- if(realDataPoints[number] <= meanAmplitude){
- sumMean += realDataPoints[number];
- validBinCount += 1.0;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* dataPointsPtr = realDataPoints;
+ __VOLK_ATTR_ALIGNED(16) float avgPointsVector[4];
+
+ __m128 dataPointsVal;
+ __m128 avgPointsVal = _mm_setzero_ps();
+ // Calculate the sum (for mean) for all points
+ for (; number < quarterPoints; number++) {
+
+ dataPointsVal = _mm_load_ps(dataPointsPtr);
+
+ dataPointsPtr += 4;
+
+ avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal);
+ }
+
+ _mm_store_ps(avgPointsVector, avgPointsVal);
+
+ float sumMean = 0.0;
+ sumMean += avgPointsVector[0];
+ sumMean += avgPointsVector[1];
+ sumMean += avgPointsVector[2];
+ sumMean += avgPointsVector[3];
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ sumMean += realDataPoints[number];
+ }
+
+ // calculate the spectral mean
+ // +20 because for the comparison below we only want to throw out bins
+ // that are significantly higher (and would, thus, affect the mean more
+ const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
+
+ dataPointsPtr = realDataPoints; // Reset the dataPointsPtr
+ __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude);
+ __m128 vOnesVector = _mm_set_ps1(1.0);
+ __m128 vValidBinCount = _mm_setzero_ps();
+ avgPointsVal = _mm_setzero_ps();
+ __m128 compareMask;
+ number = 0;
+ // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude
+ for (; number < quarterPoints; number++) {
+
+ dataPointsVal = _mm_load_ps(dataPointsPtr);
+
+ dataPointsPtr += 4;
+
+ // Identify which items do not exceed the mean amplitude
+ compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector);
+
+ // Mask off the items that exceed the mean amplitude and add the avg Points that
+ // do not exceed the mean amplitude
+ avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal));
+
+ // Count the number of bins which do not exceed the mean amplitude
+ vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector));
}
- }
- float localNoiseFloorAmplitude = 0;
- if(validBinCount > 0.0){
- localNoiseFloorAmplitude = sumMean / validBinCount;
- }
- else{
- localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal...
- }
+ // Calculate the mean from the remaining data points
+ _mm_store_ps(avgPointsVector, avgPointsVal);
+
+ sumMean = 0.0;
+ sumMean += avgPointsVector[0];
+ sumMean += avgPointsVector[1];
+ sumMean += avgPointsVector[2];
+ sumMean += avgPointsVector[3];
+
+ // Calculate the number of valid bins from the remaining count
+ __VOLK_ATTR_ALIGNED(16) float validBinCountVector[4];
+ _mm_store_ps(validBinCountVector, vValidBinCount);
+
+ float validBinCount = 0;
+ validBinCount += validBinCountVector[0];
+ validBinCount += validBinCountVector[1];
+ validBinCount += validBinCountVector[2];
+ validBinCount += validBinCountVector[3];
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ if (realDataPoints[number] <= meanAmplitude) {
+ sumMean += realDataPoints[number];
+ validBinCount += 1.0;
+ }
+ }
+
+ float localNoiseFloorAmplitude = 0;
+ if (validBinCount > 0.0) {
+ localNoiseFloorAmplitude = sumMean / validBinCount;
+ } else {
+ localNoiseFloorAmplitude =
+ meanAmplitude; // For the odd case that all the amplitudes are equal...
+ }
- *noiseFloorAmplitude = localNoiseFloorAmplitude;
+ *noiseFloorAmplitude = localNoiseFloorAmplitude;
}
#endif /* LV_HAVE_SSE */
const float spectralExclusionValue,
const unsigned int num_points)
{
- float sumMean = 0.0;
- unsigned int number;
- // find the sum (for mean), etc
- for(number = 0; number < num_points; number++){
- // sum (for mean)
- sumMean += realDataPoints[number];
- }
-
- // calculate the spectral mean
- // +20 because for the comparison below we only want to throw out bins
- // that are significantly higher (and would, thus, affect the mean more)
- const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue;
-
- // now throw out any bins higher than the mean
- sumMean = 0.0;
- unsigned int newNumDataPoints = num_points;
- for(number = 0; number < num_points; number++){
- if (realDataPoints[number] <= meanAmplitude)
- sumMean += realDataPoints[number];
- else
- newNumDataPoints--;
- }
+ float sumMean = 0.0;
+ unsigned int number;
+ // find the sum (for mean), etc
+ for (number = 0; number < num_points; number++) {
+ // sum (for mean)
+ sumMean += realDataPoints[number];
+ }
+
+ // calculate the spectral mean
+ // +20 because for the comparison below we only want to throw out bins
+ // that are significantly higher (and would, thus, affect the mean more)
+ const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue;
+
+ // now throw out any bins higher than the mean
+ sumMean = 0.0;
+ unsigned int newNumDataPoints = num_points;
+ for (number = 0; number < num_points; number++) {
+ if (realDataPoints[number] <= meanAmplitude)
+ sumMean += realDataPoints[number];
+ else
+ newNumDataPoints--;
+ }
- float localNoiseFloorAmplitude = 0.0;
- if (newNumDataPoints == 0) // in the odd case that all
- localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal!
- else
- localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints);
+ float localNoiseFloorAmplitude = 0.0;
+ if (newNumDataPoints == 0) // in the odd case that all
+ localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal!
+ else
+ localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints);
- *noiseFloorAmplitude = localNoiseFloorAmplitude;
+ *noiseFloorAmplitude = localNoiseFloorAmplitude;
}
#endif /* LV_HAVE_GENERIC */
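The generic kernel above makes the two-pass estimate easiest to read, so a small worked example may help (numbers invented for illustration): for a spectrum of {-100, -100, -100, -40} dB with spectralExclusionValue = 20, the first pass yields a mean of -85 dB and a threshold of -65 dB; the -40 dB bin is excluded, and the second pass averages the remaining bins to a noise floor of -100 dB. The same arithmetic as a self-contained check:

#include <assert.h>

/* Hand-rolled check of the two-pass estimate sketched above (illustration only). */
static void noise_floor_worked_example(void)
{
    const float spectrum[4] = { -100.0f, -100.0f, -100.0f, -40.0f };
    float first_mean = 0.0f;
    for (unsigned int i = 0; i < 4; i++)
        first_mean += spectrum[i];
    first_mean /= 4.0f;                          /* -85 dB */
    const float threshold = first_mean + 20.0f;  /* -65 dB */

    float sum = 0.0f;
    unsigned int kept = 0;
    for (unsigned int i = 0; i < 4; i++) {
        if (spectrum[i] <= threshold) {
            sum += spectrum[i];
            kept++;
        }
    }
    assert(kept == 3);
    assert(sum / (float)kept == -100.0f); /* the -40 dB bin was excluded */
}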
#ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_u_H
#define INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_u_H
-#include <volk/volk_common.h>
#include <inttypes.h>
#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_AVX
#include <immintrin.h>
const float spectralExclusionValue,
const unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- const float* dataPointsPtr = realDataPoints;
- __VOLK_ATTR_ALIGNED(16) float avgPointsVector[8];
-
- __m256 dataPointsVal;
- __m256 avgPointsVal = _mm256_setzero_ps();
- // Calculate the sum (for mean) for all points
- for(; number < eighthPoints; number++){
-
- dataPointsVal = _mm256_loadu_ps(dataPointsPtr);
-
- dataPointsPtr += 8;
-
- avgPointsVal = _mm256_add_ps(avgPointsVal, dataPointsVal);
- }
-
- _mm256_storeu_ps(avgPointsVector, avgPointsVal);
-
- float sumMean = 0.0;
- sumMean += avgPointsVector[0];
- sumMean += avgPointsVector[1];
- sumMean += avgPointsVector[2];
- sumMean += avgPointsVector[3];
- sumMean += avgPointsVector[4];
- sumMean += avgPointsVector[5];
- sumMean += avgPointsVector[6];
- sumMean += avgPointsVector[7];
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- sumMean += realDataPoints[number];
- }
-
- // calculate the spectral mean
- // +20 because for the comparison below we only want to throw out bins
- // that are significantly higher (and would, thus, affect the mean more
- const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
-
- dataPointsPtr = realDataPoints; // Reset the dataPointsPtr
- __m256 vMeanAmplitudeVector = _mm256_set1_ps(meanAmplitude);
- __m256 vOnesVector = _mm256_set1_ps(1.0);
- __m256 vValidBinCount = _mm256_setzero_ps();
- avgPointsVal = _mm256_setzero_ps();
- __m256 compareMask;
- number = 0;
- // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude
- for(; number < eighthPoints; number++){
-
- dataPointsVal = _mm256_loadu_ps(dataPointsPtr);
-
- dataPointsPtr += 8;
-
- // Identify which items do not exceed the mean amplitude
- compareMask = _mm256_cmp_ps(dataPointsVal, vMeanAmplitudeVector, _CMP_LE_OQ);
-
- // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude
- avgPointsVal = _mm256_add_ps(avgPointsVal, _mm256_and_ps(compareMask, dataPointsVal));
-
- // Count the number of bins which do not exceed the mean amplitude
- vValidBinCount = _mm256_add_ps(vValidBinCount, _mm256_and_ps(compareMask, vOnesVector));
- }
-
- // Calculate the mean from the remaining data points
- _mm256_storeu_ps(avgPointsVector, avgPointsVal);
-
- sumMean = 0.0;
- sumMean += avgPointsVector[0];
- sumMean += avgPointsVector[1];
- sumMean += avgPointsVector[2];
- sumMean += avgPointsVector[3];
- sumMean += avgPointsVector[4];
- sumMean += avgPointsVector[5];
- sumMean += avgPointsVector[6];
- sumMean += avgPointsVector[7];
-
- // Calculate the number of valid bins from the remaining count
- __VOLK_ATTR_ALIGNED(16) float validBinCountVector[8];
- _mm256_storeu_ps(validBinCountVector, vValidBinCount);
-
- float validBinCount = 0;
- validBinCount += validBinCountVector[0];
- validBinCount += validBinCountVector[1];
- validBinCount += validBinCountVector[2];
- validBinCount += validBinCountVector[3];
- validBinCount += validBinCountVector[4];
- validBinCount += validBinCountVector[5];
- validBinCount += validBinCountVector[6];
- validBinCount += validBinCountVector[7];
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- if(realDataPoints[number] <= meanAmplitude){
- sumMean += realDataPoints[number];
- validBinCount += 1.0;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* dataPointsPtr = realDataPoints;
+ __VOLK_ATTR_ALIGNED(16) float avgPointsVector[8];
+
+ __m256 dataPointsVal;
+ __m256 avgPointsVal = _mm256_setzero_ps();
+ // Calculate the sum (for mean) for all points
+ for (; number < eighthPoints; number++) {
+
+ dataPointsVal = _mm256_loadu_ps(dataPointsPtr);
+
+ dataPointsPtr += 8;
+
+ avgPointsVal = _mm256_add_ps(avgPointsVal, dataPointsVal);
+ }
+
+ _mm256_storeu_ps(avgPointsVector, avgPointsVal);
+
+ float sumMean = 0.0;
+ sumMean += avgPointsVector[0];
+ sumMean += avgPointsVector[1];
+ sumMean += avgPointsVector[2];
+ sumMean += avgPointsVector[3];
+ sumMean += avgPointsVector[4];
+ sumMean += avgPointsVector[5];
+ sumMean += avgPointsVector[6];
+ sumMean += avgPointsVector[7];
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ sumMean += realDataPoints[number];
+ }
+
+ // calculate the spectral mean
+ // +20 because for the comparison below we only want to throw out bins
+ // that are significantly higher (and would, thus, affect the mean more
+ const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
+
+ dataPointsPtr = realDataPoints; // Reset the dataPointsPtr
+ __m256 vMeanAmplitudeVector = _mm256_set1_ps(meanAmplitude);
+ __m256 vOnesVector = _mm256_set1_ps(1.0);
+ __m256 vValidBinCount = _mm256_setzero_ps();
+ avgPointsVal = _mm256_setzero_ps();
+ __m256 compareMask;
+ number = 0;
+ // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude
+ for (; number < eighthPoints; number++) {
+
+ dataPointsVal = _mm256_loadu_ps(dataPointsPtr);
+
+ dataPointsPtr += 8;
+
+ // Identify which items do not exceed the mean amplitude
+ compareMask = _mm256_cmp_ps(dataPointsVal, vMeanAmplitudeVector, _CMP_LE_OQ);
+
+ // Mask off the items that exceed the mean amplitude and add the avg Points that
+ // do not exceed the mean amplitude
+ avgPointsVal =
+ _mm256_add_ps(avgPointsVal, _mm256_and_ps(compareMask, dataPointsVal));
+
+ // Count the number of bins which do not exceed the mean amplitude
+ vValidBinCount =
+ _mm256_add_ps(vValidBinCount, _mm256_and_ps(compareMask, vOnesVector));
+ }
+
+ // Calculate the mean from the remaining data points
+ _mm256_storeu_ps(avgPointsVector, avgPointsVal);
+
+ sumMean = 0.0;
+ sumMean += avgPointsVector[0];
+ sumMean += avgPointsVector[1];
+ sumMean += avgPointsVector[2];
+ sumMean += avgPointsVector[3];
+ sumMean += avgPointsVector[4];
+ sumMean += avgPointsVector[5];
+ sumMean += avgPointsVector[6];
+ sumMean += avgPointsVector[7];
+
+ // Calculate the number of valid bins from the remaining count
+ __VOLK_ATTR_ALIGNED(16) float validBinCountVector[8];
+ _mm256_storeu_ps(validBinCountVector, vValidBinCount);
+
+ float validBinCount = 0;
+ validBinCount += validBinCountVector[0];
+ validBinCount += validBinCountVector[1];
+ validBinCount += validBinCountVector[2];
+ validBinCount += validBinCountVector[3];
+ validBinCount += validBinCountVector[4];
+ validBinCount += validBinCountVector[5];
+ validBinCount += validBinCountVector[6];
+ validBinCount += validBinCountVector[7];
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ if (realDataPoints[number] <= meanAmplitude) {
+ sumMean += realDataPoints[number];
+ validBinCount += 1.0;
+ }
}
- }
- float localNoiseFloorAmplitude = 0;
- if(validBinCount > 0.0){
- localNoiseFloorAmplitude = sumMean / validBinCount;
- }
- else{
- localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal...
- }
+ float localNoiseFloorAmplitude = 0;
+ if (validBinCount > 0.0) {
+ localNoiseFloorAmplitude = sumMean / validBinCount;
+ } else {
+ localNoiseFloorAmplitude =
+ meanAmplitude; // For the odd case that all the amplitudes are equal...
+ }
- *noiseFloorAmplitude = localNoiseFloorAmplitude;
+ *noiseFloorAmplitude = localNoiseFloorAmplitude;
}
#endif /* LV_HAVE_AVX */
#endif /* INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_u_H */
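A hedged usage sketch of the dispatcher whose prototype is documented earlier in this header; the estimate_noise_floor_db wrapper name is an assumption, and the 20.0f exclusion simply mirrors the documented default:

#include <volk/volk.h>

/* Estimate the noise floor of a power spectrum already expressed in dB (sketch only). */
static float estimate_noise_floor_db(const float* spectrum_db, unsigned int num_bins)
{
    float noise_floor_db = 0.0f;
    volk_32f_s32f_calc_spectral_noise_floor_32f(
        &noise_floor_db, spectrum_db, 20.0f, num_bins);
    return noise_floor_db;
}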
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_s32f_convert_16i(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points)
- * \endcode
+ * void volk_32f_s32f_convert_16i(int16_t* outputVector, const float* inputVector,
+ *     const float scalar, unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li inputVector: the input vector of floats.
* \li outputVector: The output vector.
*
* \b Example
- * Convert floats from [-1,1] to 16-bit integers with a scale of 5 to maintain smallest delta
- * int N = 10;
- * unsigned int alignment = volk_get_alignment();
- * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
- * int16_t* out = (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment);
+ * Convert floats from [-1,1] to 16-bit integers with a scale of 5 to maintain the
+ * smallest delta:
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * int16_t* out = (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment);
*
* for(unsigned int ii = 0; ii < N; ++ii){
* increasing[ii] = 2.f * ((float)ii / (float)N) - 1.f;
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector, const float* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
-
- const unsigned int sixteenthPoints = num_points / 16;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int16_t* outputVectorPtr = outputVector;
-
- float min_val = SHRT_MIN;
- float max_val = SHRT_MAX;
- float r;
-
- __m256 vScalar = _mm256_set1_ps(scalar);
- __m256 inputVal1, inputVal2;
- __m256i intInputVal1, intInputVal2;
- __m256 ret1, ret2;
- __m256 vmin_val = _mm256_set1_ps(min_val);
- __m256 vmax_val = _mm256_set1_ps(max_val);
-
- for(;number < sixteenthPoints; number++){
- inputVal1 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
- inputVal2 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
-
- // Scale and clip
- ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
- ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
-
- intInputVal1 = _mm256_cvtps_epi32(ret1);
- intInputVal2 = _mm256_cvtps_epi32(ret2);
-
- intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
- intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
-
- _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
- outputVectorPtr += 16;
- }
-
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- r = inputVector[number] * scalar;
- if(r > max_val)
- r = max_val;
- else if(r < min_val)
- r = min_val;
- outputVector[number] = (int16_t)rintf(r);
- }
+ unsigned int number = 0;
+
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int16_t* outputVectorPtr = outputVector;
+
+ float min_val = SHRT_MIN;
+ float max_val = SHRT_MAX;
+ float r;
+
+ __m256 vScalar = _mm256_set1_ps(scalar);
+ __m256 inputVal1, inputVal2;
+ __m256i intInputVal1, intInputVal2;
+ __m256 ret1, ret2;
+ __m256 vmin_val = _mm256_set1_ps(min_val);
+ __m256 vmax_val = _mm256_set1_ps(max_val);
+
+ for (; number < sixteenthPoints; number++) {
+ inputVal1 = _mm256_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 8;
+ inputVal2 = _mm256_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 8;
+
+ // Scale and clip
+ ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val),
+ vmin_val);
+ ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val),
+ vmin_val);
+
+ intInputVal1 = _mm256_cvtps_epi32(ret1);
+ intInputVal2 = _mm256_cvtps_epi32(ret2);
+
+ intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
+ intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+
+ _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ outputVector[number] = (int16_t)rintf(r);
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector, const float* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
+ unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ const unsigned int eighthPoints = num_points / 8;
- const float* inputVectorPtr = (const float*)inputVector;
- int16_t* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = (const float*)inputVector;
+ int16_t* outputVectorPtr = outputVector;
- float min_val = SHRT_MIN;
- float max_val = SHRT_MAX;
- float r;
+ float min_val = SHRT_MIN;
+ float max_val = SHRT_MAX;
+ float r;
- __m256 vScalar = _mm256_set1_ps(scalar);
- __m256 inputVal, ret;
- __m256i intInputVal;
- __m128i intInputVal1, intInputVal2;
- __m256 vmin_val = _mm256_set1_ps(min_val);
- __m256 vmax_val = _mm256_set1_ps(max_val);
+ __m256 vScalar = _mm256_set1_ps(scalar);
+ __m256 inputVal, ret;
+ __m256i intInputVal;
+ __m128i intInputVal1, intInputVal2;
+ __m256 vmin_val = _mm256_set1_ps(min_val);
+ __m256 vmax_val = _mm256_set1_ps(max_val);
- for(;number < eighthPoints; number++){
- inputVal = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
+ for (; number < eighthPoints; number++) {
+ inputVal = _mm256_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 8;
- // Scale and clip
- ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), vmin_val);
+ // Scale and clip
+ ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val),
+ vmin_val);
- intInputVal = _mm256_cvtps_epi32(ret);
+ intInputVal = _mm256_cvtps_epi32(ret);
- intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
- intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
+ intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
+ intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
- _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
- outputVectorPtr += 8;
- }
+ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 8;
+ }
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- r = inputVector[number] * scalar;
- if(r > max_val)
- r = max_val;
- else if(r < min_val)
- r = min_val;
- outputVector[number] = (int16_t)rintf(r);
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ outputVector[number] = (int16_t)rintf(r);
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void
-volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
-
- const unsigned int eighthPoints = num_points / 8;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int16_t* outputVectorPtr = outputVector;
-
- float min_val = SHRT_MIN;
- float max_val = SHRT_MAX;
- float r;
-
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 inputVal1, inputVal2;
- __m128i intInputVal1, intInputVal2;
- __m128 ret1, ret2;
- __m128 vmin_val = _mm_set_ps1(min_val);
- __m128 vmax_val = _mm_set_ps1(max_val);
-
- for(;number < eighthPoints; number++){
- inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
- inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-
- // Scale and clip
- ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
- ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
-
- intInputVal1 = _mm_cvtps_epi32(ret1);
- intInputVal2 = _mm_cvtps_epi32(ret2);
-
- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
-
- _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
- outputVectorPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- r = inputVector[number] * scalar;
- if(r > max_val)
- r = max_val;
- else if(r < min_val)
- r = min_val;
- outputVector[number] = (int16_t)rintf(r);
- }
+ unsigned int number = 0;
+
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int16_t* outputVectorPtr = outputVector;
+
+ float min_val = SHRT_MIN;
+ float max_val = SHRT_MAX;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 inputVal1, inputVal2;
+ __m128i intInputVal1, intInputVal2;
+ __m128 ret1, ret2;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ for (; number < eighthPoints; number++) {
+ inputVal1 = _mm_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+ inputVal2 = _mm_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ // Scale and clip
+ ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+ ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+
+ intInputVal1 = _mm_cvtps_epi32(ret1);
+ intInputVal2 = _mm_cvtps_epi32(ret2);
+
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ outputVector[number] = (int16_t)rintf(r);
+ }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const float* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
-
- const unsigned int quarterPoints = num_points / 4;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int16_t* outputVectorPtr = outputVector;
-
- float min_val = SHRT_MIN;
- float max_val = SHRT_MAX;
- float r;
-
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 ret;
- __m128 vmin_val = _mm_set_ps1(min_val);
- __m128 vmax_val = _mm_set_ps1(max_val);
-
- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
-
- for(;number < quarterPoints; number++){
- ret = _mm_loadu_ps(inputVectorPtr);
- inputVectorPtr += 4;
-
- // Scale and clip
- ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
-
- _mm_store_ps(outputFloatBuffer, ret);
- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- r = inputVector[number] * scalar;
- if(r > max_val)
- r = max_val;
- else if(r < min_val)
- r = min_val;
- outputVector[number] = (int16_t)rintf(r);
- }
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int16_t* outputVectorPtr = outputVector;
+
+ float min_val = SHRT_MIN;
+ float max_val = SHRT_MAX;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 ret;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+ ret = _mm_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ // Scale and clip
+ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+
+ _mm_store_ps(outputFloatBuffer, ret);
+ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
+ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
+ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
+ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ outputVector[number] = (int16_t)rintf(r);
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_16i_generic(int16_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- int16_t* outputVectorPtr = outputVector;
- const float* inputVectorPtr = inputVector;
- unsigned int number = 0;
- float min_val = SHRT_MIN;
- float max_val = SHRT_MAX;
- float r;
-
- for(number = 0; number < num_points; number++){
- r = *inputVectorPtr++ * scalar;
- if(r > max_val)
- r = max_val;
- else if(r < min_val)
- r = min_val;
- *outputVectorPtr++ = (int16_t)rintf(r);
- }
+ int16_t* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ float min_val = SHRT_MIN;
+ float max_val = SHRT_MAX;
+ float r;
+
+ for (number = 0; number < num_points; number++) {
+ r = *inputVectorPtr++ * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ *outputVectorPtr++ = (int16_t)rintf(r);
+ }
}
#endif /* LV_HAVE_GENERIC */
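Every convert_16i variant above applies the same per-sample rule that the generic kernel spells out: scale by the given factor, clip to the int16 range, and round to the nearest integer with rintf; the SIMD branches perform the identical clip with _mm_min_ps/_mm_max_ps (or the _mm256 equivalents) before the saturating pack. Factored out as a scalar helper for reference, a sketch rather than code from the patch:

#include <limits.h>
#include <math.h>
#include <stdint.h>

/* Scale, saturate to [SHRT_MIN, SHRT_MAX], and round to nearest (sketch only). */
static inline int16_t scale_clip_round_16i(float sample, float scalar)
{
    float r = sample * scalar;
    if (r > (float)SHRT_MAX)
        r = (float)SHRT_MAX;
    else if (r < (float)SHRT_MIN)
        r = (float)SHRT_MIN;
    return (int16_t)rintf(r);
}

With scalar = 32767.0f, for instance, 1.0f maps to 32767 and -1.0f to -32767; anything that scales past the int16 range is saturated rather than wrapped.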
#ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H
#define INCLUDED_volk_32f_s32f_convert_16i_a_H
-#include <volk/volk_common.h>
#include <inttypes.h>
-#include <stdio.h>
#include <math.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector, const float* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
-
- const unsigned int sixteenthPoints = num_points / 16;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int16_t* outputVectorPtr = outputVector;
-
- float min_val = SHRT_MIN;
- float max_val = SHRT_MAX;
- float r;
-
- __m256 vScalar = _mm256_set1_ps(scalar);
- __m256 inputVal1, inputVal2;
- __m256i intInputVal1, intInputVal2;
- __m256 ret1, ret2;
- __m256 vmin_val = _mm256_set1_ps(min_val);
- __m256 vmax_val = _mm256_set1_ps(max_val);
-
- for(;number < sixteenthPoints; number++){
- inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
- inputVal2 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
-
- // Scale and clip
- ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
- ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
-
- intInputVal1 = _mm256_cvtps_epi32(ret1);
- intInputVal2 = _mm256_cvtps_epi32(ret2);
-
- intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
- intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
-
- _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
- outputVectorPtr += 16;
- }
-
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- r = inputVector[number] * scalar;
- if(r > max_val)
- r = max_val;
- else if(r < min_val)
- r = min_val;
- outputVector[number] = (int16_t)rintf(r);
- }
+ unsigned int number = 0;
+
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int16_t* outputVectorPtr = outputVector;
+
+ float min_val = SHRT_MIN;
+ float max_val = SHRT_MAX;
+ float r;
+
+ __m256 vScalar = _mm256_set1_ps(scalar);
+ __m256 inputVal1, inputVal2;
+ __m256i intInputVal1, intInputVal2;
+ __m256 ret1, ret2;
+ __m256 vmin_val = _mm256_set1_ps(min_val);
+ __m256 vmax_val = _mm256_set1_ps(max_val);
+
+ for (; number < sixteenthPoints; number++) {
+ inputVal1 = _mm256_load_ps(inputVectorPtr);
+ inputVectorPtr += 8;
+ inputVal2 = _mm256_load_ps(inputVectorPtr);
+ inputVectorPtr += 8;
+
+ // Scale and clip
+ ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val),
+ vmin_val);
+ ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val),
+ vmin_val);
+
+ intInputVal1 = _mm256_cvtps_epi32(ret1);
+ intInputVal2 = _mm256_cvtps_epi32(ret2);
+
+ intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
+ intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+
+ _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ outputVector[number] = (int16_t)rintf(r);
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32f_s32f_convert_16i_a_avx(int16_t* outputVector, const float* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_16i_a_avx(int16_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
+ unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ const unsigned int eighthPoints = num_points / 8;
- const float* inputVectorPtr = (const float*)inputVector;
- int16_t* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = (const float*)inputVector;
+ int16_t* outputVectorPtr = outputVector;
- float min_val = SHRT_MIN;
- float max_val = SHRT_MAX;
- float r;
+ float min_val = SHRT_MIN;
+ float max_val = SHRT_MAX;
+ float r;
- __m256 vScalar = _mm256_set1_ps(scalar);
- __m256 inputVal, ret;
- __m256i intInputVal;
- __m128i intInputVal1, intInputVal2;
- __m256 vmin_val = _mm256_set1_ps(min_val);
- __m256 vmax_val = _mm256_set1_ps(max_val);
+ __m256 vScalar = _mm256_set1_ps(scalar);
+ __m256 inputVal, ret;
+ __m256i intInputVal;
+ __m128i intInputVal1, intInputVal2;
+ __m256 vmin_val = _mm256_set1_ps(min_val);
+ __m256 vmax_val = _mm256_set1_ps(max_val);
- for(;number < eighthPoints; number++){
- inputVal = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
+ for (; number < eighthPoints; number++) {
+ inputVal = _mm256_load_ps(inputVectorPtr);
+ inputVectorPtr += 8;
- // Scale and clip
- ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), vmin_val);
+ // Scale and clip
+ ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val),
+ vmin_val);
- intInputVal = _mm256_cvtps_epi32(ret);
+ intInputVal = _mm256_cvtps_epi32(ret);
- intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
- intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
+ intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
+ intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
- _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
- outputVectorPtr += 8;
- }
+ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 8;
+ }
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- r = inputVector[number] * scalar;
- if(r > max_val)
- r = max_val;
- else if(r < min_val)
- r = min_val;
- outputVector[number] = (int16_t)rintf(r);
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ outputVector[number] = (int16_t)rintf(r);
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void
-volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
-
- const unsigned int eighthPoints = num_points / 8;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int16_t* outputVectorPtr = outputVector;
-
- float min_val = SHRT_MIN;
- float max_val = SHRT_MAX;
- float r;
-
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 inputVal1, inputVal2;
- __m128i intInputVal1, intInputVal2;
- __m128 ret1, ret2;
- __m128 vmin_val = _mm_set_ps1(min_val);
- __m128 vmax_val = _mm_set_ps1(max_val);
-
- for(;number < eighthPoints; number++){
- inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
- inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-
- // Scale and clip
- ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
- ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
-
- intInputVal1 = _mm_cvtps_epi32(ret1);
- intInputVal2 = _mm_cvtps_epi32(ret2);
-
- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
-
- _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
- outputVectorPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- r = inputVector[number] * scalar;
- if(r > max_val)
- r = max_val;
- else if(r < min_val)
- r = min_val;
- outputVector[number] = (int16_t)rintf(r);
- }
+ unsigned int number = 0;
+
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int16_t* outputVectorPtr = outputVector;
+
+ float min_val = SHRT_MIN;
+ float max_val = SHRT_MAX;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 inputVal1, inputVal2;
+ __m128i intInputVal1, intInputVal2;
+ __m128 ret1, ret2;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ for (; number < eighthPoints; number++) {
+ inputVal1 = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+ inputVal2 = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ // Scale and clip
+ ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+ ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+
+ intInputVal1 = _mm_cvtps_epi32(ret1);
+ intInputVal2 = _mm_cvtps_epi32(ret2);
+
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ outputVector[number] = (int16_t)rintf(r);
+ }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
-
- const unsigned int quarterPoints = num_points / 4;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int16_t* outputVectorPtr = outputVector;
-
- float min_val = SHRT_MIN;
- float max_val = SHRT_MAX;
- float r;
-
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 ret;
- __m128 vmin_val = _mm_set_ps1(min_val);
- __m128 vmax_val = _mm_set_ps1(max_val);
-
- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
-
- for(;number < quarterPoints; number++){
- ret = _mm_load_ps(inputVectorPtr);
- inputVectorPtr += 4;
-
- // Scale and clip
- ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
-
- _mm_store_ps(outputFloatBuffer, ret);
- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- r = inputVector[number] * scalar;
- if(r > max_val)
- r = max_val;
- else if(r < min_val)
- r = min_val;
- outputVector[number] = (int16_t)rintf(r);
- }
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int16_t* outputVectorPtr = outputVector;
+
+ float min_val = SHRT_MIN;
+ float max_val = SHRT_MAX;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 ret;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+ ret = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ // Scale and clip
+ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+
+ _mm_store_ps(outputFloatBuffer, ret);
+ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
+ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
+ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
+ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ outputVector[number] = (int16_t)rintf(r);
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- int16_t* outputVectorPtr = outputVector;
- const float* inputVectorPtr = inputVector;
- unsigned int number = 0;
- float min_val = SHRT_MIN;
- float max_val = SHRT_MAX;
- float r;
-
- for(number = 0; number < num_points; number++){
- r = *inputVectorPtr++ * scalar;
- if(r < min_val)
- r = min_val;
- else if(r > max_val)
- r = max_val;
- *outputVectorPtr++ = (int16_t)rintf(r);
- }
+ int16_t* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ float min_val = SHRT_MIN;
+ float max_val = SHRT_MAX;
+ float r;
+
+ for (number = 0; number < num_points; number++) {
+ r = *inputVectorPtr++ * scalar;
+ if (r < min_val)
+ r = min_val;
+ else if (r > max_val)
+ r = max_val;
+ *outputVectorPtr++ = (int16_t)rintf(r);
+ }
}
#endif /* LV_HAVE_GENERIC */
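/* A minimal usage sketch for the kernels above: callers normally go through the
 * volk_32f_s32f_convert_16i dispatcher rather than picking an _a_/_u_ variant.
 * The buffer length and scale below are illustrative only; volk_malloc() returns
 * memory aligned for the _a_ kernels. */
#include <inttypes.h>
#include <volk/volk.h>

static void convert_16i_example(void)
{
    const unsigned int num_points = 64; /* hypothetical length */
    const size_t alignment = volk_get_alignment();
    float* in = (float*)volk_malloc(sizeof(float) * num_points, alignment);
    int16_t* out = (int16_t*)volk_malloc(sizeof(int16_t) * num_points, alignment);

    for (unsigned int i = 0; i < num_points; i++)
        in[i] = -1.0f + 2.0f * i / num_points; /* samples in [-1, 1) */

    /* Scale by SHRT_MAX so the full int16 range is used; products that
     * overshoot are clipped by the kernel before rounding. */
    volk_32f_s32f_convert_16i(out, in, 32767.0f, num_points);

    volk_free(in);
    volk_free(out);
}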
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_s32f_convert_32i(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points)
- * \endcode
+ * void volk_32f_s32f_convert_32i(int32_t* outputVector, const float* inputVector, const
+ * float scalar, unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li inputVector: the input vector of floats.
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32f_s32f_convert_32i_u_avx(int32_t* outputVector, const float* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_32i_u_avx(int32_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
-
- const unsigned int eighthPoints = num_points / 8;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int32_t* outputVectorPtr = outputVector;
-
- float min_val = INT_MIN;
- float max_val = INT_MAX;
- float r;
-
- __m256 vScalar = _mm256_set1_ps(scalar);
- __m256 inputVal1;
- __m256i intInputVal1;
- __m256 vmin_val = _mm256_set1_ps(min_val);
- __m256 vmax_val = _mm256_set1_ps(max_val);
-
- for(;number < eighthPoints; number++){
- inputVal1 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
-
- inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
- intInputVal1 = _mm256_cvtps_epi32(inputVal1);
-
- _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
- outputVectorPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- r = inputVector[number] * scalar;
- if(r > max_val)
- r = max_val;
- else if(r < min_val)
- r = min_val;
- outputVector[number] = (int32_t)rintf(r);
- }
+ unsigned int number = 0;
+
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int32_t* outputVectorPtr = outputVector;
+
+ float min_val = INT_MIN;
+ float max_val = INT_MAX;
+ float r;
+
+ __m256 vScalar = _mm256_set1_ps(scalar);
+ __m256 inputVal1;
+ __m256i intInputVal1;
+ __m256 vmin_val = _mm256_set1_ps(min_val);
+ __m256 vmax_val = _mm256_set1_ps(max_val);
+
+ for (; number < eighthPoints; number++) {
+ inputVal1 = _mm256_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 8;
+
+ inputVal1 = _mm256_max_ps(
+ _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+ intInputVal1 = _mm256_cvtps_epi32(inputVal1);
+
+ _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ outputVector[number] = (int32_t)rintf(r);
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void
-volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const float* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
-
- const unsigned int quarterPoints = num_points / 4;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int32_t* outputVectorPtr = outputVector;
-
- float min_val = INT_MIN;
- float max_val = INT_MAX;
- float r;
-
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 inputVal1;
- __m128i intInputVal1;
- __m128 vmin_val = _mm_set_ps1(min_val);
- __m128 vmax_val = _mm_set_ps1(max_val);
-
- for(;number < quarterPoints; number++){
- inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-
- inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
- intInputVal1 = _mm_cvtps_epi32(inputVal1);
-
- _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
- outputVectorPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- r = inputVector[number] * scalar;
- if(r > max_val)
- r = max_val;
- else if(r < min_val)
- r = min_val;
- outputVector[number] = (int32_t)rintf(r);
- }
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int32_t* outputVectorPtr = outputVector;
+
+ float min_val = INT_MIN;
+ float max_val = INT_MAX;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 inputVal1;
+ __m128i intInputVal1;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ for (; number < quarterPoints; number++) {
+ inputVal1 = _mm_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ inputVal1 =
+ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+ intInputVal1 = _mm_cvtps_epi32(inputVal1);
+
+ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ outputVector[number] = (int32_t)rintf(r);
+ }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const float* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
-
- const unsigned int quarterPoints = num_points / 4;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int32_t* outputVectorPtr = outputVector;
-
- float min_val = INT_MIN;
- float max_val = INT_MAX;
- float r;
-
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 ret;
- __m128 vmin_val = _mm_set_ps1(min_val);
- __m128 vmax_val = _mm_set_ps1(max_val);
-
- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
-
- for(;number < quarterPoints; number++){
- ret = _mm_loadu_ps(inputVectorPtr);
- inputVectorPtr += 4;
-
- ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
-
- _mm_store_ps(outputFloatBuffer, ret);
- *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]);
- *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]);
- *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]);
- *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]);
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- r = inputVector[number] * scalar;
- if(r > max_val)
- r = max_val;
- else if(r < min_val)
- r = min_val;
- outputVector[number] = (int32_t)rintf(r);
- }
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int32_t* outputVectorPtr = outputVector;
+
+ float min_val = INT_MIN;
+ float max_val = INT_MAX;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 ret;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+ ret = _mm_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+
+ _mm_store_ps(outputFloatBuffer, ret);
+ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]);
+ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]);
+ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]);
+ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ outputVector[number] = (int32_t)rintf(r);
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_s32f_convert_32i_generic(int32_t* outputVector, const float* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_32i_generic(int32_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- int32_t* outputVectorPtr = outputVector;
- const float* inputVectorPtr = inputVector;
- unsigned int number = 0;
- float min_val = INT_MIN;
- float max_val = INT_MAX;
- float r;
-
- for(number = 0; number < num_points; number++){
- r = *inputVectorPtr++ * scalar;
- if(r > max_val)
- r = max_val;
- else if(r < min_val)
- r = min_val;
- *outputVectorPtr++ = (int32_t)rintf(r);
- }
+ int32_t* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ float min_val = INT_MIN;
+ float max_val = INT_MAX;
+ float r;
+
+ for (number = 0; number < num_points; number++) {
+ r = *inputVectorPtr++ * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ *outputVectorPtr++ = (int32_t)rintf(r);
+ }
}
#endif /* LV_HAVE_GENERIC */
-
#endif /* INCLUDED_volk_32f_s32f_convert_32i_u_H */
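/* A minimal sketch of calling the 32-bit conversion through its dispatcher,
 * following the prototype documented above; the scale factor is illustrative. */
#include <volk/volk.h>

static void convert_32i_example(const float* in, int32_t* out, unsigned int num_points)
{
    /* A scale of 2^30 (exactly representable as a float) maps inputs in
     * [-1, 1] onto roughly a quarter of the int32 range; larger products
     * are clipped to [INT_MIN, INT_MAX] by the kernel. */
    volk_32f_s32f_convert_32i(out, in, 1073741824.0f, num_points);
}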
#ifndef INCLUDED_volk_32f_s32f_convert_32i_a_H
#define INCLUDED_volk_32f_s32f_convert_32i_a_H
-#include <volk/volk_common.h>
#include <inttypes.h>
#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const float* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
-
- const unsigned int eighthPoints = num_points / 8;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int32_t* outputVectorPtr = outputVector;
-
- float min_val = INT_MIN;
- float max_val = INT_MAX;
- float r;
-
- __m256 vScalar = _mm256_set1_ps(scalar);
- __m256 inputVal1;
- __m256i intInputVal1;
- __m256 vmin_val = _mm256_set1_ps(min_val);
- __m256 vmax_val = _mm256_set1_ps(max_val);
-
- for(;number < eighthPoints; number++){
- inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
-
- inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
- intInputVal1 = _mm256_cvtps_epi32(inputVal1);
-
- _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
- outputVectorPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- r = inputVector[number] * scalar;
- if(r > max_val)
- r = max_val;
- else if(r < min_val)
- r = min_val;
- outputVector[number] = (int32_t)rintf(r);
- }
+ unsigned int number = 0;
+
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int32_t* outputVectorPtr = outputVector;
+
+ float min_val = INT_MIN;
+ float max_val = INT_MAX;
+ float r;
+
+ __m256 vScalar = _mm256_set1_ps(scalar);
+ __m256 inputVal1;
+ __m256i intInputVal1;
+ __m256 vmin_val = _mm256_set1_ps(min_val);
+ __m256 vmax_val = _mm256_set1_ps(max_val);
+
+ for (; number < eighthPoints; number++) {
+ inputVal1 = _mm256_load_ps(inputVectorPtr);
+ inputVectorPtr += 8;
+
+ inputVal1 = _mm256_max_ps(
+ _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+ intInputVal1 = _mm256_cvtps_epi32(inputVal1);
+
+ _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ outputVector[number] = (int32_t)rintf(r);
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void
-volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const float* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
-
- const unsigned int quarterPoints = num_points / 4;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int32_t* outputVectorPtr = outputVector;
-
- float min_val = INT_MIN;
- float max_val = INT_MAX;
- float r;
-
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 inputVal1;
- __m128i intInputVal1;
- __m128 vmin_val = _mm_set_ps1(min_val);
- __m128 vmax_val = _mm_set_ps1(max_val);
-
- for(;number < quarterPoints; number++){
- inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-
- inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
- intInputVal1 = _mm_cvtps_epi32(inputVal1);
-
- _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
- outputVectorPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- r = inputVector[number] * scalar;
- if(r > max_val)
- r = max_val;
- else if(r < min_val)
- r = min_val;
- outputVector[number] = (int32_t)rintf(r);
- }
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int32_t* outputVectorPtr = outputVector;
+
+ float min_val = INT_MIN;
+ float max_val = INT_MAX;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 inputVal1;
+ __m128i intInputVal1;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ for (; number < quarterPoints; number++) {
+ inputVal1 = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ inputVal1 =
+ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+ intInputVal1 = _mm_cvtps_epi32(inputVal1);
+
+ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ outputVector[number] = (int32_t)rintf(r);
+ }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const float* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
-
- const unsigned int quarterPoints = num_points / 4;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int32_t* outputVectorPtr = outputVector;
-
- float min_val = INT_MIN;
- float max_val = INT_MAX;
- float r;
-
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 ret;
- __m128 vmin_val = _mm_set_ps1(min_val);
- __m128 vmax_val = _mm_set_ps1(max_val);
-
- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
-
- for(;number < quarterPoints; number++){
- ret = _mm_load_ps(inputVectorPtr);
- inputVectorPtr += 4;
-
- ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
-
- _mm_store_ps(outputFloatBuffer, ret);
- *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]);
- *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]);
- *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]);
- *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]);
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- r = inputVector[number] * scalar;
- if(r > max_val)
- r = max_val;
- else if(r < min_val)
- r = min_val;
- outputVector[number] = (int32_t)rintf(r);
- }
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int32_t* outputVectorPtr = outputVector;
+
+ float min_val = INT_MIN;
+ float max_val = INT_MAX;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 ret;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+ ret = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+
+ _mm_store_ps(outputFloatBuffer, ret);
+ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]);
+ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]);
+ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]);
+ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ outputVector[number] = (int32_t)rintf(r);
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_s32f_convert_32i_a_generic(int32_t* outputVector, const float* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_32i_a_generic(int32_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- int32_t* outputVectorPtr = outputVector;
- const float* inputVectorPtr = inputVector;
- unsigned int number = 0;
- float min_val = INT_MIN;
- float max_val = INT_MAX;
- float r;
-
- for(number = 0; number < num_points; number++){
- r = *inputVectorPtr++ * scalar;
- if(r > max_val)
- r = max_val;
- else if(r < min_val)
- r = min_val;
- *outputVectorPtr++ = (int32_t)rintf(r);
- }
+ int32_t* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ float min_val = INT_MIN;
+ float max_val = INT_MAX;
+ float r;
+
+ for (number = 0; number < num_points; number++) {
+ r = *inputVectorPtr++ * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ *outputVectorPtr++ = (int32_t)rintf(r);
+ }
}
#endif /* LV_HAVE_GENERIC */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_s32f_convert_8i(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points)
+ * void volk_32f_s32f_convert_8i(int8_t* outputVector, const float* inputVector, const
+ * float scalar, unsigned int num_points)
* \endcode
*
* \b Inputs
* \li outputVector: The output vector.
*
* \b Example
- * Convert floats from [-1,1] to 16-bit integers with a scale of 5 to maintain smallest delta
+ * Convert floats from [-1,1] to 16-bit integers with a scale of 5 to maintain smallest
+ * delta
* int N = 10;
* unsigned int alignment = volk_get_alignment();
* float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
#include <inttypes.h>
#include <stdio.h>
-static inline void
-volk_32f_s32f_convert_8i_single(int8_t* out, const float in){
- float min_val = CHAR_MIN;
- float max_val = CHAR_MAX;
- if(in > max_val){
- *out = (int8_t)(max_val);
- }else if(in < min_val){
- *out = (int8_t)(min_val);
- }else{
- *out = (int8_t)(rintf(in));
- }
+static inline void volk_32f_s32f_convert_8i_single(int8_t* out, const float in)
+{
+ float min_val = CHAR_MIN;
+ float max_val = CHAR_MAX;
+ if (in > max_val) {
+ *out = (int8_t)(max_val);
+ } else if (in < min_val) {
+ *out = (int8_t)(min_val);
+ } else {
+ *out = (int8_t)(rintf(in));
+ }
}
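/* volk_32f_s32f_convert_8i_single() is the shared scalar tail helper for the 8-bit
 * kernels below: it saturates the already-scaled value to [CHAR_MIN, CHAR_MAX]
 * (typically [-128, 127]) and rounds with rintf(), so, for example, 300.0f is
 * stored as 127 and -300.0f as -128 on targets with signed char. */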
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector, const float* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
-
- const unsigned int thirtysecondPoints = num_points / 32;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int8_t* outputVectorPtr = outputVector;
-
- float min_val = CHAR_MIN;
- float max_val = CHAR_MAX;
- float r;
-
- __m256 vScalar = _mm256_set1_ps(scalar);
- __m256 inputVal1, inputVal2, inputVal3, inputVal4;
- __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
- __m256 vmin_val = _mm256_set1_ps(min_val);
- __m256 vmax_val = _mm256_set1_ps(max_val);
- __m256i intInputVal;
-
- for(;number < thirtysecondPoints; number++){
- inputVal1 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
- inputVal2 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
- inputVal3 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
- inputVal4 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
-
- inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
- inputVal2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
- inputVal3 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
- inputVal4 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
-
- intInputVal1 = _mm256_cvtps_epi32(inputVal1);
- intInputVal2 = _mm256_cvtps_epi32(inputVal2);
- intInputVal3 = _mm256_cvtps_epi32(inputVal3);
- intInputVal4 = _mm256_cvtps_epi32(inputVal4);
-
- intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
- intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
- intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
- intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
-
- intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
- intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
-
- _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal);
- outputVectorPtr += 32;
- }
-
- number = thirtysecondPoints * 32;
- for(; number < num_points; number++){
- r = inputVector[number] * scalar;
- volk_32f_s32f_convert_8i_single(&outputVector[number], r);
- }
+ unsigned int number = 0;
+
+ const unsigned int thirtysecondPoints = num_points / 32;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int8_t* outputVectorPtr = outputVector;
+
+ float min_val = CHAR_MIN;
+ float max_val = CHAR_MAX;
+ float r;
+
+ __m256 vScalar = _mm256_set1_ps(scalar);
+ __m256 inputVal1, inputVal2, inputVal3, inputVal4;
+ __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+ __m256 vmin_val = _mm256_set1_ps(min_val);
+ __m256 vmax_val = _mm256_set1_ps(max_val);
+ __m256i intInputVal;
+
+ for (; number < thirtysecondPoints; number++) {
+ inputVal1 = _mm256_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 8;
+ inputVal2 = _mm256_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 8;
+ inputVal3 = _mm256_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 8;
+ inputVal4 = _mm256_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 8;
+
+ inputVal1 = _mm256_max_ps(
+ _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+ inputVal2 = _mm256_max_ps(
+ _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+ inputVal3 = _mm256_max_ps(
+ _mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
+ inputVal4 = _mm256_max_ps(
+ _mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
+
+ intInputVal1 = _mm256_cvtps_epi32(inputVal1);
+ intInputVal2 = _mm256_cvtps_epi32(inputVal2);
+ intInputVal3 = _mm256_cvtps_epi32(inputVal3);
+ intInputVal4 = _mm256_cvtps_epi32(inputVal4);
+
+ intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
+ intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+ intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
+ intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
+
+ intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
+ intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+
+ _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal);
+ outputVectorPtr += 32;
+ }
+
+ number = thirtysecondPoints * 32;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void
-volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const float* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
-
- const unsigned int sixteenthPoints = num_points / 16;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int8_t* outputVectorPtr = outputVector;
-
- float min_val = CHAR_MIN;
- float max_val = CHAR_MAX;
- float r;
-
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 inputVal1, inputVal2, inputVal3, inputVal4;
- __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
- __m128 vmin_val = _mm_set_ps1(min_val);
- __m128 vmax_val = _mm_set_ps1(max_val);
-
- for(;number < sixteenthPoints; number++){
- inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
- inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
- inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
- inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-
- inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
- inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
- inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
- inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
-
- intInputVal1 = _mm_cvtps_epi32(inputVal1);
- intInputVal2 = _mm_cvtps_epi32(inputVal2);
- intInputVal3 = _mm_cvtps_epi32(inputVal3);
- intInputVal4 = _mm_cvtps_epi32(inputVal4);
-
- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
- intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
-
- intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
-
- _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
- outputVectorPtr += 16;
- }
+ unsigned int number = 0;
+
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int8_t* outputVectorPtr = outputVector;
+
+ float min_val = CHAR_MIN;
+ float max_val = CHAR_MAX;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+ __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ for (; number < sixteenthPoints; number++) {
+ inputVal1 = _mm_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+ inputVal2 = _mm_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+ inputVal3 = _mm_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+ inputVal4 = _mm_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ inputVal1 =
+ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+ inputVal2 =
+ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+ inputVal3 =
+ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
+ inputVal4 =
+ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
+
+ intInputVal1 = _mm_cvtps_epi32(inputVal1);
+ intInputVal2 = _mm_cvtps_epi32(inputVal2);
+ intInputVal3 = _mm_cvtps_epi32(inputVal3);
+ intInputVal4 = _mm_cvtps_epi32(inputVal4);
+
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+ intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
+
+ intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
+
+ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 16;
+ }
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- r = inputVector[number] * scalar;
- volk_32f_s32f_convert_8i_single(&outputVector[number], r);
- }
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+ }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const float* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- size_t inner_loop;
+ unsigned int number = 0;
+ size_t inner_loop;
- const unsigned int quarterPoints = num_points / 4;
+ const unsigned int quarterPoints = num_points / 4;
- const float* inputVectorPtr = (const float*)inputVector;
- int8_t* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = (const float*)inputVector;
+ int8_t* outputVectorPtr = outputVector;
- float min_val = CHAR_MIN;
- float max_val = CHAR_MAX;
- float r;
+ float min_val = CHAR_MIN;
+ float max_val = CHAR_MAX;
+ float r;
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 ret;
- __m128 vmin_val = _mm_set_ps1(min_val);
- __m128 vmax_val = _mm_set_ps1(max_val);
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 ret;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
- for(;number < quarterPoints; number++){
- ret = _mm_loadu_ps(inputVectorPtr);
- inputVectorPtr += 4;
+ for (; number < quarterPoints; number++) {
+ ret = _mm_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 4;
- ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
- _mm_store_ps(outputFloatBuffer, ret);
- for (inner_loop = 0; inner_loop < 4; inner_loop++){
- *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
+ _mm_store_ps(outputFloatBuffer, ret);
+ for (inner_loop = 0; inner_loop < 4; inner_loop++) {
+ *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
+ }
}
- }
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- r = inputVector[number] * scalar;
- volk_32f_s32f_convert_8i_single(&outputVector[number], r);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_s32f_convert_8i_generic(int8_t* outputVector, const float* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_8i_generic(int8_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- const float* inputVectorPtr = inputVector;
- unsigned int number = 0;
- float r;
-
- for(number = 0; number < num_points; number++){
- r = *inputVectorPtr++ * scalar;
- volk_32f_s32f_convert_8i_single(&outputVector[number], r);
- }
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ float r;
+
+ for (number = 0; number < num_points; number++) {
+ r = *inputVectorPtr++ * scalar;
+ volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H
#define INCLUDED_volk_32f_s32f_convert_8i_a_H
-#include <volk/volk_common.h>
#include <inttypes.h>
#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector, const float* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
-
- const unsigned int thirtysecondPoints = num_points / 32;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int8_t* outputVectorPtr = outputVector;
-
- float min_val = CHAR_MIN;
- float max_val = CHAR_MAX;
- float r;
-
- __m256 vScalar = _mm256_set1_ps(scalar);
- __m256 inputVal1, inputVal2, inputVal3, inputVal4;
- __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
- __m256 vmin_val = _mm256_set1_ps(min_val);
- __m256 vmax_val = _mm256_set1_ps(max_val);
- __m256i intInputVal;
-
- for(;number < thirtysecondPoints; number++){
- inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
- inputVal2 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
- inputVal3 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
- inputVal4 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
-
- inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
- inputVal2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
- inputVal3 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
- inputVal4 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
-
- intInputVal1 = _mm256_cvtps_epi32(inputVal1);
- intInputVal2 = _mm256_cvtps_epi32(inputVal2);
- intInputVal3 = _mm256_cvtps_epi32(inputVal3);
- intInputVal4 = _mm256_cvtps_epi32(inputVal4);
-
- intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
- intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
- intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
- intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
-
- intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
- intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
-
- _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal);
- outputVectorPtr += 32;
- }
-
- number = thirtysecondPoints * 32;
- for(; number < num_points; number++){
- r = inputVector[number] * scalar;
- volk_32f_s32f_convert_8i_single(&outputVector[number], r);
- }
+ unsigned int number = 0;
+
+ const unsigned int thirtysecondPoints = num_points / 32;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int8_t* outputVectorPtr = outputVector;
+
+ float min_val = CHAR_MIN;
+ float max_val = CHAR_MAX;
+ float r;
+
+ __m256 vScalar = _mm256_set1_ps(scalar);
+ __m256 inputVal1, inputVal2, inputVal3, inputVal4;
+ __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+ __m256 vmin_val = _mm256_set1_ps(min_val);
+ __m256 vmax_val = _mm256_set1_ps(max_val);
+ __m256i intInputVal;
+
+ for (; number < thirtysecondPoints; number++) {
+ inputVal1 = _mm256_load_ps(inputVectorPtr);
+ inputVectorPtr += 8;
+ inputVal2 = _mm256_load_ps(inputVectorPtr);
+ inputVectorPtr += 8;
+ inputVal3 = _mm256_load_ps(inputVectorPtr);
+ inputVectorPtr += 8;
+ inputVal4 = _mm256_load_ps(inputVectorPtr);
+ inputVectorPtr += 8;
+
+ inputVal1 = _mm256_max_ps(
+ _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+ inputVal2 = _mm256_max_ps(
+ _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+ inputVal3 = _mm256_max_ps(
+ _mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
+ inputVal4 = _mm256_max_ps(
+ _mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
+
+ intInputVal1 = _mm256_cvtps_epi32(inputVal1);
+ intInputVal2 = _mm256_cvtps_epi32(inputVal2);
+ intInputVal3 = _mm256_cvtps_epi32(inputVal3);
+ intInputVal4 = _mm256_cvtps_epi32(inputVal4);
+
+ intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
+ intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+ intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
+ intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
+
+ intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
+ intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+
+ _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal);
+ outputVectorPtr += 32;
+ }
+
+ number = thirtysecondPoints * 32;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void
-volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const float* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
-
- const unsigned int sixteenthPoints = num_points / 16;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int8_t* outputVectorPtr = outputVector;
-
- float min_val = CHAR_MIN;
- float max_val = CHAR_MAX;
- float r;
-
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 inputVal1, inputVal2, inputVal3, inputVal4;
- __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
- __m128 vmin_val = _mm_set_ps1(min_val);
- __m128 vmax_val = _mm_set_ps1(max_val);
-
- for(;number < sixteenthPoints; number++){
- inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
- inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
- inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
- inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-
- inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
- inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
- inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
- inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
-
- intInputVal1 = _mm_cvtps_epi32(inputVal1);
- intInputVal2 = _mm_cvtps_epi32(inputVal2);
- intInputVal3 = _mm_cvtps_epi32(inputVal3);
- intInputVal4 = _mm_cvtps_epi32(inputVal4);
-
- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
- intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
-
- intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
-
- _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
- outputVectorPtr += 16;
- }
+ unsigned int number = 0;
+
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int8_t* outputVectorPtr = outputVector;
+
+ float min_val = CHAR_MIN;
+ float max_val = CHAR_MAX;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+ __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ for (; number < sixteenthPoints; number++) {
+ inputVal1 = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+ inputVal2 = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+ inputVal3 = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+ inputVal4 = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ inputVal1 =
+ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+ inputVal2 =
+ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+ inputVal3 =
+ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
+ inputVal4 =
+ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
+
+ intInputVal1 = _mm_cvtps_epi32(inputVal1);
+ intInputVal2 = _mm_cvtps_epi32(inputVal2);
+ intInputVal3 = _mm_cvtps_epi32(inputVal3);
+ intInputVal4 = _mm_cvtps_epi32(inputVal4);
+
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+ intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
+
+ intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
+
+ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 16;
+ }
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- r = inputVector[number] * scalar;
- volk_32f_s32f_convert_8i_single(&outputVector[number], r);
- }
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+ }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const float* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- size_t inner_loop;
+ unsigned int number = 0;
+ size_t inner_loop;
- const unsigned int quarterPoints = num_points / 4;
+ const unsigned int quarterPoints = num_points / 4;
- const float* inputVectorPtr = (const float*)inputVector;
+ const float* inputVectorPtr = (const float*)inputVector;
- float min_val = CHAR_MIN;
- float max_val = CHAR_MAX;
- float r;
+ float min_val = CHAR_MIN;
+ float max_val = CHAR_MAX;
+ float r;
- int8_t* outputVectorPtr = outputVector;
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 ret;
- __m128 vmin_val = _mm_set_ps1(min_val);
- __m128 vmax_val = _mm_set_ps1(max_val);
+ int8_t* outputVectorPtr = outputVector;
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 ret;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
- for(;number < quarterPoints; number++){
- ret = _mm_load_ps(inputVectorPtr);
- inputVectorPtr += 4;
+ for (; number < quarterPoints; number++) {
+ ret = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
- ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
- _mm_store_ps(outputFloatBuffer, ret);
- for (inner_loop = 0; inner_loop < 4; inner_loop++){
- *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
+ _mm_store_ps(outputFloatBuffer, ret);
+ for (inner_loop = 0; inner_loop < 4; inner_loop++) {
+ *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
+ }
}
- }
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- r = inputVector[number] * scalar;
- volk_32f_s32f_convert_8i_single(&outputVector[number], r);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector, const float* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- const float* inputVectorPtr = inputVector;
- unsigned int number = 0;
- float r;
-
- for(number = 0; number < num_points; number++){
- r = *inputVectorPtr++ * scalar;
- volk_32f_s32f_convert_8i_single(&outputVector[number], r);
- }
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ float r;
+
+ for (number = 0; number < num_points; number++) {
+ r = *inputVectorPtr++ * scalar;
+ volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+ }
}
#endif /* LV_HAVE_GENERIC */
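/* A minimal sketch of the 8-bit conversion through its dispatcher, in the spirit
 * of the [-1,1] example sketched in the documentation above; the scale is
 * illustrative. */
#include <volk/volk.h>

static void convert_8i_example(const float* in, int8_t* out, unsigned int num_points)
{
    /* A scale of 127 maps [-1, 1] onto the full int8 range; anything beyond
     * that is saturated by the kernel before the narrowing store. */
    volk_32f_s32f_convert_8i(out, in, 127.0f, num_points);
}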
#include <volk/volk_32f_s32f_s32f_mod_range_32f.h>
#ifdef LV_HAVE_GENERIC
-static inline void volk_32f_s32f_mod_rangepuppet_32f_generic(float *output, const float *input, float bound, unsigned int num_points){
- volk_32f_s32f_s32f_mod_range_32f_generic(output, input, bound-3.141f, bound, num_points);
+static inline void volk_32f_s32f_mod_rangepuppet_32f_generic(float* output,
+ const float* input,
+ float bound,
+ unsigned int num_points)
+{
+ volk_32f_s32f_s32f_mod_range_32f_generic(
+ output, input, bound - 3.141f, bound, num_points);
}
#endif
#ifdef LV_HAVE_SSE
-static inline void volk_32f_s32f_mod_rangepuppet_32f_u_sse(float *output, const float *input, float bound, unsigned int num_points){
- volk_32f_s32f_s32f_mod_range_32f_u_sse(output, input, bound-3.141f, bound, num_points);
+static inline void volk_32f_s32f_mod_rangepuppet_32f_u_sse(float* output,
+ const float* input,
+ float bound,
+ unsigned int num_points)
+{
+ volk_32f_s32f_s32f_mod_range_32f_u_sse(
+ output, input, bound - 3.141f, bound, num_points);
}
#endif
#ifdef LV_HAVE_SSE
-static inline void volk_32f_s32f_mod_rangepuppet_32f_a_sse(float *output, const float *input, float bound, unsigned int num_points){
- volk_32f_s32f_s32f_mod_range_32f_a_sse(output, input, bound-3.141f, bound, num_points);
+static inline void volk_32f_s32f_mod_rangepuppet_32f_a_sse(float* output,
+ const float* input,
+ float bound,
+ unsigned int num_points)
+{
+ volk_32f_s32f_s32f_mod_range_32f_a_sse(
+ output, input, bound - 3.141f, bound, num_points);
}
#endif
#ifdef LV_HAVE_SSE2
-static inline void volk_32f_s32f_mod_rangepuppet_32f_u_sse2(float *output, const float *input, float bound, unsigned int num_points){
- volk_32f_s32f_s32f_mod_range_32f_u_sse2(output, input, bound-3.141f, bound, num_points);
+static inline void volk_32f_s32f_mod_rangepuppet_32f_u_sse2(float* output,
+ const float* input,
+ float bound,
+ unsigned int num_points)
+{
+ volk_32f_s32f_s32f_mod_range_32f_u_sse2(
+ output, input, bound - 3.141f, bound, num_points);
}
#endif
#ifdef LV_HAVE_SSE2
-static inline void volk_32f_s32f_mod_rangepuppet_32f_a_sse2(float *output, const float *input, float bound, unsigned int num_points){
- volk_32f_s32f_s32f_mod_range_32f_a_sse2(output, input, bound-3.141f, bound, num_points);
+static inline void volk_32f_s32f_mod_rangepuppet_32f_a_sse2(float* output,
+ const float* input,
+ float bound,
+ unsigned int num_points)
+{
+ volk_32f_s32f_s32f_mod_range_32f_a_sse2(
+ output, input, bound - 3.141f, bound, num_points);
}
#endif
#ifdef LV_HAVE_AVX
-static inline void volk_32f_s32f_mod_rangepuppet_32f_u_avx(float *output, const float *input, float bound, unsigned int num_points){
- volk_32f_s32f_s32f_mod_range_32f_u_avx(output, input, bound-3.141f, bound, num_points);
+static inline void volk_32f_s32f_mod_rangepuppet_32f_u_avx(float* output,
+ const float* input,
+ float bound,
+ unsigned int num_points)
+{
+ volk_32f_s32f_s32f_mod_range_32f_u_avx(
+ output, input, bound - 3.141f, bound, num_points);
}
#endif
#ifdef LV_HAVE_AVX
-static inline void volk_32f_s32f_mod_rangepuppet_32f_a_avx(float *output, const float *input, float bound, unsigned int num_points){
- volk_32f_s32f_s32f_mod_range_32f_a_avx(output, input, bound-3.141f, bound, num_points);
+static inline void volk_32f_s32f_mod_rangepuppet_32f_a_avx(float* output,
+ const float* input,
+ float bound,
+ unsigned int num_points)
+{
+ volk_32f_s32f_s32f_mod_range_32f_a_avx(
+ output, input, bound - 3.141f, bound, num_points);
}
#endif
#endif
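/* The _mod_rangepuppet_ wrappers above appear to exist for the QA/profiling
 * machinery: by pinning the lower bound to (bound - 3.141f) they let
 * volk_32f_s32f_s32f_mod_range_32f, which takes two scalar bounds, be exercised
 * through the standard single-scalar test signature.  A scalar sketch of the
 * wrap they delegate to (illustrative, not the library's implementation):
 *
 *   float range = upper - lower;
 *   while (x >= upper) x -= range;
 *   while (x <  lower) x += range;
 */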
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_s32f_multiply_32f(float* cVector, const float* aVector, const float scalar, unsigned int num_points)
- * \endcode
+ * void volk_32f_s32f_multiply_32f(float* cVector, const float* aVector, const float
+ * scalar, unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li aVector: The input vector of floats.
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_32f_s32f_multiply_32f_u_sse(float* cVector, const float* aVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector,
+ const float* aVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- float* cPtr = cVector;
- const float* aPtr = aVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
- __m128 aVal, bVal, cVal;
- bVal = _mm_set_ps1(scalar);
- for(;number < quarterPoints; number++){
- aVal = _mm_loadu_ps(aPtr);
+ __m128 aVal, bVal, cVal;
+ bVal = _mm_set_ps1(scalar);
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_loadu_ps(aPtr);
- cVal = _mm_mul_ps(aVal, bVal);
+ cVal = _mm_mul_ps(aVal, bVal);
- _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container
+ _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 4;
- cPtr += 4;
- }
+ aPtr += 4;
+ cPtr += 4;
+ }
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) * scalar;
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * scalar;
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32f_s32f_multiply_32f_u_avx(float* cVector, const float* aVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector,
+ const float* aVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- float* cPtr = cVector;
- const float* aPtr = aVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
- __m256 aVal, bVal, cVal;
- bVal = _mm256_set1_ps(scalar);
- for(;number < eighthPoints; number++){
+ __m256 aVal, bVal, cVal;
+ bVal = _mm256_set1_ps(scalar);
+ for (; number < eighthPoints; number++) {
- aVal = _mm256_loadu_ps(aPtr);
+ aVal = _mm256_loadu_ps(aPtr);
- cVal = _mm256_mul_ps(aVal, bVal);
+ cVal = _mm256_mul_ps(aVal, bVal);
- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
+ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 8;
- cPtr += 8;
- }
+ aPtr += 8;
+ cPtr += 8;
+ }
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) * scalar;
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * scalar;
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_s32f_multiply_32f_generic(float* cVector, const float* aVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_multiply_32f_generic(float* cVector,
+ const float* aVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const float* inputPtr = aVector;
- float* outputPtr = cVector;
- for(number = 0; number < num_points; number++){
- *outputPtr = (*inputPtr) * scalar;
- inputPtr++;
- outputPtr++;
- }
+ unsigned int number = 0;
+ const float* inputPtr = aVector;
+ float* outputPtr = cVector;
+ for (number = 0; number < num_points; number++) {
+ *outputPtr = (*inputPtr) * scalar;
+ inputPtr++;
+ outputPtr++;
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_32f_s32f_multiply_32f_a_sse(float* cVector, const float* aVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_multiply_32f_a_sse(float* cVector,
+ const float* aVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- float* cPtr = cVector;
- const float* aPtr = aVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
- __m128 aVal, bVal, cVal;
- bVal = _mm_set_ps1(scalar);
- for(;number < quarterPoints; number++){
- aVal = _mm_load_ps(aPtr);
+ __m128 aVal, bVal, cVal;
+ bVal = _mm_set_ps1(scalar);
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
- cVal = _mm_mul_ps(aVal, bVal);
+ cVal = _mm_mul_ps(aVal, bVal);
- _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 4;
- cPtr += 4;
- }
+ aPtr += 4;
+ cPtr += 4;
+ }
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) * scalar;
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * scalar;
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32f_s32f_multiply_32f_a_avx(float* cVector, const float* aVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector,
+ const float* aVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- float* cPtr = cVector;
- const float* aPtr = aVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
- __m256 aVal, bVal, cVal;
- bVal = _mm256_set1_ps(scalar);
- for(;number < eighthPoints; number++){
- aVal = _mm256_load_ps(aPtr);
+ __m256 aVal, bVal, cVal;
+ bVal = _mm256_set1_ps(scalar);
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
- cVal = _mm256_mul_ps(aVal, bVal);
+ cVal = _mm256_mul_ps(aVal, bVal);
- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container
+ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 8;
- cPtr += 8;
- }
+ aPtr += 8;
+ cPtr += 8;
+ }
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) * scalar;
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * scalar;
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_32f_s32f_multiply_32f_u_neon(float* cVector, const float* aVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_multiply_32f_u_neon(float* cVector,
+ const float* aVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const float* inputPtr = aVector;
- float* outputPtr = cVector;
- const unsigned int quarterPoints = num_points / 4;
-
- float32x4_t aVal, cVal;
-
- for(number = 0; number < quarterPoints; number++){
- aVal = vld1q_f32(inputPtr); // Load into NEON regs
- cVal = vmulq_n_f32 (aVal, scalar); // Do the multiply
- vst1q_f32(outputPtr, cVal); // Store results back to output
- inputPtr += 4;
- outputPtr += 4;
- }
- for(number = quarterPoints * 4; number < num_points; number++){
- *outputPtr++ = (*inputPtr++) * scalar;
- }
+ unsigned int number = 0;
+ const float* inputPtr = aVector;
+ float* outputPtr = cVector;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float32x4_t aVal, cVal;
+
+ for (number = 0; number < quarterPoints; number++) {
+ aVal = vld1q_f32(inputPtr); // Load into NEON regs
+ cVal = vmulq_n_f32(aVal, scalar); // Do the multiply
+ vst1q_f32(outputPtr, cVal); // Store results back to output
+ inputPtr += 4;
+ outputPtr += 4;
+ }
+ for (number = quarterPoints * 4; number < num_points; number++) {
+ *outputPtr++ = (*inputPtr++) * scalar;
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_s32f_multiply_32f_a_generic(float* cVector, const float* aVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_multiply_32f_a_generic(float* cVector,
+ const float* aVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const float* inputPtr = aVector;
- float* outputPtr = cVector;
- for(number = 0; number < num_points; number++){
- *outputPtr = (*inputPtr) * scalar;
- inputPtr++;
- outputPtr++;
- }
+ unsigned int number = 0;
+ const float* inputPtr = aVector;
+ float* outputPtr = cVector;
+ for (number = 0; number < num_points; number++) {
+ *outputPtr = (*inputPtr) * scalar;
+ inputPtr++;
+ outputPtr++;
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_ORC
-extern void
-volk_32f_s32f_multiply_32f_a_orc_impl(float* dst, const float* src,
- const float scalar, unsigned int num_points);
+extern void volk_32f_s32f_multiply_32f_a_orc_impl(float* dst,
+ const float* src,
+ const float scalar,
+ unsigned int num_points);
-static inline void
-volk_32f_s32f_multiply_32f_u_orc(float* cVector, const float* aVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32f_s32f_multiply_32f_u_orc(float* cVector,
+ const float* aVector,
+ const float scalar,
+ unsigned int num_points)
{
- volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points);
+ volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points);
}
#endif /* LV_HAVE_ORC */
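For reference, a minimal caller of the volk_32f_s32f_multiply_32f dispatcher documented above; the buffer contents are made up for illustration and aligned allocation (volk_malloc) is left out for brevity, since the dispatcher picks an aligned or unaligned implementation at run time:

#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    float in[8] = { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f };
    float out[8];

    /* Scale every element by 0.5 using whichever implementation
     * the dispatcher selects for this machine. */
    volk_32f_s32f_multiply_32f(out, in, 0.5f, 8);

    for (int i = 0; i < 8; i++)
        printf("%f\n", out[i]);
    return 0;
}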
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_s32f_normalize(float* vecBuffer, const float scalar, unsigned int num_points)
- * \endcode
+ * void volk_32f_s32f_normalize(float* vecBuffer, const float scalar, unsigned int
+ * num_points) \endcode
*
* \b Inputs
* \li vecBuffer: The buffer of values to be vectorized.
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void volk_32f_s32f_normalize_a_avx(float* vecBuffer, const float scalar, unsigned int num_points){
- unsigned int number = 0;
- float* inputPtr = vecBuffer;
+static inline void volk_32f_s32f_normalize_a_avx(float* vecBuffer,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ float* inputPtr = vecBuffer;
- const float invScalar = 1.0 / scalar;
- __m256 vecScalar = _mm256_set1_ps(invScalar);
+ const float invScalar = 1.0 / scalar;
+ __m256 vecScalar = _mm256_set1_ps(invScalar);
- __m256 input1;
+ __m256 input1;
- const uint64_t eighthPoints = num_points / 8;
- for(;number < eighthPoints; number++){
+ const uint64_t eighthPoints = num_points / 8;
+ for (; number < eighthPoints; number++) {
- input1 = _mm256_load_ps(inputPtr);
+ input1 = _mm256_load_ps(inputPtr);
- input1 = _mm256_mul_ps(input1, vecScalar);
+ input1 = _mm256_mul_ps(input1, vecScalar);
- _mm256_store_ps(inputPtr, input1);
+ _mm256_store_ps(inputPtr, input1);
- inputPtr += 8;
- }
+ inputPtr += 8;
+ }
- number = eighthPoints*8;
- for(; number < num_points; number++){
- *inputPtr *= invScalar;
- inputPtr++;
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *inputPtr *= invScalar;
+ inputPtr++;
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void volk_32f_s32f_normalize_a_sse(float* vecBuffer, const float scalar, unsigned int num_points){
- unsigned int number = 0;
- float* inputPtr = vecBuffer;
+static inline void volk_32f_s32f_normalize_a_sse(float* vecBuffer,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ float* inputPtr = vecBuffer;
- const float invScalar = 1.0 / scalar;
- __m128 vecScalar = _mm_set_ps1(invScalar);
+ const float invScalar = 1.0 / scalar;
+ __m128 vecScalar = _mm_set_ps1(invScalar);
- __m128 input1;
+ __m128 input1;
- const uint64_t quarterPoints = num_points / 4;
- for(;number < quarterPoints; number++){
+ const uint64_t quarterPoints = num_points / 4;
+ for (; number < quarterPoints; number++) {
- input1 = _mm_load_ps(inputPtr);
+ input1 = _mm_load_ps(inputPtr);
- input1 = _mm_mul_ps(input1, vecScalar);
+ input1 = _mm_mul_ps(input1, vecScalar);
- _mm_store_ps(inputPtr, input1);
+ _mm_store_ps(inputPtr, input1);
- inputPtr += 4;
- }
+ inputPtr += 4;
+ }
- number = quarterPoints*4;
- for(; number < num_points; number++){
- *inputPtr *= invScalar;
- inputPtr++;
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *inputPtr *= invScalar;
+ inputPtr++;
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void volk_32f_s32f_normalize_generic(float* vecBuffer, const float scalar, unsigned int num_points){
- unsigned int number = 0;
- float* inputPtr = vecBuffer;
- const float invScalar = 1.0 / scalar;
- for(number = 0; number < num_points; number++){
- *inputPtr *= invScalar;
- inputPtr++;
- }
+static inline void volk_32f_s32f_normalize_generic(float* vecBuffer,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ float* inputPtr = vecBuffer;
+ const float invScalar = 1.0 / scalar;
+ for (number = 0; number < num_points; number++) {
+ *inputPtr *= invScalar;
+ inputPtr++;
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_ORC
-extern void volk_32f_s32f_normalize_a_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points);
-static inline void volk_32f_s32f_normalize_u_orc(float* vecBuffer, const float scalar, unsigned int num_points){
+extern void volk_32f_s32f_normalize_a_orc_impl(float* dst,
+ float* src,
+ const float scalar,
+ unsigned int num_points);
+static inline void volk_32f_s32f_normalize_u_orc(float* vecBuffer,
+ const float scalar,
+ unsigned int num_points)
+{
float invscalar = 1.0 / scalar;
volk_32f_s32f_normalize_a_orc_impl(vecBuffer, vecBuffer, invscalar, num_points);
}
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void volk_32f_s32f_normalize_u_avx(float* vecBuffer, const float scalar, unsigned int num_points){
- unsigned int number = 0;
- float* inputPtr = vecBuffer;
+static inline void volk_32f_s32f_normalize_u_avx(float* vecBuffer,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ float* inputPtr = vecBuffer;
- const float invScalar = 1.0 / scalar;
- __m256 vecScalar = _mm256_set1_ps(invScalar);
+ const float invScalar = 1.0 / scalar;
+ __m256 vecScalar = _mm256_set1_ps(invScalar);
- __m256 input1;
+ __m256 input1;
- const uint64_t eighthPoints = num_points / 8;
- for(;number < eighthPoints; number++){
+ const uint64_t eighthPoints = num_points / 8;
+ for (; number < eighthPoints; number++) {
- input1 = _mm256_loadu_ps(inputPtr);
+ input1 = _mm256_loadu_ps(inputPtr);
- input1 = _mm256_mul_ps(input1, vecScalar);
+ input1 = _mm256_mul_ps(input1, vecScalar);
- _mm256_storeu_ps(inputPtr, input1);
+ _mm256_storeu_ps(inputPtr, input1);
- inputPtr += 8;
- }
+ inputPtr += 8;
+ }
- number = eighthPoints*8;
- for(; number < num_points; number++){
- *inputPtr *= invScalar;
- inputPtr++;
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *inputPtr *= invScalar;
+ inputPtr++;
+ }
}
#endif /* LV_HAVE_AVX */
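A minimal sketch of calling the in-place normalize dispatcher described above. Note that every implementation precomputes the reciprocal 1.0 / scalar and multiplies, rather than dividing each element:

#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    float buf[4] = { 2.f, 4.f, 6.f, 8.f };

    /* Divide the buffer by 2.0 in place (internally a multiply by 0.5). */
    volk_32f_s32f_normalize(buf, 2.0f, 4);

    for (int i = 0; i < 4; i++)
        printf("%f\n", buf[i]); /* expected: 1 2 3 4 */
    return 0;
}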
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_s32f_power_32f(float* cVector, const float* aVector, const float power, unsigned int num_points)
- * \endcode
+ * void volk_32f_s32f_power_32f(float* cVector, const float* aVector, const float power,
+ * unsigned int num_points) \endcode
*
* \b Inputs
* \li aVector: The input vector of floats.
#define INCLUDED_volk_32f_s32f_power_32f_a_H
#include <inttypes.h>
-#include <stdio.h>
#include <math.h>
+#include <stdio.h>
#ifdef LV_HAVE_SSE4_1
#include <tmmintrin.h>
#include <simdmath.h>
#endif /* LV_HAVE_LIB_SIMDMATH */
-static inline void
-volk_32f_s32f_power_32f_a_sse4_1(float* cVector, const float* aVector,
- const float power, unsigned int num_points)
+static inline void volk_32f_s32f_power_32f_a_sse4_1(float* cVector,
+ const float* aVector,
+ const float power,
+ unsigned int num_points)
{
- unsigned int number = 0;
+ unsigned int number = 0;
- float* cPtr = cVector;
- const float* aPtr = aVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
#ifdef LV_HAVE_LIB_SIMDMATH
- const unsigned int quarterPoints = num_points / 4;
- __m128 vPower = _mm_set_ps1(power);
- __m128 zeroValue = _mm_setzero_ps();
- __m128 signMask;
- __m128 negatedValues;
- __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
- __m128 onesMask = _mm_set_ps1(1);
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 vPower = _mm_set_ps1(power);
+ __m128 zeroValue = _mm_setzero_ps();
+ __m128 signMask;
+ __m128 negatedValues;
+ __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
+ __m128 onesMask = _mm_set_ps1(1);
- __m128 aVal, cVal;
- for(;number < quarterPoints; number++){
+ __m128 aVal, cVal;
+ for (; number < quarterPoints; number++) {
- aVal = _mm_load_ps(aPtr);
- signMask = _mm_cmplt_ps(aVal, zeroValue);
- negatedValues = _mm_sub_ps(zeroValue, aVal);
- aVal = _mm_blendv_ps(aVal, negatedValues, signMask);
+ aVal = _mm_load_ps(aPtr);
+ signMask = _mm_cmplt_ps(aVal, zeroValue);
+ negatedValues = _mm_sub_ps(zeroValue, aVal);
+ aVal = _mm_blendv_ps(aVal, negatedValues, signMask);
- // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after
- cVal = powf4(aVal, vPower); // Takes each input value to the specified power
+ // powf4 doesn't support negative values in the base, so we mask them off and then
+ // apply the negative after
+ cVal = powf4(aVal, vPower); // Takes each input value to the specified power
- cVal = _mm_mul_ps( _mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal);
+ cVal = _mm_mul_ps(_mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal);
- _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 4;
- cPtr += 4;
- }
+ aPtr += 4;
+ cPtr += 4;
+ }
- number = quarterPoints * 4;
+ number = quarterPoints * 4;
#endif /* LV_HAVE_LIB_SIMDMATH */
- for(;number < num_points; number++){
- *cPtr++ = powf((*aPtr++), power);
- }
+ for (; number < num_points; number++) {
+ *cPtr++ = powf((*aPtr++), power);
+ }
}
#endif /* LV_HAVE_SSE4_1 */
#include <simdmath.h>
#endif /* LV_HAVE_LIB_SIMDMATH */
-static inline void
-volk_32f_s32f_power_32f_a_sse(float* cVector, const float* aVector,
- const float power, unsigned int num_points)
+static inline void volk_32f_s32f_power_32f_a_sse(float* cVector,
+ const float* aVector,
+ const float power,
+ unsigned int num_points)
{
- unsigned int number = 0;
+ unsigned int number = 0;
- float* cPtr = cVector;
- const float* aPtr = aVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
#ifdef LV_HAVE_LIB_SIMDMATH
- const unsigned int quarterPoints = num_points / 4;
- __m128 vPower = _mm_set_ps1(power);
- __m128 zeroValue = _mm_setzero_ps();
- __m128 signMask;
- __m128 negatedValues;
- __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
- __m128 onesMask = _mm_set_ps1(1);
-
- __m128 aVal, cVal;
- for(;number < quarterPoints; number++){
-
- aVal = _mm_load_ps(aPtr);
- signMask = _mm_cmplt_ps(aVal, zeroValue);
- negatedValues = _mm_sub_ps(zeroValue, aVal);
- aVal = _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues) );
-
- // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after
- cVal = powf4(aVal, vPower); // Takes each input value to the specified power
-
- cVal = _mm_mul_ps( _mm_or_ps( _mm_andnot_ps(signMask, onesMask), _mm_and_ps(signMask, negativeOneToPower) ), cVal);
-
- _mm_store_ps(cPtr,cVal); // Store the results back into the C container
-
- aPtr += 4;
- cPtr += 4;
- }
-
- number = quarterPoints * 4;
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 vPower = _mm_set_ps1(power);
+ __m128 zeroValue = _mm_setzero_ps();
+ __m128 signMask;
+ __m128 negatedValues;
+ __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
+ __m128 onesMask = _mm_set_ps1(1);
+
+ __m128 aVal, cVal;
+ for (; number < quarterPoints; number++) {
+
+ aVal = _mm_load_ps(aPtr);
+ signMask = _mm_cmplt_ps(aVal, zeroValue);
+ negatedValues = _mm_sub_ps(zeroValue, aVal);
+ aVal =
+ _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues));
+
+ // powf4 doesn't support negative values in the base, so we mask them off and then
+ // apply the negative after
+ cVal = powf4(aVal, vPower); // Takes each input value to the specified power
+
+ cVal = _mm_mul_ps(_mm_or_ps(_mm_andnot_ps(signMask, onesMask),
+ _mm_and_ps(signMask, negativeOneToPower)),
+ cVal);
+
+ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
#endif /* LV_HAVE_LIB_SIMDMATH */
- for(;number < num_points; number++){
- *cPtr++ = powf((*aPtr++), power);
- }
+ for (; number < num_points; number++) {
+ *cPtr++ = powf((*aPtr++), power);
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_s32f_power_32f_generic(float* cVector, const float* aVector,
- const float power, unsigned int num_points)
+static inline void volk_32f_s32f_power_32f_generic(float* cVector,
+ const float* aVector,
+ const float power,
+ unsigned int num_points)
{
- float* cPtr = cVector;
- const float* aPtr = aVector;
- unsigned int number = 0;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
- for(number = 0; number < num_points; number++){
- *cPtr++ = powf((*aPtr++), power);
- }
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = powf((*aPtr++), power);
+ }
}
#endif /* LV_HAVE_GENERIC */
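The SSE paths above only vectorize when libsimdmath's powf4 is available (LV_HAVE_LIB_SIMDMATH); otherwise number stays at zero and the scalar powf tail loop handles every element, so the generic kernel is effectively what runs on most builds. A minimal caller, with made-up data:

#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    float in[4] = { 1.f, 2.f, 3.f, 4.f };
    float out[4];

    /* Raise each element to the power 2.0. */
    volk_32f_s32f_power_32f(out, in, 2.0f, 4);

    for (int i = 0; i < 4; i++)
        printf("%f\n", out[i]); /* expected: 1 4 9 16 */
    return 0;
}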
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_s32f_s32f_mod_range_32f(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
- * \endcode
+ * void volk_32f_s32f_s32f_mod_range_32f(float* outputVector, const float* inputVector,
+ * const float lower_bound, const float upper_bound, unsigned int num_points) \endcode
*
* \b Inputs
* \li inputVector: The input vector
#ifdef LV_HAVE_AVX
#include <xmmintrin.h>
-static inline void volk_32f_s32f_s32f_mod_range_32f_u_avx(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){
- __m256 lower = _mm256_set1_ps(lower_bound);
- __m256 upper = _mm256_set1_ps(upper_bound);
- __m256 distance = _mm256_sub_ps(upper,lower);
- float dist = upper_bound - lower_bound;
- __m256 input, output;
- __m256 is_smaller, is_bigger;
- __m256 excess, adj;
-
- const float *inPtr = inputVector;
- float *outPtr = outputVector;
- size_t eight_points = num_points / 8;
- size_t counter;
- for(counter = 0; counter < eight_points; counter++) {
- input = _mm256_loadu_ps(inPtr);
- // calculate mask: input < lower, input > upper
- is_smaller = _mm256_cmp_ps(input, lower, _CMP_LT_OQ); //0x11: Less than, ordered, non-signalling
- is_bigger = _mm256_cmp_ps(input, upper, _CMP_GT_OQ); //0x1e: greater than, ordered, non-signalling
- // find out how far we are out-of-bound – positive values!
- excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
- excess = _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
- // how many do we have to add? (int(excess/distance+1)*distance)
- excess = _mm256_div_ps(excess, distance);
- // round down
- excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
- // plus 1
- adj = _mm256_set1_ps(1.0f);
- excess = _mm256_add_ps(excess, adj);
- // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
- adj = _mm256_and_ps(adj, is_smaller);
- adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
- // scale by distance, sign
- excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
- output = _mm256_add_ps(input, excess);
- _mm256_storeu_ps(outPtr, output);
- inPtr += 8;
- outPtr += 8;
- }
-
- size_t cnt;
- for(cnt = eight_points * 8; cnt < num_points; cnt++){
- float val = inputVector[cnt];
- if(val < lower_bound){
- float excess = lower_bound - val;
- signed int count = (int)(excess/dist);
- outputVector[cnt] = val + (count+1)*dist;
+static inline void volk_32f_s32f_s32f_mod_range_32f_u_avx(float* outputVector,
+ const float* inputVector,
+ const float lower_bound,
+ const float upper_bound,
+ unsigned int num_points)
+{
+ __m256 lower = _mm256_set1_ps(lower_bound);
+ __m256 upper = _mm256_set1_ps(upper_bound);
+ __m256 distance = _mm256_sub_ps(upper, lower);
+ float dist = upper_bound - lower_bound;
+ __m256 input, output;
+ __m256 is_smaller, is_bigger;
+ __m256 excess, adj;
+
+ const float* inPtr = inputVector;
+ float* outPtr = outputVector;
+ size_t eight_points = num_points / 8;
+ size_t counter;
+ for (counter = 0; counter < eight_points; counter++) {
+ input = _mm256_loadu_ps(inPtr);
+ // calculate mask: input < lower, input > upper
+ is_smaller = _mm256_cmp_ps(
+ input, lower, _CMP_LT_OQ); // 0x11: Less than, ordered, non-signalling
+ is_bigger = _mm256_cmp_ps(
+ input, upper, _CMP_GT_OQ); // 0x1e: greater than, ordered, non-signalling
+ // find out how far we are out-of-bound – positive values!
+ excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
+ excess =
+ _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
+ // how many do we have to add? (int(excess/distance+1)*distance)
+ excess = _mm256_div_ps(excess, distance);
+ // round down
+ excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
+ // plus 1
+ adj = _mm256_set1_ps(1.0f);
+ excess = _mm256_add_ps(excess, adj);
+ // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
+ adj = _mm256_and_ps(adj, is_smaller);
+ adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
+ // scale by distance, sign
+ excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
+ output = _mm256_add_ps(input, excess);
+ _mm256_storeu_ps(outPtr, output);
+ inPtr += 8;
+ outPtr += 8;
}
- else if(val > upper_bound){
- float excess = val - upper_bound;
- signed int count = (int)(excess/dist);
- outputVector[cnt] = val - (count+1)*dist;
+
+ size_t cnt;
+ for (cnt = eight_points * 8; cnt < num_points; cnt++) {
+ float val = inputVector[cnt];
+ if (val < lower_bound) {
+ float excess = lower_bound - val;
+ signed int count = (int)(excess / dist);
+ outputVector[cnt] = val + (count + 1) * dist;
+ } else if (val > upper_bound) {
+ float excess = val - upper_bound;
+ signed int count = (int)(excess / dist);
+ outputVector[cnt] = val - (count + 1) * dist;
+ } else
+ outputVector[cnt] = val;
}
- else
- outputVector[cnt] = val;
- }
}
-static inline void volk_32f_s32f_s32f_mod_range_32f_a_avx(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){
- __m256 lower = _mm256_set1_ps(lower_bound);
- __m256 upper = _mm256_set1_ps(upper_bound);
- __m256 distance = _mm256_sub_ps(upper,lower);
- float dist = upper_bound - lower_bound;
- __m256 input, output;
- __m256 is_smaller, is_bigger;
- __m256 excess, adj;
-
- const float *inPtr = inputVector;
- float *outPtr = outputVector;
- size_t eight_points = num_points / 8;
- size_t counter;
- for(counter = 0; counter < eight_points; counter++) {
- input = _mm256_load_ps(inPtr);
- // calculate mask: input < lower, input > upper
- is_smaller = _mm256_cmp_ps(input, lower, _CMP_LT_OQ); //0x11: Less than, ordered, non-signalling
- is_bigger = _mm256_cmp_ps(input, upper, _CMP_GT_OQ); //0x1e: greater than, ordered, non-signalling
- // find out how far we are out-of-bound – positive values!
- excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
- excess = _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
- // how many do we have to add? (int(excess/distance+1)*distance)
- excess = _mm256_div_ps(excess, distance);
- // round down
- excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
- // plus 1
- adj = _mm256_set1_ps(1.0f);
- excess = _mm256_add_ps(excess, adj);
- // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
- adj = _mm256_and_ps(adj, is_smaller);
- adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
- // scale by distance, sign
- excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
- output = _mm256_add_ps(input, excess);
- _mm256_store_ps(outPtr, output);
- inPtr += 8;
- outPtr += 8;
- }
-
- size_t cnt;
- for(cnt = eight_points * 8; cnt < num_points; cnt++){
- float val = inputVector[cnt];
- if(val < lower_bound){
- float excess = lower_bound - val;
- signed int count = (int)(excess/dist);
- outputVector[cnt] = val + (count+1)*dist;
+static inline void volk_32f_s32f_s32f_mod_range_32f_a_avx(float* outputVector,
+ const float* inputVector,
+ const float lower_bound,
+ const float upper_bound,
+ unsigned int num_points)
+{
+ __m256 lower = _mm256_set1_ps(lower_bound);
+ __m256 upper = _mm256_set1_ps(upper_bound);
+ __m256 distance = _mm256_sub_ps(upper, lower);
+ float dist = upper_bound - lower_bound;
+ __m256 input, output;
+ __m256 is_smaller, is_bigger;
+ __m256 excess, adj;
+
+ const float* inPtr = inputVector;
+ float* outPtr = outputVector;
+ size_t eight_points = num_points / 8;
+ size_t counter;
+ for (counter = 0; counter < eight_points; counter++) {
+ input = _mm256_load_ps(inPtr);
+ // calculate mask: input < lower, input > upper
+ is_smaller = _mm256_cmp_ps(
+ input, lower, _CMP_LT_OQ); // 0x11: Less than, ordered, non-signalling
+ is_bigger = _mm256_cmp_ps(
+ input, upper, _CMP_GT_OQ); // 0x1e: greater than, ordered, non-signalling
+ // find out how far we are out-of-bound – positive values!
+ excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
+ excess =
+ _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
+ // how many do we have to add? (int(excess/distance+1)*distance)
+ excess = _mm256_div_ps(excess, distance);
+ // round down
+ excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
+ // plus 1
+ adj = _mm256_set1_ps(1.0f);
+ excess = _mm256_add_ps(excess, adj);
+ // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
+ adj = _mm256_and_ps(adj, is_smaller);
+ adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
+ // scale by distance, sign
+ excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
+ output = _mm256_add_ps(input, excess);
+ _mm256_store_ps(outPtr, output);
+ inPtr += 8;
+ outPtr += 8;
}
- else if(val > upper_bound){
- float excess = val - upper_bound;
- signed int count = (int)(excess/dist);
- outputVector[cnt] = val - (count+1)*dist;
+
+ size_t cnt;
+ for (cnt = eight_points * 8; cnt < num_points; cnt++) {
+ float val = inputVector[cnt];
+ if (val < lower_bound) {
+ float excess = lower_bound - val;
+ signed int count = (int)(excess / dist);
+ outputVector[cnt] = val + (count + 1) * dist;
+ } else if (val > upper_bound) {
+ float excess = val - upper_bound;
+ signed int count = (int)(excess / dist);
+ outputVector[cnt] = val - (count + 1) * dist;
+ } else
+ outputVector[cnt] = val;
}
- else
- outputVector[cnt] = val;
- }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE2
#include <xmmintrin.h>
-static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){
- __m128 lower = _mm_set_ps1(lower_bound);
- __m128 upper = _mm_set_ps1(upper_bound);
- __m128 distance = _mm_sub_ps(upper,lower);
- float dist = upper_bound - lower_bound;
- __m128 input, output;
- __m128 is_smaller, is_bigger;
- __m128 excess, adj;
-
- const float *inPtr = inputVector;
- float *outPtr = outputVector;
- size_t quarter_points = num_points / 4;
- size_t counter;
- for(counter = 0; counter < quarter_points; counter++) {
- input = _mm_load_ps(inPtr);
- // calculate mask: input < lower, input > upper
- is_smaller = _mm_cmplt_ps(input, lower);
- is_bigger = _mm_cmpgt_ps(input, upper);
- // find out how far we are out-of-bound – positive values!
- excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
- excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
- // how many do we have to add? (int(excess/distance+1)*distance)
- excess = _mm_div_ps(excess, distance);
- // round down
- excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
- // plus 1
- adj = _mm_set_ps1(1.0f);
- excess = _mm_add_ps(excess, adj);
- // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
- adj = _mm_and_ps(adj, is_smaller);
- adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
- // scale by distance, sign
- excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
- output = _mm_add_ps(input, excess);
- _mm_store_ps(outPtr, output);
- inPtr += 4;
- outPtr += 4;
- }
-
- size_t cnt;
- for(cnt = quarter_points * 4; cnt < num_points; cnt++){
- float val = inputVector[cnt];
- if(val < lower_bound){
- float excess = lower_bound - val;
- signed int count = (int)(excess/dist);
- outputVector[cnt] = val + (count+1)*dist;
+static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float* outputVector,
+ const float* inputVector,
+ const float lower_bound,
+ const float upper_bound,
+ unsigned int num_points)
+{
+ __m128 lower = _mm_set_ps1(lower_bound);
+ __m128 upper = _mm_set_ps1(upper_bound);
+ __m128 distance = _mm_sub_ps(upper, lower);
+ float dist = upper_bound - lower_bound;
+ __m128 input, output;
+ __m128 is_smaller, is_bigger;
+ __m128 excess, adj;
+
+ const float* inPtr = inputVector;
+ float* outPtr = outputVector;
+ size_t quarter_points = num_points / 4;
+ size_t counter;
+ for (counter = 0; counter < quarter_points; counter++) {
+ input = _mm_load_ps(inPtr);
+ // calculate mask: input < lower, input > upper
+ is_smaller = _mm_cmplt_ps(input, lower);
+ is_bigger = _mm_cmpgt_ps(input, upper);
+ // find out how far we are out-of-bound – positive values!
+ excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
+ excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
+ // how many do we have to add? (int(excess/distance+1)*distance)
+ excess = _mm_div_ps(excess, distance);
+ // round down
+ excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
+ // plus 1
+ adj = _mm_set_ps1(1.0f);
+ excess = _mm_add_ps(excess, adj);
+ // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
+ adj = _mm_and_ps(adj, is_smaller);
+ adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
+ // scale by distance, sign
+ excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
+ output = _mm_add_ps(input, excess);
+ _mm_store_ps(outPtr, output);
+ inPtr += 4;
+ outPtr += 4;
}
- else if(val > upper_bound){
- float excess = val - upper_bound;
- signed int count = (int)(excess/dist);
- outputVector[cnt] = val - (count+1)*dist;
+
+ size_t cnt;
+ for (cnt = quarter_points * 4; cnt < num_points; cnt++) {
+ float val = inputVector[cnt];
+ if (val < lower_bound) {
+ float excess = lower_bound - val;
+ signed int count = (int)(excess / dist);
+ outputVector[cnt] = val + (count + 1) * dist;
+ } else if (val > upper_bound) {
+ float excess = val - upper_bound;
+ signed int count = (int)(excess / dist);
+ outputVector[cnt] = val - (count + 1) * dist;
+ } else
+ outputVector[cnt] = val;
}
- else
- outputVector[cnt] = val;
- }
}
-static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){
- __m128 lower = _mm_set_ps1(lower_bound);
- __m128 upper = _mm_set_ps1(upper_bound);
- __m128 distance = _mm_sub_ps(upper,lower);
- __m128 input, output;
- __m128 is_smaller, is_bigger;
- __m128 excess, adj;
-
- const float *inPtr = inputVector;
- float *outPtr = outputVector;
- size_t quarter_points = num_points / 4;
- size_t counter;
- for(counter = 0; counter < quarter_points; counter++) {
- input = _mm_load_ps(inPtr);
- // calculate mask: input < lower, input > upper
- is_smaller = _mm_cmplt_ps(input, lower);
- is_bigger = _mm_cmpgt_ps(input, upper);
- // find out how far we are out-of-bound – positive values!
- excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
- excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
- // how many do we have to add? (int(excess/distance+1)*distance)
- excess = _mm_div_ps(excess, distance);
- // round down – for some reason, SSE doesn't come with a 4x float -> 4x int32 conversion.
- excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
- // plus 1
- adj = _mm_set_ps1(1.0f);
- excess = _mm_add_ps(excess, adj);
- // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
- adj = _mm_and_ps(adj, is_smaller);
- adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
- // scale by distance, sign
- excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
- output = _mm_add_ps(input, excess);
- _mm_store_ps(outPtr, output);
- inPtr += 4;
- outPtr += 4;
- }
-
- float dist = upper_bound - lower_bound;
- size_t cnt;
- for(cnt = quarter_points * 4; cnt < num_points; cnt++){
- float val = inputVector[cnt];
- if(val < lower_bound){
- float excess = lower_bound - val;
- signed int count = (int)(excess/dist);
- outputVector[cnt] = val + (count+1)*dist;
+static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float* outputVector,
+ const float* inputVector,
+ const float lower_bound,
+ const float upper_bound,
+ unsigned int num_points)
+{
+ __m128 lower = _mm_set_ps1(lower_bound);
+ __m128 upper = _mm_set_ps1(upper_bound);
+ __m128 distance = _mm_sub_ps(upper, lower);
+ __m128 input, output;
+ __m128 is_smaller, is_bigger;
+ __m128 excess, adj;
+
+ const float* inPtr = inputVector;
+ float* outPtr = outputVector;
+ size_t quarter_points = num_points / 4;
+ size_t counter;
+ for (counter = 0; counter < quarter_points; counter++) {
+ input = _mm_load_ps(inPtr);
+ // calculate mask: input < lower, input > upper
+ is_smaller = _mm_cmplt_ps(input, lower);
+ is_bigger = _mm_cmpgt_ps(input, upper);
+ // find out how far we are out-of-bound – positive values!
+ excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
+ excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
+ // how many do we have to add? (int(excess/distance+1)*distance)
+ excess = _mm_div_ps(excess, distance);
+ // round down – for some reason, SSE doesn't come with a 4x float -> 4x int32
+ // conversion.
+ excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
+ // plus 1
+ adj = _mm_set_ps1(1.0f);
+ excess = _mm_add_ps(excess, adj);
+ // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
+ adj = _mm_and_ps(adj, is_smaller);
+ adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
+ // scale by distance, sign
+ excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
+ output = _mm_add_ps(input, excess);
+ _mm_store_ps(outPtr, output);
+ inPtr += 4;
+ outPtr += 4;
}
- else if(val > upper_bound){
- float excess = val - upper_bound;
- signed int count = (int)(excess/dist);
- outputVector[cnt] = val - (count+1)*dist;
+
+ float dist = upper_bound - lower_bound;
+ size_t cnt;
+ for (cnt = quarter_points * 4; cnt < num_points; cnt++) {
+ float val = inputVector[cnt];
+ if (val < lower_bound) {
+ float excess = lower_bound - val;
+ signed int count = (int)(excess / dist);
+ outputVector[cnt] = val + (count + 1) * dist;
+ } else if (val > upper_bound) {
+ float excess = val - upper_bound;
+ signed int count = (int)(excess / dist);
+ outputVector[cnt] = val - (count + 1) * dist;
+ } else
+ outputVector[cnt] = val;
}
- else
- outputVector[cnt] = val;
- }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){
- __m128 lower = _mm_set_ps1(lower_bound);
- __m128 upper = _mm_set_ps1(upper_bound);
- __m128 distance = _mm_sub_ps(upper,lower);
- float dist = upper_bound - lower_bound;
- __m128 input, output;
- __m128 is_smaller, is_bigger;
- __m128 excess, adj;
- __m128i rounddown;
-
- const float *inPtr = inputVector;
- float *outPtr = outputVector;
- size_t quarter_points = num_points / 4;
- size_t counter;
- for(counter = 0; counter < quarter_points; counter++) {
- input = _mm_load_ps(inPtr);
- // calculate mask: input < lower, input > upper
- is_smaller = _mm_cmplt_ps(input, lower);
- is_bigger = _mm_cmpgt_ps(input, upper);
- // find out how far we are out-of-bound – positive values!
- excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
- excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
- // how many do we have to add? (int(excess/distance+1)*distance)
- excess = _mm_div_ps(excess, distance);
- // round down – for some reason
- rounddown = _mm_cvttps_epi32(excess);
- excess = _mm_cvtepi32_ps(rounddown);
- // plus 1
- adj = _mm_set_ps1(1.0f);
- excess = _mm_add_ps(excess, adj);
- // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
- adj = _mm_and_ps(adj, is_smaller);
- adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
- // scale by distance, sign
- excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
- output = _mm_add_ps(input, excess);
- _mm_store_ps(outPtr, output);
- inPtr += 4;
- outPtr += 4;
- }
-
- size_t cnt;
- for(cnt = quarter_points * 4; cnt < num_points; cnt++){
- float val = inputVector[cnt];
- if(val < lower_bound){
- float excess = lower_bound - val;
- signed int count = (int)(excess/dist);
- outputVector[cnt] = val + (count+1)*dist;
+static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse(float* outputVector,
+ const float* inputVector,
+ const float lower_bound,
+ const float upper_bound,
+ unsigned int num_points)
+{
+ __m128 lower = _mm_set_ps1(lower_bound);
+ __m128 upper = _mm_set_ps1(upper_bound);
+ __m128 distance = _mm_sub_ps(upper, lower);
+ float dist = upper_bound - lower_bound;
+ __m128 input, output;
+ __m128 is_smaller, is_bigger;
+ __m128 excess, adj;
+ __m128i rounddown;
+
+ const float* inPtr = inputVector;
+ float* outPtr = outputVector;
+ size_t quarter_points = num_points / 4;
+ size_t counter;
+ for (counter = 0; counter < quarter_points; counter++) {
+ input = _mm_load_ps(inPtr);
+ // calculate mask: input < lower, input > upper
+ is_smaller = _mm_cmplt_ps(input, lower);
+ is_bigger = _mm_cmpgt_ps(input, upper);
+ // find out how far we are out-of-bound – positive values!
+ excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
+ excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
+ // how many do we have to add? (int(excess/distance+1)*distance)
+ excess = _mm_div_ps(excess, distance);
+ // round down – for some reason
+ rounddown = _mm_cvttps_epi32(excess);
+ excess = _mm_cvtepi32_ps(rounddown);
+ // plus 1
+ adj = _mm_set_ps1(1.0f);
+ excess = _mm_add_ps(excess, adj);
+ // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
+ adj = _mm_and_ps(adj, is_smaller);
+ adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
+ // scale by distance, sign
+ excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
+ output = _mm_add_ps(input, excess);
+ _mm_store_ps(outPtr, output);
+ inPtr += 4;
+ outPtr += 4;
}
- else if(val > upper_bound){
- float excess = val - upper_bound;
- signed int count = (int)(excess/dist);
- outputVector[cnt] = val - (count+1)*dist;
+
+ size_t cnt;
+ for (cnt = quarter_points * 4; cnt < num_points; cnt++) {
+ float val = inputVector[cnt];
+ if (val < lower_bound) {
+ float excess = lower_bound - val;
+ signed int count = (int)(excess / dist);
+ outputVector[cnt] = val + (count + 1) * dist;
+ } else if (val > upper_bound) {
+ float excess = val - upper_bound;
+ signed int count = (int)(excess / dist);
+ outputVector[cnt] = val - (count + 1) * dist;
+ } else
+ outputVector[cnt] = val;
}
- else
- outputVector[cnt] = val;
- }
}
-static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){
- __m128 lower = _mm_set_ps1(lower_bound);
- __m128 upper = _mm_set_ps1(upper_bound);
- __m128 distance = _mm_sub_ps(upper,lower);
- __m128 input, output;
- __m128 is_smaller, is_bigger;
- __m128 excess, adj;
- __m128i rounddown;
-
- const float *inPtr = inputVector;
- float *outPtr = outputVector;
- size_t quarter_points = num_points / 4;
- size_t counter;
- for(counter = 0; counter < quarter_points; counter++) {
- input = _mm_load_ps(inPtr);
- // calculate mask: input < lower, input > upper
- is_smaller = _mm_cmplt_ps(input, lower);
- is_bigger = _mm_cmpgt_ps(input, upper);
- // find out how far we are out-of-bound – positive values!
- excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
- excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
- // how many do we have to add? (int(excess/distance+1)*distance)
- excess = _mm_div_ps(excess, distance);
- // round down
- rounddown = _mm_cvttps_epi32(excess);
- excess = _mm_cvtepi32_ps(rounddown);
- // plus 1
- adj = _mm_set_ps1(1.0f);
- excess = _mm_add_ps(excess, adj);
- // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
- adj = _mm_and_ps(adj, is_smaller);
- adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
- // scale by distance, sign
- excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
- output = _mm_add_ps(input, excess);
- _mm_store_ps(outPtr, output);
- inPtr += 4;
- outPtr += 4;
- }
-
- float dist = upper_bound - lower_bound;
- size_t cnt;
- for(cnt = quarter_points * 4; cnt < num_points; cnt++){
- float val = inputVector[cnt];
- if(val < lower_bound){
- float excess = lower_bound - val;
- signed int count = (int)(excess/dist);
- outputVector[cnt] = val + (count+1)*dist;
+static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse(float* outputVector,
+ const float* inputVector,
+ const float lower_bound,
+ const float upper_bound,
+ unsigned int num_points)
+{
+ __m128 lower = _mm_set_ps1(lower_bound);
+ __m128 upper = _mm_set_ps1(upper_bound);
+ __m128 distance = _mm_sub_ps(upper, lower);
+ __m128 input, output;
+ __m128 is_smaller, is_bigger;
+ __m128 excess, adj;
+ __m128i rounddown;
+
+ const float* inPtr = inputVector;
+ float* outPtr = outputVector;
+ size_t quarter_points = num_points / 4;
+ size_t counter;
+ for (counter = 0; counter < quarter_points; counter++) {
+ input = _mm_load_ps(inPtr);
+ // calculate mask: input < lower, input > upper
+ is_smaller = _mm_cmplt_ps(input, lower);
+ is_bigger = _mm_cmpgt_ps(input, upper);
+ // find out how far we are out-of-bound – positive values!
+ excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
+ excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
+ // how many do we have to add? (int(excess/distance+1)*distance)
+ excess = _mm_div_ps(excess, distance);
+ // round down
+ rounddown = _mm_cvttps_epi32(excess);
+ excess = _mm_cvtepi32_ps(rounddown);
+ // plus 1
+ adj = _mm_set_ps1(1.0f);
+ excess = _mm_add_ps(excess, adj);
+ // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
+ adj = _mm_and_ps(adj, is_smaller);
+ adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
+ // scale by distance, sign
+ excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
+ output = _mm_add_ps(input, excess);
+ _mm_store_ps(outPtr, output);
+ inPtr += 4;
+ outPtr += 4;
}
- else if(val > upper_bound){
- float excess = val - upper_bound;
- signed int count = (int)(excess/dist);
- outputVector[cnt] = val - (count+1)*dist;
+
+ float dist = upper_bound - lower_bound;
+ size_t cnt;
+ for (cnt = quarter_points * 4; cnt < num_points; cnt++) {
+ float val = inputVector[cnt];
+ if (val < lower_bound) {
+ float excess = lower_bound - val;
+ signed int count = (int)(excess / dist);
+ outputVector[cnt] = val + (count + 1) * dist;
+ } else if (val > upper_bound) {
+ float excess = val - upper_bound;
+ signed int count = (int)(excess / dist);
+ outputVector[cnt] = val - (count + 1) * dist;
+ } else
+ outputVector[cnt] = val;
}
- else
- outputVector[cnt] = val;
- }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void volk_32f_s32f_s32f_mod_range_32f_generic(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){
- float* outPtr = outputVector;
- const float *inPtr;
- float distance = upper_bound - lower_bound;
-
- for(inPtr = inputVector; inPtr < inputVector + num_points; inPtr++){
- float val = *inPtr;
- if(val < lower_bound){
- float excess = lower_bound - val;
- signed int count = (int)(excess/distance);
- *outPtr = val + (count+1)*distance;
- }
- else if(val > upper_bound){
- float excess = val - upper_bound;
- signed int count = (int)(excess/distance);
- *outPtr = val - (count+1)*distance;
+static inline void volk_32f_s32f_s32f_mod_range_32f_generic(float* outputVector,
+ const float* inputVector,
+ const float lower_bound,
+ const float upper_bound,
+ unsigned int num_points)
+{
+ float* outPtr = outputVector;
+ const float* inPtr;
+ float distance = upper_bound - lower_bound;
+
+ for (inPtr = inputVector; inPtr < inputVector + num_points; inPtr++) {
+ float val = *inPtr;
+ if (val < lower_bound) {
+ float excess = lower_bound - val;
+ signed int count = (int)(excess / distance);
+ *outPtr = val + (count + 1) * distance;
+ } else if (val > upper_bound) {
+ float excess = val - upper_bound;
+ signed int count = (int)(excess / distance);
+ *outPtr = val - (count + 1) * distance;
+ } else
+ *outPtr = val;
+ outPtr++;
}
- else
- *outPtr = val;
- outPtr++;
- }
}
#endif /* LV_HAVE_GENERIC */
-
-
#endif /* INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H */
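All of the mod_range implementations above share the wrap rule spelled out in the generic kernel: measure how far a sample lies outside [lower_bound, upper_bound], count how many whole range widths that excess spans, and add or subtract (count + 1) widths to pull the sample back inside. A standalone scalar restatement of that rule (illustrative helper only):

/* Wrap one value into [lower, upper], mirroring the generic kernel's
 * count = (int)(excess / dist); val +/- (count + 1) * dist; rule. */
static float wrap_into_range(float val, float lower, float upper)
{
    const float dist = upper - lower;
    if (val < lower) {
        int count = (int)((lower - val) / dist);
        return val + (count + 1) * dist;
    } else if (val > upper) {
        int count = (int)((val - upper) / dist);
        return val - (count + 1) * dist;
    }
    return val;
}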
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_s32f_stddev_32f(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points)
- * \endcode
+ * void volk_32f_s32f_stddev_32f(float* stddev, const float* inputBuffer, const float
+ * mean, unsigned int num_points) \endcode
*
* \b Inputs
* \li inputBuffer: The input vector of floats.
#ifndef INCLUDED_volk_32f_s32f_stddev_32f_a_H
#define INCLUDED_volk_32f_s32f_stddev_32f_a_H
-#include <volk/volk_common.h>
#include <inttypes.h>
-#include <stdio.h>
#include <math.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
-static inline void
-volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev, const float* inputBuffer,
- const float mean, unsigned int num_points)
+static inline void volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev,
+ const float* inputBuffer,
+ const float mean,
+ unsigned int num_points)
{
- float returnValue = 0;
- if(num_points > 0){
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- const float* aPtr = inputBuffer;
-
- __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
-
- __m128 squareAccumulator = _mm_setzero_ps();
- __m128 aVal1, aVal2, aVal3, aVal4;
- __m128 cVal1, cVal2, cVal3, cVal4;
- for(;number < sixteenthPoints; number++) {
- aVal1 = _mm_load_ps(aPtr); aPtr += 4;
- cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
-
- aVal2 = _mm_load_ps(aPtr); aPtr += 4;
- cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
-
- aVal3 = _mm_load_ps(aPtr); aPtr += 4;
- cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
-
- aVal4 = _mm_load_ps(aPtr); aPtr += 4;
- cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
-
- cVal1 = _mm_or_ps(cVal1, cVal2);
- cVal3 = _mm_or_ps(cVal3, cVal4);
- cVal1 = _mm_or_ps(cVal1, cVal3);
-
- squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+ float returnValue = 0;
+ if (num_points > 0) {
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ const float* aPtr = inputBuffer;
+
+ __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
+
+ __m128 squareAccumulator = _mm_setzero_ps();
+ __m128 aVal1, aVal2, aVal3, aVal4;
+ __m128 cVal1, cVal2, cVal3, cVal4;
+ for (; number < sixteenthPoints; number++) {
+ aVal1 = _mm_load_ps(aPtr);
+ aPtr += 4;
+ cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
+
+ aVal2 = _mm_load_ps(aPtr);
+ aPtr += 4;
+ cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
+
+ aVal3 = _mm_load_ps(aPtr);
+ aPtr += 4;
+ cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
+
+ aVal4 = _mm_load_ps(aPtr);
+ aPtr += 4;
+ cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
+
+ cVal1 = _mm_or_ps(cVal1, cVal2);
+ cVal3 = _mm_or_ps(cVal3, cVal4);
+ cVal1 = _mm_or_ps(cVal1, cVal3);
+
+ squareAccumulator =
+ _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+ }
+ _mm_store_ps(squareBuffer,
+ squareAccumulator); // Store the results back into the C container
+ returnValue = squareBuffer[0];
+ returnValue += squareBuffer[1];
+ returnValue += squareBuffer[2];
+ returnValue += squareBuffer[3];
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ returnValue += (*aPtr) * (*aPtr);
+ aPtr++;
+ }
+ returnValue /= num_points;
+ returnValue -= (mean * mean);
+ returnValue = sqrtf(returnValue);
}
- _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
- returnValue = squareBuffer[0];
- returnValue += squareBuffer[1];
- returnValue += squareBuffer[2];
- returnValue += squareBuffer[3];
-
- number = sixteenthPoints * 16;
- for(;number < num_points; number++){
- returnValue += (*aPtr) * (*aPtr);
- aPtr++;
- }
- returnValue /= num_points;
- returnValue -= (mean * mean);
- returnValue = sqrtf(returnValue);
- }
- *stddev = returnValue;
+ *stddev = returnValue;
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_32f_s32f_stddev_32f_a_sse(float* stddev, const float* inputBuffer,
- const float mean, unsigned int num_points)
+static inline void volk_32f_s32f_stddev_32f_a_sse(float* stddev,
+ const float* inputBuffer,
+ const float mean,
+ unsigned int num_points)
{
- float returnValue = 0;
- if(num_points > 0){
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- const float* aPtr = inputBuffer;
-
- __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
-
- __m128 squareAccumulator = _mm_setzero_ps();
- __m128 aVal = _mm_setzero_ps();
- for(;number < quarterPoints; number++) {
- aVal = _mm_load_ps(aPtr); // aVal = x
- aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2
- squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
- aPtr += 4;
- }
- _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
- returnValue = squareBuffer[0];
- returnValue += squareBuffer[1];
- returnValue += squareBuffer[2];
- returnValue += squareBuffer[3];
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- returnValue += (*aPtr) * (*aPtr);
- aPtr++;
+ float returnValue = 0;
+ if (num_points > 0) {
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* aPtr = inputBuffer;
+
+ __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
+
+ __m128 squareAccumulator = _mm_setzero_ps();
+ __m128 aVal = _mm_setzero_ps();
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr); // aVal = x
+ aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2
+ squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
+ aPtr += 4;
+ }
+ _mm_store_ps(squareBuffer,
+ squareAccumulator); // Store the results back into the C container
+ returnValue = squareBuffer[0];
+ returnValue += squareBuffer[1];
+ returnValue += squareBuffer[2];
+ returnValue += squareBuffer[3];
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ returnValue += (*aPtr) * (*aPtr);
+ aPtr++;
+ }
+ returnValue /= num_points;
+ returnValue -= (mean * mean);
+ returnValue = sqrtf(returnValue);
}
- returnValue /= num_points;
- returnValue -= (mean * mean);
- returnValue = sqrtf(returnValue);
- }
- *stddev = returnValue;
+ *stddev = returnValue;
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32f_s32f_stddev_32f_a_avx(float* stddev, const float* inputBuffer,
- const float mean, unsigned int num_points)
+static inline void volk_32f_s32f_stddev_32f_a_avx(float* stddev,
+ const float* inputBuffer,
+ const float mean,
+ unsigned int num_points)
{
- float stdDev = 0;
- if(num_points > 0){
- unsigned int number = 0;
- const unsigned int thirtySecondthPoints = num_points / 32;
-
- const float* aPtr = inputBuffer;
- __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
-
- __m256 squareAccumulator = _mm256_setzero_ps();
- __m256 aVal1, aVal2, aVal3, aVal4;
- __m256 cVal1, cVal2, cVal3, cVal4;
- for(;number < thirtySecondthPoints; number++) {
- aVal1 = _mm256_load_ps(aPtr); aPtr += 8;
- cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
-
- aVal2 = _mm256_load_ps(aPtr); aPtr += 8;
- cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
-
- aVal3 = _mm256_load_ps(aPtr); aPtr += 8;
- cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
-
- aVal4 = _mm256_load_ps(aPtr); aPtr += 8;
- cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
-
- cVal1 = _mm256_or_ps(cVal1, cVal2);
- cVal3 = _mm256_or_ps(cVal3, cVal4);
- cVal1 = _mm256_or_ps(cVal1, cVal3);
-
- squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+ float stdDev = 0;
+ if (num_points > 0) {
+ unsigned int number = 0;
+ const unsigned int thirtySecondthPoints = num_points / 32;
+
+ const float* aPtr = inputBuffer;
+ __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
+
+ __m256 squareAccumulator = _mm256_setzero_ps();
+ __m256 aVal1, aVal2, aVal3, aVal4;
+ __m256 cVal1, cVal2, cVal3, cVal4;
+ for (; number < thirtySecondthPoints; number++) {
+ aVal1 = _mm256_load_ps(aPtr);
+ aPtr += 8;
+ cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
+
+ aVal2 = _mm256_load_ps(aPtr);
+ aPtr += 8;
+ cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
+
+ aVal3 = _mm256_load_ps(aPtr);
+ aPtr += 8;
+ cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
+
+ aVal4 = _mm256_load_ps(aPtr);
+ aPtr += 8;
+ cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
+
+ cVal1 = _mm256_or_ps(cVal1, cVal2);
+ cVal3 = _mm256_or_ps(cVal3, cVal4);
+ cVal1 = _mm256_or_ps(cVal1, cVal3);
+
+ squareAccumulator =
+ _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+ }
+ _mm256_store_ps(squareBuffer,
+ squareAccumulator); // Store the results back into the C container
+ stdDev = squareBuffer[0];
+ stdDev += squareBuffer[1];
+ stdDev += squareBuffer[2];
+ stdDev += squareBuffer[3];
+ stdDev += squareBuffer[4];
+ stdDev += squareBuffer[5];
+ stdDev += squareBuffer[6];
+ stdDev += squareBuffer[7];
+
+ number = thirtySecondthPoints * 32;
+ for (; number < num_points; number++) {
+ stdDev += (*aPtr) * (*aPtr);
+ aPtr++;
+ }
+ stdDev /= num_points;
+ stdDev -= (mean * mean);
+ stdDev = sqrtf(stdDev);
}
- _mm256_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
- stdDev = squareBuffer[0];
- stdDev += squareBuffer[1];
- stdDev += squareBuffer[2];
- stdDev += squareBuffer[3];
- stdDev += squareBuffer[4];
- stdDev += squareBuffer[5];
- stdDev += squareBuffer[6];
- stdDev += squareBuffer[7];
-
- number = thirtySecondthPoints * 32;
- for(;number < num_points; number++){
- stdDev += (*aPtr) * (*aPtr);
- aPtr++;
- }
- stdDev /= num_points;
- stdDev -= (mean * mean);
- stdDev = sqrtf(stdDev);
- }
- *stddev = stdDev;
-
+ *stddev = stdDev;
}
#endif /* LV_HAVE_AVX */
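The aligned and unaligned AVX variants above lean on a dot-product trick: _mm256_dp_ps(v, v, mask) sums the four squared elements of each 128-bit lane, and the low nibble of the mask (0xF1, 0xF2, 0xF4, 0xF8) steers that per-lane sum into a different destination element while zeroing the others, so the four partial results can be merged with _mm256_or_ps instead of extra adds before one horizontal sum at the end. A minimal scalar sketch of the arithmetic this accumulates (the helper name is invented for illustration):

/* Scalar sketch: sum of squares gathered in four block partials,
 * mirroring the 4x-unrolled dp_ps accumulation, plus the scalar tail. */
static float sum_of_squares_sketch(const float* x, unsigned int n)
{
    float partial[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
    unsigned int i = 0;
    for (; i + 32 <= n; i += 32) {
        unsigned int b;
        for (b = 0; b < 4; b++) { /* one dp_ps result per block of 8 */
            const float* blk = x + i + 8 * b;
            unsigned int k;
            for (k = 0; k < 8; k++) {
                partial[b] += blk[k] * blk[k];
            }
        }
    }
    float total = partial[0] + partial[1] + partial[2] + partial[3];
    for (; i < n; i++) { /* remainder, as in the kernels' tail loop */
        total += x[i] * x[i];
    }
    return total;
}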
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_s32f_stddev_32f_generic(float* stddev, const float* inputBuffer,
- const float mean, unsigned int num_points)
+static inline void volk_32f_s32f_stddev_32f_generic(float* stddev,
+ const float* inputBuffer,
+ const float mean,
+ unsigned int num_points)
{
- float returnValue = 0;
- if(num_points > 0){
- const float* aPtr = inputBuffer;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- returnValue += (*aPtr) * (*aPtr);
- aPtr++;
+ float returnValue = 0;
+ if (num_points > 0) {
+ const float* aPtr = inputBuffer;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ returnValue += (*aPtr) * (*aPtr);
+ aPtr++;
+ }
+
+ returnValue /= num_points;
+ returnValue -= (mean * mean);
+ returnValue = sqrtf(returnValue);
}
-
- returnValue /= num_points;
- returnValue -= (mean * mean);
- returnValue = sqrtf(returnValue);
- }
- *stddev = returnValue;
+ *stddev = returnValue;
}
#endif /* LV_HAVE_GENERIC */
#ifndef INCLUDED_volk_32f_s32f_stddev_32f_u_H
#define INCLUDED_volk_32f_s32f_stddev_32f_u_H
-#include <volk/volk_common.h>
#include <inttypes.h>
-#include <stdio.h>
#include <math.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32f_s32f_stddev_32f_u_avx(float* stddev, const float* inputBuffer,
- const float mean, unsigned int num_points)
+static inline void volk_32f_s32f_stddev_32f_u_avx(float* stddev,
+ const float* inputBuffer,
+ const float mean,
+ unsigned int num_points)
{
- float stdDev = 0;
- if(num_points > 0){
- unsigned int number = 0;
- const unsigned int thirtySecondthPoints = num_points / 32;
-
- const float* aPtr = inputBuffer;
- __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
-
- __m256 squareAccumulator = _mm256_setzero_ps();
- __m256 aVal1, aVal2, aVal3, aVal4;
- __m256 cVal1, cVal2, cVal3, cVal4;
- for(;number < thirtySecondthPoints; number++) {
- aVal1 = _mm256_loadu_ps(aPtr); aPtr += 8;
- cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
-
- aVal2 = _mm256_loadu_ps(aPtr); aPtr += 8;
- cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
-
- aVal3 = _mm256_loadu_ps(aPtr); aPtr += 8;
- cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
-
- aVal4 = _mm256_loadu_ps(aPtr); aPtr += 8;
- cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
-
- cVal1 = _mm256_or_ps(cVal1, cVal2);
- cVal3 = _mm256_or_ps(cVal3, cVal4);
- cVal1 = _mm256_or_ps(cVal1, cVal3);
-
- squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+ float stdDev = 0;
+ if (num_points > 0) {
+ unsigned int number = 0;
+ const unsigned int thirtySecondthPoints = num_points / 32;
+
+ const float* aPtr = inputBuffer;
+ __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
+
+ __m256 squareAccumulator = _mm256_setzero_ps();
+ __m256 aVal1, aVal2, aVal3, aVal4;
+ __m256 cVal1, cVal2, cVal3, cVal4;
+ for (; number < thirtySecondthPoints; number++) {
+ aVal1 = _mm256_loadu_ps(aPtr);
+ aPtr += 8;
+ cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
+
+ aVal2 = _mm256_loadu_ps(aPtr);
+ aPtr += 8;
+ cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
+
+ aVal3 = _mm256_loadu_ps(aPtr);
+ aPtr += 8;
+ cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
+
+ aVal4 = _mm256_loadu_ps(aPtr);
+ aPtr += 8;
+ cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
+
+ cVal1 = _mm256_or_ps(cVal1, cVal2);
+ cVal3 = _mm256_or_ps(cVal3, cVal4);
+ cVal1 = _mm256_or_ps(cVal1, cVal3);
+
+ squareAccumulator =
+ _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+ }
+ _mm256_storeu_ps(
+ squareBuffer,
+ squareAccumulator); // Store the results back into the C container
+ stdDev = squareBuffer[0];
+ stdDev += squareBuffer[1];
+ stdDev += squareBuffer[2];
+ stdDev += squareBuffer[3];
+ stdDev += squareBuffer[4];
+ stdDev += squareBuffer[5];
+ stdDev += squareBuffer[6];
+ stdDev += squareBuffer[7];
+
+ number = thirtySecondthPoints * 32;
+ for (; number < num_points; number++) {
+ stdDev += (*aPtr) * (*aPtr);
+ aPtr++;
+ }
+ stdDev /= num_points;
+ stdDev -= (mean * mean);
+ stdDev = sqrtf(stdDev);
}
- _mm256_storeu_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
- stdDev = squareBuffer[0];
- stdDev += squareBuffer[1];
- stdDev += squareBuffer[2];
- stdDev += squareBuffer[3];
- stdDev += squareBuffer[4];
- stdDev += squareBuffer[5];
- stdDev += squareBuffer[6];
- stdDev += squareBuffer[7];
-
- number = thirtySecondthPoints * 32;
- for(;number < num_points; number++){
- stdDev += (*aPtr) * (*aPtr);
- aPtr++;
- }
- stdDev /= num_points;
- stdDev -= (mean * mean);
- stdDev = sqrtf(stdDev);
- }
- *stddev = stdDev;
-
+ *stddev = stdDev;
}
#endif /* LV_HAVE_AVX */
* \endcode
*/
-#include <stdio.h>
-#include <math.h>
#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
#ifndef INCLUDED_volk_32f_sin_32f_a_H
#define INCLUDED_volk_32f_sin_32f_a_H
static inline void
volk_32f_sin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int eighthPoints = num_points / 8;
- unsigned int i = 0;
-
- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
- __m256 sine, cosine, condition1, condition2;
- __m256i q, r, ones, twos, fours;
-
- m4pi = _mm256_set1_ps(1.273239545);
- pio4A = _mm256_set1_ps(0.78515625);
- pio4B = _mm256_set1_ps(0.241876e-3);
- ffours = _mm256_set1_ps(4.0);
- ftwos = _mm256_set1_ps(2.0);
- fones = _mm256_set1_ps(1.0);
- fzeroes = _mm256_setzero_ps();
- ones = _mm256_set1_epi32(1);
- twos = _mm256_set1_epi32(2);
- fours = _mm256_set1_epi32(4);
-
- cp1 = _mm256_set1_ps(1.0);
- cp2 = _mm256_set1_ps(0.83333333e-1);
- cp3 = _mm256_set1_ps(0.2777778e-2);
- cp4 = _mm256_set1_ps(0.49603e-4);
- cp5 = _mm256_set1_ps(0.551e-6);
-
- for(;number < eighthPoints; number++) {
- aVal = _mm256_load_ps(aPtr);
- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
-
- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
-
- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
- s = _mm256_mul_ps(s, s);
- // Evaluate Taylor series
- s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s);
-
- for(i = 0; i < 3; i++) {
- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ unsigned int i = 0;
+
+ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+ fzeroes;
+ __m256 sine, cosine, condition1, condition2;
+ __m256i q, r, ones, twos, fours;
+
+ m4pi = _mm256_set1_ps(1.273239545);
+ pio4A = _mm256_set1_ps(0.78515625);
+ pio4B = _mm256_set1_ps(0.241876e-3);
+ ffours = _mm256_set1_ps(4.0);
+ ftwos = _mm256_set1_ps(2.0);
+ fones = _mm256_set1_ps(1.0);
+ fzeroes = _mm256_setzero_ps();
+ ones = _mm256_set1_epi32(1);
+ twos = _mm256_set1_epi32(2);
+ fours = _mm256_set1_epi32(4);
+
+ cp1 = _mm256_set1_ps(1.0);
+ cp2 = _mm256_set1_ps(0.83333333e-1);
+ cp3 = _mm256_set1_ps(0.2777778e-2);
+ cp4 = _mm256_set1_ps(0.49603e-4);
+ cp5 = _mm256_set1_ps(0.551e-6);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ s = _mm256_sub_ps(aVal,
+ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+
+ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
+ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
+
+ s = _mm256_div_ps(
+ s,
+ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm256_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm256_mul_ps(
+ _mm256_fmadd_ps(
+ _mm256_fmsub_ps(
+ _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
+ s,
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++) {
+ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ }
+ s = _mm256_div_ps(s, ftwos);
+
+ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+ cosine = _mm256_sub_ps(fones, s);
+
+ condition1 = _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
+ fzeroes,
+ _CMP_NEQ_UQ);
+ condition2 = _mm256_cmp_ps(
+ _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
+ _CMP_NEQ_UQ);
+ // Need this condition only for cos
+ // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
+ // twos), fours)), fzeroes);
+
+ sine =
+ _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
+ sine = _mm256_sub_ps(
+ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+ _mm256_store_ps(bPtr, sine);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = sin(*aPtr++);
}
- s = _mm256_div_ps(s, ftwos);
-
- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
- cosine = _mm256_sub_ps(fones, s);
-
- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ);
- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ);
- // Need this condition only for cos
- //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
-
- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
- _mm256_store_ps(bPtr, sine);
- aPtr += 8;
- bPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++) {
- *bPtr++ = sin(*aPtr++);
- }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
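The sin kernels in this file (the one above and the variants that follow) share the same recipe, which the inline comments only hint at: take |x|, pick the octant with the 4/pi multiply, subtract the octant count times pi/4 in two parts (pio4A/pio4B) to keep precision, shrink the reduced angle by 2^3 = 8, approximate u = 2*(1 - cos(s)) with a short series in s^2, undo the shrink with three double-angle steps u <- u*(4 - u), and finally pick the sine or cosine branch and the sign from the octant index. A rough scalar rendering of those steps, for orientation only (it is not bit-exact with the vector kernels and the helper name is invented):

#include <math.h>

/* Scalar sketch of the vectorized sine above; like the kernels it only
 * handles moderate |x| (no extended-precision range reduction). */
static float volk_sin_sketch(float x)
{
    const float m4pi = 1.273239545f;  /* 4/pi                                */
    const float pio4A = 0.78515625f;  /* pi/4 split into two parts so the    */
    const float pio4B = 0.241876e-3f; /* reduction keeps extra precision     */

    float s = fabsf(x);
    int q = (int)floorf(s * m4pi); /* octant index                           */
    int r = q + (q & 1);           /* round up to an even multiple of pi/4   */
    s -= r * pio4A;                /* two-step (Cody-Waite style) reduction  */
    s -= r * pio4B;

    s /= 8.0f; /* 2^3: undone by the three double-angle steps below */
    float z = s * s;
    /* u ~= 2*(1 - cos(s)), short Taylor series in z = s^2 */
    float u = ((((0.551e-6f * z - 0.49603e-4f) * z + 0.2777778e-2f) * z -
                0.83333333e-1f) * z + 1.0f) * z;
    int i;
    for (i = 0; i < 3; i++) {
        u = u * (4.0f - u); /* double angle: 2*(1 - cos 2t) = u*(4 - u) */
    }
    u /= 2.0f;              /* u = 1 - cos(reduced angle)               */

    float sine = sqrtf((2.0f - u) * u); /* sin = sqrt(1 - cos^2)        */
    float cosine = 1.0f - u;

    if ((q + 1) & 2) {                  /* odd octant pair: take the cosine branch */
        sine = cosine;
    }
    if (((q & 4) != 0) != (x < 0.0f)) { /* quadrant / sign-of-x correction         */
        sine = -sine;
    }
    return sine;
}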
static inline void
volk_32f_sin_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int eighthPoints = num_points / 8;
- unsigned int i = 0;
-
- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
- __m256 sine, cosine, condition1, condition2;
- __m256i q, r, ones, twos, fours;
-
- m4pi = _mm256_set1_ps(1.273239545);
- pio4A = _mm256_set1_ps(0.78515625);
- pio4B = _mm256_set1_ps(0.241876e-3);
- ffours = _mm256_set1_ps(4.0);
- ftwos = _mm256_set1_ps(2.0);
- fones = _mm256_set1_ps(1.0);
- fzeroes = _mm256_setzero_ps();
- ones = _mm256_set1_epi32(1);
- twos = _mm256_set1_epi32(2);
- fours = _mm256_set1_epi32(4);
-
- cp1 = _mm256_set1_ps(1.0);
- cp2 = _mm256_set1_ps(0.83333333e-1);
- cp3 = _mm256_set1_ps(0.2777778e-2);
- cp4 = _mm256_set1_ps(0.49603e-4);
- cp5 = _mm256_set1_ps(0.551e-6);
-
- for(;number < eighthPoints; number++) {
- aVal = _mm256_load_ps(aPtr);
- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
-
- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
-
- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
- s = _mm256_mul_ps(s, s);
- // Evaluate Taylor series
- s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-
- for(i = 0; i < 3; i++) {
- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ unsigned int i = 0;
+
+ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+ fzeroes;
+ __m256 sine, cosine, condition1, condition2;
+ __m256i q, r, ones, twos, fours;
+
+ m4pi = _mm256_set1_ps(1.273239545);
+ pio4A = _mm256_set1_ps(0.78515625);
+ pio4B = _mm256_set1_ps(0.241876e-3);
+ ffours = _mm256_set1_ps(4.0);
+ ftwos = _mm256_set1_ps(2.0);
+ fones = _mm256_set1_ps(1.0);
+ fzeroes = _mm256_setzero_ps();
+ ones = _mm256_set1_epi32(1);
+ twos = _mm256_set1_epi32(2);
+ fours = _mm256_set1_epi32(4);
+
+ cp1 = _mm256_set1_ps(1.0);
+ cp2 = _mm256_set1_ps(0.83333333e-1);
+ cp3 = _mm256_set1_ps(0.2777778e-2);
+ cp4 = _mm256_set1_ps(0.49603e-4);
+ cp5 = _mm256_set1_ps(0.551e-6);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ s = _mm256_sub_ps(aVal,
+ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+
+ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
+ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
+
+ s = _mm256_div_ps(
+ s,
+ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm256_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm256_mul_ps(
+ _mm256_add_ps(
+ _mm256_mul_ps(
+ _mm256_sub_ps(
+ _mm256_mul_ps(
+ _mm256_add_ps(
+ _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
+ s),
+ cp3),
+ s),
+ cp2),
+ s),
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++) {
+ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ }
+ s = _mm256_div_ps(s, ftwos);
+
+ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+ cosine = _mm256_sub_ps(fones, s);
+
+ condition1 = _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
+ fzeroes,
+ _CMP_NEQ_UQ);
+ condition2 = _mm256_cmp_ps(
+ _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
+ _CMP_NEQ_UQ);
+ // Need this condition only for cos
+ // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
+ // twos), fours)), fzeroes);
+
+ sine =
+ _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
+ sine = _mm256_sub_ps(
+ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+ _mm256_store_ps(bPtr, sine);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = sin(*aPtr++);
}
- s = _mm256_div_ps(s, ftwos);
-
- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
- cosine = _mm256_sub_ps(fones, s);
-
- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ);
- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ);
- // Need this condition only for cos
- //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
-
- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
- _mm256_store_ps(bPtr, sine);
- aPtr += 8;
- bPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++) {
- *bPtr++ = sin(*aPtr++);
- }
}
#endif /* LV_HAVE_AVX2 for aligned */
static inline void
volk_32f_sin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int quarterPoints = num_points / 4;
- unsigned int i = 0;
-
- __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
- __m128 sine, cosine, condition1, condition2;
- __m128i q, r, ones, twos, fours;
-
- m4pi = _mm_set1_ps(1.273239545);
- pio4A = _mm_set1_ps(0.78515625);
- pio4B = _mm_set1_ps(0.241876e-3);
- ffours = _mm_set1_ps(4.0);
- ftwos = _mm_set1_ps(2.0);
- fones = _mm_set1_ps(1.0);
- fzeroes = _mm_setzero_ps();
- ones = _mm_set1_epi32(1);
- twos = _mm_set1_epi32(2);
- fours = _mm_set1_epi32(4);
-
- cp1 = _mm_set1_ps(1.0);
- cp2 = _mm_set1_ps(0.83333333e-1);
- cp3 = _mm_set1_ps(0.2777778e-2);
- cp4 = _mm_set1_ps(0.49603e-4);
- cp5 = _mm_set1_ps(0.551e-6);
-
- for(;number < quarterPoints; number++) {
- aVal = _mm_load_ps(aPtr);
- s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
- q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
- r = _mm_add_epi32(q, _mm_and_si128(q, ones));
-
- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
-
- s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
- s = _mm_mul_ps(s, s);
- // Evaluate Taylor series
- s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-
- for(i = 0; i < 3; i++) {
- s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ unsigned int i = 0;
+
+ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+ fzeroes;
+ __m128 sine, cosine, condition1, condition2;
+ __m128i q, r, ones, twos, fours;
+
+ m4pi = _mm_set1_ps(1.273239545);
+ pio4A = _mm_set1_ps(0.78515625);
+ pio4B = _mm_set1_ps(0.241876e-3);
+ ffours = _mm_set1_ps(4.0);
+ ftwos = _mm_set1_ps(2.0);
+ fones = _mm_set1_ps(1.0);
+ fzeroes = _mm_setzero_ps();
+ ones = _mm_set1_epi32(1);
+ twos = _mm_set1_epi32(2);
+ fours = _mm_set1_epi32(4);
+
+ cp1 = _mm_set1_ps(1.0);
+ cp2 = _mm_set1_ps(0.83333333e-1);
+ cp3 = _mm_set1_ps(0.2777778e-2);
+ cp4 = _mm_set1_ps(0.49603e-4);
+ cp5 = _mm_set1_ps(0.551e-6);
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
+ s = _mm_sub_ps(aVal,
+ _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
+ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
+ r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+ s = _mm_div_ps(
+ s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm_mul_ps(
+ _mm_add_ps(
+ _mm_mul_ps(
+ _mm_sub_ps(
+ _mm_mul_ps(
+ _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
+ cp3),
+ s),
+ cp2),
+ s),
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++) {
+ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+ }
+ s = _mm_div_ps(s, ftwos);
+
+ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+ cosine = _mm_sub_ps(fones, s);
+
+ condition1 = _mm_cmpneq_ps(
+ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
+ condition2 = _mm_cmpneq_ps(
+ _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
+ _mm_cmplt_ps(aVal, fzeroes));
+ // Need this condition only for cos
+ // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
+ // twos), fours)), fzeroes);
+
+ sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
+ sine =
+ _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
+ _mm_store_ps(bPtr, sine);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *bPtr++ = sinf(*aPtr++);
}
- s = _mm_div_ps(s, ftwos);
-
- sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
- cosine = _mm_sub_ps(fones, s);
-
- condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
- condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
- // Need this condition only for cos
- //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
-
- sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
- sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
- _mm_store_ps(bPtr, sine);
- aPtr += 4;
- bPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++) {
- *bPtr++ = sinf(*aPtr++);
- }
}
#endif /* LV_HAVE_SSE4_1 for aligned */
static inline void
volk_32f_sin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int eighthPoints = num_points / 8;
- unsigned int i = 0;
-
- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
- __m256 sine, cosine, condition1, condition2;
- __m256i q, r, ones, twos, fours;
-
- m4pi = _mm256_set1_ps(1.273239545);
- pio4A = _mm256_set1_ps(0.78515625);
- pio4B = _mm256_set1_ps(0.241876e-3);
- ffours = _mm256_set1_ps(4.0);
- ftwos = _mm256_set1_ps(2.0);
- fones = _mm256_set1_ps(1.0);
- fzeroes = _mm256_setzero_ps();
- ones = _mm256_set1_epi32(1);
- twos = _mm256_set1_epi32(2);
- fours = _mm256_set1_epi32(4);
-
- cp1 = _mm256_set1_ps(1.0);
- cp2 = _mm256_set1_ps(0.83333333e-1);
- cp3 = _mm256_set1_ps(0.2777778e-2);
- cp4 = _mm256_set1_ps(0.49603e-4);
- cp5 = _mm256_set1_ps(0.551e-6);
-
- for(;number < eighthPoints; number++) {
- aVal = _mm256_loadu_ps(aPtr);
- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
-
- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
-
- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
- s = _mm256_mul_ps(s, s);
- // Evaluate Taylor series
- s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s);
-
- for(i = 0; i < 3; i++) {
- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ unsigned int i = 0;
+
+ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+ fzeroes;
+ __m256 sine, cosine, condition1, condition2;
+ __m256i q, r, ones, twos, fours;
+
+ m4pi = _mm256_set1_ps(1.273239545);
+ pio4A = _mm256_set1_ps(0.78515625);
+ pio4B = _mm256_set1_ps(0.241876e-3);
+ ffours = _mm256_set1_ps(4.0);
+ ftwos = _mm256_set1_ps(2.0);
+ fones = _mm256_set1_ps(1.0);
+ fzeroes = _mm256_setzero_ps();
+ ones = _mm256_set1_epi32(1);
+ twos = _mm256_set1_epi32(2);
+ fours = _mm256_set1_epi32(4);
+
+ cp1 = _mm256_set1_ps(1.0);
+ cp2 = _mm256_set1_ps(0.83333333e-1);
+ cp3 = _mm256_set1_ps(0.2777778e-2);
+ cp4 = _mm256_set1_ps(0.49603e-4);
+ cp5 = _mm256_set1_ps(0.551e-6);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ s = _mm256_sub_ps(aVal,
+ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+
+ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
+ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
+
+ s = _mm256_div_ps(
+ s,
+ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm256_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm256_mul_ps(
+ _mm256_fmadd_ps(
+ _mm256_fmsub_ps(
+ _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
+ s,
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++) {
+ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ }
+ s = _mm256_div_ps(s, ftwos);
+
+ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+ cosine = _mm256_sub_ps(fones, s);
+
+ condition1 = _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
+ fzeroes,
+ _CMP_NEQ_UQ);
+ condition2 = _mm256_cmp_ps(
+ _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
+ _CMP_NEQ_UQ);
+ // Need this condition only for cos
+ // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
+ // twos), fours)), fzeroes);
+
+ sine =
+ _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
+ sine = _mm256_sub_ps(
+ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+ _mm256_storeu_ps(bPtr, sine);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = sin(*aPtr++);
}
- s = _mm256_div_ps(s, ftwos);
-
- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
- cosine = _mm256_sub_ps(fones, s);
-
- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ);
- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ);
- // Need this condition only for cos
- //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
-
- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
- _mm256_storeu_ps(bPtr, sine);
- aPtr += 8;
- bPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++) {
- *bPtr++ = sin(*aPtr++);
- }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
static inline void
volk_32f_sin_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int eighthPoints = num_points / 8;
- unsigned int i = 0;
-
- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
- __m256 sine, cosine, condition1, condition2;
- __m256i q, r, ones, twos, fours;
-
- m4pi = _mm256_set1_ps(1.273239545);
- pio4A = _mm256_set1_ps(0.78515625);
- pio4B = _mm256_set1_ps(0.241876e-3);
- ffours = _mm256_set1_ps(4.0);
- ftwos = _mm256_set1_ps(2.0);
- fones = _mm256_set1_ps(1.0);
- fzeroes = _mm256_setzero_ps();
- ones = _mm256_set1_epi32(1);
- twos = _mm256_set1_epi32(2);
- fours = _mm256_set1_epi32(4);
-
- cp1 = _mm256_set1_ps(1.0);
- cp2 = _mm256_set1_ps(0.83333333e-1);
- cp3 = _mm256_set1_ps(0.2777778e-2);
- cp4 = _mm256_set1_ps(0.49603e-4);
- cp5 = _mm256_set1_ps(0.551e-6);
-
- for(;number < eighthPoints; number++) {
- aVal = _mm256_loadu_ps(aPtr);
- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
-
- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
-
- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
- s = _mm256_mul_ps(s, s);
- // Evaluate Taylor series
- s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-
- for(i = 0; i < 3; i++) {
- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ unsigned int i = 0;
+
+ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+ fzeroes;
+ __m256 sine, cosine, condition1, condition2;
+ __m256i q, r, ones, twos, fours;
+
+ m4pi = _mm256_set1_ps(1.273239545);
+ pio4A = _mm256_set1_ps(0.78515625);
+ pio4B = _mm256_set1_ps(0.241876e-3);
+ ffours = _mm256_set1_ps(4.0);
+ ftwos = _mm256_set1_ps(2.0);
+ fones = _mm256_set1_ps(1.0);
+ fzeroes = _mm256_setzero_ps();
+ ones = _mm256_set1_epi32(1);
+ twos = _mm256_set1_epi32(2);
+ fours = _mm256_set1_epi32(4);
+
+ cp1 = _mm256_set1_ps(1.0);
+ cp2 = _mm256_set1_ps(0.83333333e-1);
+ cp3 = _mm256_set1_ps(0.2777778e-2);
+ cp4 = _mm256_set1_ps(0.49603e-4);
+ cp5 = _mm256_set1_ps(0.551e-6);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ s = _mm256_sub_ps(aVal,
+ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+
+ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
+ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
+
+ s = _mm256_div_ps(
+ s,
+ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm256_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm256_mul_ps(
+ _mm256_add_ps(
+ _mm256_mul_ps(
+ _mm256_sub_ps(
+ _mm256_mul_ps(
+ _mm256_add_ps(
+ _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
+ s),
+ cp3),
+ s),
+ cp2),
+ s),
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++) {
+ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ }
+ s = _mm256_div_ps(s, ftwos);
+
+ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+ cosine = _mm256_sub_ps(fones, s);
+
+ condition1 = _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
+ fzeroes,
+ _CMP_NEQ_UQ);
+ condition2 = _mm256_cmp_ps(
+ _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
+ _CMP_NEQ_UQ);
+ // Need this condition only for cos
+ // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
+ // twos), fours)), fzeroes);
+
+ sine =
+ _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
+ sine = _mm256_sub_ps(
+ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+ _mm256_storeu_ps(bPtr, sine);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = sin(*aPtr++);
}
- s = _mm256_div_ps(s, ftwos);
-
- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
- cosine = _mm256_sub_ps(fones, s);
-
- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ);
- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ);
- // Need this condition only for cos
- //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
-
- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
- _mm256_storeu_ps(bPtr, sine);
- aPtr += 8;
- bPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++) {
- *bPtr++ = sin(*aPtr++);
- }
}
#endif /* LV_HAVE_AVX2 for unaligned */
static inline void
volk_32f_sin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int quarterPoints = num_points / 4;
- unsigned int i = 0;
-
- __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
- __m128 sine, cosine, condition1, condition2;
- __m128i q, r, ones, twos, fours;
-
- m4pi = _mm_set1_ps(1.273239545);
- pio4A = _mm_set1_ps(0.78515625);
- pio4B = _mm_set1_ps(0.241876e-3);
- ffours = _mm_set1_ps(4.0);
- ftwos = _mm_set1_ps(2.0);
- fones = _mm_set1_ps(1.0);
- fzeroes = _mm_setzero_ps();
- ones = _mm_set1_epi32(1);
- twos = _mm_set1_epi32(2);
- fours = _mm_set1_epi32(4);
-
- cp1 = _mm_set1_ps(1.0);
- cp2 = _mm_set1_ps(0.83333333e-1);
- cp3 = _mm_set1_ps(0.2777778e-2);
- cp4 = _mm_set1_ps(0.49603e-4);
- cp5 = _mm_set1_ps(0.551e-6);
-
- for(;number < quarterPoints; number++) {
- aVal = _mm_loadu_ps(aPtr);
- s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
- q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
- r = _mm_add_epi32(q, _mm_and_si128(q, ones));
-
- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
-
- s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
- s = _mm_mul_ps(s, s);
- // Evaluate Taylor series
- s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-
- for(i = 0; i < 3; i++) {
- s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
- }
- s = _mm_div_ps(s, ftwos);
-
- sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
- cosine = _mm_sub_ps(fones, s);
-
- condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
- condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
- sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
- sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
- _mm_storeu_ps(bPtr, sine);
- aPtr += 4;
- bPtr += 4;
- }
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ unsigned int i = 0;
+
+ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+ fzeroes;
+ __m128 sine, cosine, condition1, condition2;
+ __m128i q, r, ones, twos, fours;
+
+ m4pi = _mm_set1_ps(1.273239545);
+ pio4A = _mm_set1_ps(0.78515625);
+ pio4B = _mm_set1_ps(0.241876e-3);
+ ffours = _mm_set1_ps(4.0);
+ ftwos = _mm_set1_ps(2.0);
+ fones = _mm_set1_ps(1.0);
+ fzeroes = _mm_setzero_ps();
+ ones = _mm_set1_epi32(1);
+ twos = _mm_set1_epi32(2);
+ fours = _mm_set1_epi32(4);
+
+ cp1 = _mm_set1_ps(1.0);
+ cp2 = _mm_set1_ps(0.83333333e-1);
+ cp3 = _mm_set1_ps(0.2777778e-2);
+ cp4 = _mm_set1_ps(0.49603e-4);
+ cp5 = _mm_set1_ps(0.551e-6);
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_loadu_ps(aPtr);
+ s = _mm_sub_ps(aVal,
+ _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
+ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
+ r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+ s = _mm_div_ps(
+ s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm_mul_ps(
+ _mm_add_ps(
+ _mm_mul_ps(
+ _mm_sub_ps(
+ _mm_mul_ps(
+ _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
+ cp3),
+ s),
+ cp2),
+ s),
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++) {
+ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+ }
+ s = _mm_div_ps(s, ftwos);
+
+ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+ cosine = _mm_sub_ps(fones, s);
+
+ condition1 = _mm_cmpneq_ps(
+ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
+ condition2 = _mm_cmpneq_ps(
+ _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
+ _mm_cmplt_ps(aVal, fzeroes));
+
+ sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
+ sine =
+ _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
+ _mm_storeu_ps(bPtr, sine);
+ aPtr += 4;
+ bPtr += 4;
+ }
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *bPtr++ = sinf(*aPtr++);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *bPtr++ = sinf(*aPtr++);
+ }
}
#endif /* LV_HAVE_SSE4_1 for unaligned */
static inline void
volk_32f_sin_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++) {
- *bPtr++ = sinf(*aPtr++);
- }
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+ for (number = 0; number < num_points; number++) {
+ *bPtr++ = sinf(*aPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
#include <volk/volk_neon_intrinsics.h>
static inline void
-volk_32f_sin_32f_neon(float* bVector, const float* aVector,
- unsigned int num_points)
+volk_32f_sin_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
{
unsigned int number = 0;
unsigned int quarter_points = num_points / 4;
float* bVectorPtr = bVector;
const float* aVectorPtr = aVector;
-
+
float32x4_t b_vec;
float32x4_t a_vec;
-
- for(number = 0; number < quarter_points; number++) {
+
+ for (number = 0; number < quarter_points; number++) {
a_vec = vld1q_f32(aVectorPtr);
// Prefetch next one, speeds things up
- __VOLK_PREFETCH(aVectorPtr+4);
+ __VOLK_PREFETCH(aVectorPtr + 4);
b_vec = _vsinq_f32(a_vec);
vst1q_f32(bVectorPtr, b_vec);
// move pointers ahead
- bVectorPtr+=4;
- aVectorPtr+=4;
+ bVectorPtr += 4;
+ aVectorPtr += 4;
}
-
+
// Deal with the rest
- for(number = quarter_points * 4; number < num_points; number++) {
+ for (number = quarter_points * 4; number < num_points; number++) {
*bVectorPtr++ = sinf(*aVectorPtr++);
}
}
#define INCLUDED_volk_32f_sqrt_32f_a_H
#include <inttypes.h>
-#include <stdio.h>
#include <math.h>
+#include <stdio.h>
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
static inline void
volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- float* cPtr = cVector;
- const float* aPtr = aVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
- __m128 aVal, cVal;
- for(;number < quarterPoints; number++) {
- aVal = _mm_load_ps(aPtr);
+ __m128 aVal, cVal;
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
- cVal = _mm_sqrt_ps(aVal);
+ cVal = _mm_sqrt_ps(aVal);
- _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 4;
- cPtr += 4;
- }
+ aPtr += 4;
+ cPtr += 4;
+ }
- number = quarterPoints * 4;
- for(;number < num_points; number++) {
- *cPtr++ = sqrtf(*aPtr++);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = sqrtf(*aPtr++);
+ }
}
#endif /* LV_HAVE_SSE */
static inline void
volk_32f_sqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- float* cPtr = cVector;
- const float* aPtr = aVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
- __m256 aVal, cVal;
- for(;number < eighthPoints; number++) {
- aVal = _mm256_load_ps(aPtr);
+ __m256 aVal, cVal;
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
- cVal = _mm256_sqrt_ps(aVal);
+ cVal = _mm256_sqrt_ps(aVal);
- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container
+ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 8;
- cPtr += 8;
- }
+ aPtr += 8;
+ cPtr += 8;
+ }
- number = eighthPoints * 8;
- for(;number < num_points; number++) {
- *cPtr++ = sqrtf(*aPtr++);
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = sqrtf(*aPtr++);
+ }
}
#endif /* LV_HAVE_AVX */
static inline void
volk_32f_sqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points)
{
- float* cPtr = cVector;
- const float* aPtr = aVector;
- unsigned int number = 0;
- unsigned int quarter_points = num_points / 4;
- float32x4_t in_vec, out_vec;
-
- for(number = 0; number < quarter_points; number++) {
- in_vec = vld1q_f32(aPtr);
- // note that armv8 has vsqrt_f32 which will be much better
- out_vec = vrecpeq_f32(vrsqrteq_f32(in_vec) );
- vst1q_f32(cPtr, out_vec);
- aPtr += 4;
- cPtr += 4;
- }
-
- for(number = quarter_points * 4; number < num_points; number++) {
- *cPtr++ = sqrtf(*aPtr++);
- }
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+ float32x4_t in_vec, out_vec;
+
+ for (number = 0; number < quarter_points; number++) {
+ in_vec = vld1q_f32(aPtr);
+ // note that armv8 has vsqrt_f32 which will be much better
+ out_vec = vrecpeq_f32(vrsqrteq_f32(in_vec));
+ vst1q_f32(cPtr, out_vec);
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ *cPtr++ = sqrtf(*aPtr++);
+ }
}
#endif /* LV_HAVE_NEON */
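The NEON path above approximates sqrt(x) as vrecpeq_f32(vrsqrteq_f32(x)): the reciprocal estimate of the reciprocal-square-root estimate. Both intrinsics are low-precision table lookups, so the result is only approximate, which is what the in-code comment about ARMv8 (where the vsqrtq_f32 intrinsic gives an exact result) is getting at. For reference, a sketch of the usual way to tighten the estimate with one Newton-Raphson step; this is not what the kernel does, just an illustration:

#include <arm_neon.h>

/* Sketch only: refine the 1/sqrt estimate once, then multiply back by x
 * to obtain sqrt(x). x == 0 needs special-casing (the estimate becomes
 * infinite and 0 * inf is NaN), which is omitted here. */
static inline float32x4_t sqrt_refined_sketch(float32x4_t x)
{
    float32x4_t e = vrsqrteq_f32(x);                    /* rough 1/sqrt(x)       */
    e = vmulq_f32(e, vrsqrtsq_f32(vmulq_f32(x, e), e)); /* one Newton step       */
    return vmulq_f32(x, e);                             /* x / sqrt(x) = sqrt(x) */
}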
static inline void
volk_32f_sqrt_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
{
- float* cPtr = cVector;
- const float* aPtr = aVector;
- unsigned int number = 0;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
- for(number = 0; number < num_points; number++) {
- *cPtr++ = sqrtf(*aPtr++);
- }
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = sqrtf(*aPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_ORC
-extern void
-volk_32f_sqrt_32f_a_orc_impl(float *, const float*, unsigned int);
+extern void volk_32f_sqrt_32f_a_orc_impl(float*, const float*, unsigned int);
static inline void
volk_32f_sqrt_32f_u_orc(float* cVector, const float* aVector, unsigned int num_points)
{
- volk_32f_sqrt_32f_a_orc_impl(cVector, aVector, num_points);
+ volk_32f_sqrt_32f_a_orc_impl(cVector, aVector, num_points);
}
#endif /* LV_HAVE_ORC */
#define INCLUDED_volk_32f_sqrt_32f_u_H
#include <inttypes.h>
-#include <stdio.h>
#include <math.h>
+#include <stdio.h>
#ifdef LV_HAVE_AVX
#include <immintrin.h>
static inline void
volk_32f_sqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- float* cPtr = cVector;
- const float* aPtr = aVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
- __m256 aVal, cVal;
- for(;number < eighthPoints; number++) {
- aVal = _mm256_loadu_ps(aPtr);
+ __m256 aVal, cVal;
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
- cVal = _mm256_sqrt_ps(aVal);
+ cVal = _mm256_sqrt_ps(aVal);
- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
+ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 8;
- cPtr += 8;
- }
+ aPtr += 8;
+ cPtr += 8;
+ }
- number = eighthPoints * 8;
- for(;number < num_points; number++) {
- *cPtr++ = sqrtf(*aPtr++);
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = sqrtf(*aPtr++);
+ }
}
#endif /* LV_HAVE_AVX */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_stddev_and_mean_32f_x2(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points)
- * \endcode
+ * void volk_32f_stddev_and_mean_32f_x2(float* stddev, float* mean, const float*
+ * inputBuffer, unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li inputBuffer: The buffer of points.
* \li mean: The mean of the input buffer.
*
* \b Example
- * Generate random numbers with c++11's normal distribution and estimate the mean and standard deviation
- * \code
- * int N = 1000;
- * unsigned int alignment = volk_get_alignment();
+ * Generate random numbers with C++11's normal distribution and estimate the mean and
+ * standard deviation
+ * \code
+ * int N = 1000;
+ * unsigned int alignment = volk_get_alignment();
* float* rand_numbers = (float*)volk_malloc(sizeof(float)*N, alignment);
* float* mean = (float*)volk_malloc(sizeof(float), alignment);
* float* stddev = (float*)volk_malloc(sizeof(float), alignment);
#ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
#define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
-#include <volk/volk_common.h>
#include <inttypes.h>
-#include <stdio.h>
#include <math.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev, float* mean,
- const float* inputBuffer,
- unsigned int num_points)
+static inline void volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev,
+ float* mean,
+ const float* inputBuffer,
+ unsigned int num_points)
{
- float stdDev = 0;
- float newMean = 0;
- if(num_points > 0){
- unsigned int number = 0;
- const unsigned int thirtySecondthPoints = num_points / 32;
-
- const float* aPtr = inputBuffer;
- __VOLK_ATTR_ALIGNED(32) float meanBuffer[8];
- __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
-
- __m256 accumulator = _mm256_setzero_ps();
- __m256 squareAccumulator = _mm256_setzero_ps();
- __m256 aVal1, aVal2, aVal3, aVal4;
- __m256 cVal1, cVal2, cVal3, cVal4;
- for(;number < thirtySecondthPoints; number++) {
- aVal1 = _mm256_load_ps(aPtr); aPtr += 8;
- cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
- accumulator = _mm256_add_ps(accumulator, aVal1); // accumulator += x
-
- aVal2 = _mm256_load_ps(aPtr); aPtr += 8;
- cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
- accumulator = _mm256_add_ps(accumulator, aVal2); // accumulator += x
-
- aVal3 = _mm256_load_ps(aPtr); aPtr += 8;
- cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
- accumulator = _mm256_add_ps(accumulator, aVal3); // accumulator += x
-
- aVal4 = _mm256_load_ps(aPtr); aPtr += 8;
- cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
- accumulator = _mm256_add_ps(accumulator, aVal4); // accumulator += x
-
- cVal1 = _mm256_or_ps(cVal1, cVal2);
- cVal3 = _mm256_or_ps(cVal3, cVal4);
- cVal1 = _mm256_or_ps(cVal1, cVal3);
-
- squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
- }
- _mm256_store_ps(meanBuffer,accumulator); // Store the results back into the C container
- _mm256_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
- newMean = meanBuffer[0];
- newMean += meanBuffer[1];
- newMean += meanBuffer[2];
- newMean += meanBuffer[3];
- newMean += meanBuffer[4];
- newMean += meanBuffer[5];
- newMean += meanBuffer[6];
- newMean += meanBuffer[7];
- stdDev = squareBuffer[0];
- stdDev += squareBuffer[1];
- stdDev += squareBuffer[2];
- stdDev += squareBuffer[3];
- stdDev += squareBuffer[4];
- stdDev += squareBuffer[5];
- stdDev += squareBuffer[6];
- stdDev += squareBuffer[7];
-
- number = thirtySecondthPoints * 32;
- for(;number < num_points; number++){
- stdDev += (*aPtr) * (*aPtr);
- newMean += *aPtr++;
+ float stdDev = 0;
+ float newMean = 0;
+ if (num_points > 0) {
+ unsigned int number = 0;
+ const unsigned int thirtySecondthPoints = num_points / 32;
+
+ const float* aPtr = inputBuffer;
+ __VOLK_ATTR_ALIGNED(32) float meanBuffer[8];
+ __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
+
+ __m256 accumulator = _mm256_setzero_ps();
+ __m256 squareAccumulator = _mm256_setzero_ps();
+ __m256 aVal1, aVal2, aVal3, aVal4;
+ __m256 cVal1, cVal2, cVal3, cVal4;
+ for (; number < thirtySecondthPoints; number++) {
+ aVal1 = _mm256_load_ps(aPtr);
+ aPtr += 8;
+ cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
+ accumulator = _mm256_add_ps(accumulator, aVal1); // accumulator += x
+
+ aVal2 = _mm256_load_ps(aPtr);
+ aPtr += 8;
+ cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
+ accumulator = _mm256_add_ps(accumulator, aVal2); // accumulator += x
+
+ aVal3 = _mm256_load_ps(aPtr);
+ aPtr += 8;
+ cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
+ accumulator = _mm256_add_ps(accumulator, aVal3); // accumulator += x
+
+ aVal4 = _mm256_load_ps(aPtr);
+ aPtr += 8;
+ cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
+ accumulator = _mm256_add_ps(accumulator, aVal4); // accumulator += x
+
+ cVal1 = _mm256_or_ps(cVal1, cVal2);
+ cVal3 = _mm256_or_ps(cVal3, cVal4);
+ cVal1 = _mm256_or_ps(cVal1, cVal3);
+
+ squareAccumulator =
+ _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+ }
+ _mm256_store_ps(meanBuffer,
+ accumulator); // Store the results back into the C container
+ _mm256_store_ps(squareBuffer,
+ squareAccumulator); // Store the results back into the C container
+ newMean = meanBuffer[0];
+ newMean += meanBuffer[1];
+ newMean += meanBuffer[2];
+ newMean += meanBuffer[3];
+ newMean += meanBuffer[4];
+ newMean += meanBuffer[5];
+ newMean += meanBuffer[6];
+ newMean += meanBuffer[7];
+ stdDev = squareBuffer[0];
+ stdDev += squareBuffer[1];
+ stdDev += squareBuffer[2];
+ stdDev += squareBuffer[3];
+ stdDev += squareBuffer[4];
+ stdDev += squareBuffer[5];
+ stdDev += squareBuffer[6];
+ stdDev += squareBuffer[7];
+
+ number = thirtySecondthPoints * 32;
+ for (; number < num_points; number++) {
+ stdDev += (*aPtr) * (*aPtr);
+ newMean += *aPtr++;
+ }
+ newMean /= num_points;
+ stdDev /= num_points;
+ stdDev -= (newMean * newMean);
+ stdDev = sqrtf(stdDev);
}
- newMean /= num_points;
- stdDev /= num_points;
- stdDev -= (newMean * newMean);
- stdDev = sqrtf(stdDev);
- }
- *stddev = stdDev;
- *mean = newMean;
-
+ *stddev = stdDev;
+ *mean = newMean;
}
#endif /* LV_HAVE_AVX */
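The stddev_and_mean kernels reduce to a single pass that keeps a running sum and a running sum of squares, then forms mean = sum/N and stddev = sqrt(sum_sq/N - mean^2); the SIMD code above merely does that accumulation 32 floats at a time. A scalar sketch under the same convention (outputs are 0 when num_points is 0; the helper name is invented):

#include <math.h>

/* Scalar sketch of the one-pass accumulation used by the SIMD kernels. */
static void stddev_and_mean_sketch(float* stddev,
                                   float* mean,
                                   const float* in,
                                   unsigned int num_points)
{
    float sum = 0.0f;
    float sum_sq = 0.0f;
    unsigned int i;
    for (i = 0; i < num_points; i++) {
        sum += in[i];
        sum_sq += in[i] * in[i];
    }
    float m = 0.0f;
    float sd = 0.0f;
    if (num_points > 0) {
        m = sum / num_points;
        sd = sqrtf(sum_sq / num_points - m * m);
    }
    *mean = m;
    *stddev = sd;
}

The E[x^2] - mean^2 form can lose precision when the mean is large compared with the spread of the data; the kernels accept that trade-off in exchange for a single streaming pass.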
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32f_stddev_and_mean_32f_x2_u_avx(float* stddev, float* mean,
- const float* inputBuffer,
- unsigned int num_points)
+static inline void volk_32f_stddev_and_mean_32f_x2_u_avx(float* stddev,
+ float* mean,
+ const float* inputBuffer,
+ unsigned int num_points)
{
- float stdDev = 0;
- float newMean = 0;
- if(num_points > 0){
- unsigned int number = 0;
- const unsigned int thirtySecondthPoints = num_points / 32;
-
- const float* aPtr = inputBuffer;
- __VOLK_ATTR_ALIGNED(32) float meanBuffer[8];
- __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
-
- __m256 accumulator = _mm256_setzero_ps();
- __m256 squareAccumulator = _mm256_setzero_ps();
- __m256 aVal1, aVal2, aVal3, aVal4;
- __m256 cVal1, cVal2, cVal3, cVal4;
- for(;number < thirtySecondthPoints; number++) {
- aVal1 = _mm256_loadu_ps(aPtr); aPtr += 8;
- cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
- accumulator = _mm256_add_ps(accumulator, aVal1); // accumulator += x
-
- aVal2 = _mm256_loadu_ps(aPtr); aPtr += 8;
- cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
- accumulator = _mm256_add_ps(accumulator, aVal2); // accumulator += x
-
- aVal3 = _mm256_loadu_ps(aPtr); aPtr += 8;
- cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
- accumulator = _mm256_add_ps(accumulator, aVal3); // accumulator += x
-
- aVal4 = _mm256_loadu_ps(aPtr); aPtr += 8;
- cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
- accumulator = _mm256_add_ps(accumulator, aVal4); // accumulator += x
-
- cVal1 = _mm256_or_ps(cVal1, cVal2);
- cVal3 = _mm256_or_ps(cVal3, cVal4);
- cVal1 = _mm256_or_ps(cVal1, cVal3);
-
- squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
- }
- _mm256_store_ps(meanBuffer,accumulator); // Store the results back into the C container
- _mm256_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
- newMean = meanBuffer[0];
- newMean += meanBuffer[1];
- newMean += meanBuffer[2];
- newMean += meanBuffer[3];
- newMean += meanBuffer[4];
- newMean += meanBuffer[5];
- newMean += meanBuffer[6];
- newMean += meanBuffer[7];
- stdDev = squareBuffer[0];
- stdDev += squareBuffer[1];
- stdDev += squareBuffer[2];
- stdDev += squareBuffer[3];
- stdDev += squareBuffer[4];
- stdDev += squareBuffer[5];
- stdDev += squareBuffer[6];
- stdDev += squareBuffer[7];
-
- number = thirtySecondthPoints * 32;
- for(;number < num_points; number++){
- stdDev += (*aPtr) * (*aPtr);
- newMean += *aPtr++;
+ float stdDev = 0;
+ float newMean = 0;
+ if (num_points > 0) {
+ unsigned int number = 0;
+ const unsigned int thirtySecondthPoints = num_points / 32;
+
+ const float* aPtr = inputBuffer;
+ __VOLK_ATTR_ALIGNED(32) float meanBuffer[8];
+ __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
+
+ __m256 accumulator = _mm256_setzero_ps();
+ __m256 squareAccumulator = _mm256_setzero_ps();
+ __m256 aVal1, aVal2, aVal3, aVal4;
+ __m256 cVal1, cVal2, cVal3, cVal4;
+ for (; number < thirtySecondthPoints; number++) {
+ aVal1 = _mm256_loadu_ps(aPtr);
+ aPtr += 8;
+ cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
+ accumulator = _mm256_add_ps(accumulator, aVal1); // accumulator += x
+
+ aVal2 = _mm256_loadu_ps(aPtr);
+ aPtr += 8;
+ cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
+ accumulator = _mm256_add_ps(accumulator, aVal2); // accumulator += x
+
+ aVal3 = _mm256_loadu_ps(aPtr);
+ aPtr += 8;
+ cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
+ accumulator = _mm256_add_ps(accumulator, aVal3); // accumulator += x
+
+ aVal4 = _mm256_loadu_ps(aPtr);
+ aPtr += 8;
+ cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
+ accumulator = _mm256_add_ps(accumulator, aVal4); // accumulator += x
+
+ cVal1 = _mm256_or_ps(cVal1, cVal2);
+ cVal3 = _mm256_or_ps(cVal3, cVal4);
+ cVal1 = _mm256_or_ps(cVal1, cVal3);
+
+ squareAccumulator =
+ _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+ }
+ _mm256_store_ps(meanBuffer,
+ accumulator); // Store the results back into the C container
+ _mm256_store_ps(squareBuffer,
+ squareAccumulator); // Store the results back into the C container
+ newMean = meanBuffer[0];
+ newMean += meanBuffer[1];
+ newMean += meanBuffer[2];
+ newMean += meanBuffer[3];
+ newMean += meanBuffer[4];
+ newMean += meanBuffer[5];
+ newMean += meanBuffer[6];
+ newMean += meanBuffer[7];
+ stdDev = squareBuffer[0];
+ stdDev += squareBuffer[1];
+ stdDev += squareBuffer[2];
+ stdDev += squareBuffer[3];
+ stdDev += squareBuffer[4];
+ stdDev += squareBuffer[5];
+ stdDev += squareBuffer[6];
+ stdDev += squareBuffer[7];
+
+ number = thirtySecondthPoints * 32;
+ for (; number < num_points; number++) {
+ stdDev += (*aPtr) * (*aPtr);
+ newMean += *aPtr++;
+ }
+ newMean /= num_points;
+ stdDev /= num_points;
+ stdDev -= (newMean * newMean);
+ stdDev = sqrtf(stdDev);
}
- newMean /= num_points;
- stdDev /= num_points;
- stdDev -= (newMean * newMean);
- stdDev = sqrtf(stdDev);
- }
- *stddev = stdDev;
- *mean = newMean;
-
+ *stddev = stdDev;
+ *mean = newMean;
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
-static inline void
-volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev, float* mean,
- const float* inputBuffer,
- unsigned int num_points)
+static inline void volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev,
+ float* mean,
+ const float* inputBuffer,
+ unsigned int num_points)
{
- float returnValue = 0;
- float newMean = 0;
- if(num_points > 0){
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- const float* aPtr = inputBuffer;
- __VOLK_ATTR_ALIGNED(16) float meanBuffer[4];
- __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
-
- __m128 accumulator = _mm_setzero_ps();
- __m128 squareAccumulator = _mm_setzero_ps();
- __m128 aVal1, aVal2, aVal3, aVal4;
- __m128 cVal1, cVal2, cVal3, cVal4;
- for(;number < sixteenthPoints; number++) {
- aVal1 = _mm_load_ps(aPtr); aPtr += 4;
- cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
- accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x
-
- aVal2 = _mm_load_ps(aPtr); aPtr += 4;
- cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
- accumulator = _mm_add_ps(accumulator, aVal2); // accumulator += x
-
- aVal3 = _mm_load_ps(aPtr); aPtr += 4;
- cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
- accumulator = _mm_add_ps(accumulator, aVal3); // accumulator += x
-
- aVal4 = _mm_load_ps(aPtr); aPtr += 4;
- cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
- accumulator = _mm_add_ps(accumulator, aVal4); // accumulator += x
-
- cVal1 = _mm_or_ps(cVal1, cVal2);
- cVal3 = _mm_or_ps(cVal3, cVal4);
- cVal1 = _mm_or_ps(cVal1, cVal3);
-
- squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
- }
- _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
- _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
- newMean = meanBuffer[0];
- newMean += meanBuffer[1];
- newMean += meanBuffer[2];
- newMean += meanBuffer[3];
- returnValue = squareBuffer[0];
- returnValue += squareBuffer[1];
- returnValue += squareBuffer[2];
- returnValue += squareBuffer[3];
-
- number = sixteenthPoints * 16;
- for(;number < num_points; number++){
- returnValue += (*aPtr) * (*aPtr);
- newMean += *aPtr++;
+ float returnValue = 0;
+ float newMean = 0;
+ if (num_points > 0) {
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ const float* aPtr = inputBuffer;
+ __VOLK_ATTR_ALIGNED(16) float meanBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
+
+ __m128 accumulator = _mm_setzero_ps();
+ __m128 squareAccumulator = _mm_setzero_ps();
+ __m128 aVal1, aVal2, aVal3, aVal4;
+ __m128 cVal1, cVal2, cVal3, cVal4;
+ for (; number < sixteenthPoints; number++) {
+ aVal1 = _mm_load_ps(aPtr);
+ aPtr += 4;
+ cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
+ accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x
+
+ aVal2 = _mm_load_ps(aPtr);
+ aPtr += 4;
+ cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
+ accumulator = _mm_add_ps(accumulator, aVal2); // accumulator += x
+
+ aVal3 = _mm_load_ps(aPtr);
+ aPtr += 4;
+ cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
+ accumulator = _mm_add_ps(accumulator, aVal3); // accumulator += x
+
+ aVal4 = _mm_load_ps(aPtr);
+ aPtr += 4;
+ cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
+ accumulator = _mm_add_ps(accumulator, aVal4); // accumulator += x
+
+ cVal1 = _mm_or_ps(cVal1, cVal2);
+ cVal3 = _mm_or_ps(cVal3, cVal4);
+ cVal1 = _mm_or_ps(cVal1, cVal3);
+
+ squareAccumulator =
+ _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+ }
+ _mm_store_ps(meanBuffer,
+ accumulator); // Store the results back into the C container
+ _mm_store_ps(squareBuffer,
+ squareAccumulator); // Store the results back into the C container
+ newMean = meanBuffer[0];
+ newMean += meanBuffer[1];
+ newMean += meanBuffer[2];
+ newMean += meanBuffer[3];
+ returnValue = squareBuffer[0];
+ returnValue += squareBuffer[1];
+ returnValue += squareBuffer[2];
+ returnValue += squareBuffer[3];
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ returnValue += (*aPtr) * (*aPtr);
+ newMean += *aPtr++;
+ }
+ newMean /= num_points;
+ returnValue /= num_points;
+ returnValue -= (newMean * newMean);
+ returnValue = sqrtf(returnValue);
}
- newMean /= num_points;
- returnValue /= num_points;
- returnValue -= (newMean * newMean);
- returnValue = sqrtf(returnValue);
- }
- *stddev = returnValue;
- *mean = newMean;
+ *stddev = returnValue;
+ *mean = newMean;
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev, float* mean,
- const float* inputBuffer,
- unsigned int num_points)
+static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev,
+ float* mean,
+ const float* inputBuffer,
+ unsigned int num_points)
{
- float returnValue = 0;
- float newMean = 0;
- if(num_points > 0){
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- const float* aPtr = inputBuffer;
- __VOLK_ATTR_ALIGNED(16) float meanBuffer[4];
- __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
-
- __m128 accumulator = _mm_setzero_ps();
- __m128 squareAccumulator = _mm_setzero_ps();
- __m128 aVal = _mm_setzero_ps();
- for(;number < quarterPoints; number++) {
- aVal = _mm_load_ps(aPtr); // aVal = x
- accumulator = _mm_add_ps(accumulator, aVal); // accumulator += x
- aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2
- squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
- aPtr += 4;
+ float returnValue = 0;
+ float newMean = 0;
+ if (num_points > 0) {
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* aPtr = inputBuffer;
+ __VOLK_ATTR_ALIGNED(16) float meanBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
+
+ __m128 accumulator = _mm_setzero_ps();
+ __m128 squareAccumulator = _mm_setzero_ps();
+ __m128 aVal = _mm_setzero_ps();
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr); // aVal = x
+ accumulator = _mm_add_ps(accumulator, aVal); // accumulator += x
+ aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2
+ squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
+ aPtr += 4;
+ }
+ _mm_store_ps(meanBuffer,
+ accumulator); // Store the results back into the C container
+ _mm_store_ps(squareBuffer,
+ squareAccumulator); // Store the results back into the C container
+ newMean = meanBuffer[0];
+ newMean += meanBuffer[1];
+ newMean += meanBuffer[2];
+ newMean += meanBuffer[3];
+ returnValue = squareBuffer[0];
+ returnValue += squareBuffer[1];
+ returnValue += squareBuffer[2];
+ returnValue += squareBuffer[3];
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ returnValue += (*aPtr) * (*aPtr);
+ newMean += *aPtr++;
+ }
+ newMean /= num_points;
+ returnValue /= num_points;
+ returnValue -= (newMean * newMean);
+ returnValue = sqrtf(returnValue);
}
- _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
- _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
- newMean = meanBuffer[0];
- newMean += meanBuffer[1];
- newMean += meanBuffer[2];
- newMean += meanBuffer[3];
- returnValue = squareBuffer[0];
- returnValue += squareBuffer[1];
- returnValue += squareBuffer[2];
- returnValue += squareBuffer[3];
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- returnValue += (*aPtr) * (*aPtr);
- newMean += *aPtr++;
- }
- newMean /= num_points;
- returnValue /= num_points;
- returnValue -= (newMean * newMean);
- returnValue = sqrtf(returnValue);
- }
- *stddev = returnValue;
- *mean = newMean;
+ *stddev = returnValue;
+ *mean = newMean;
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_stddev_and_mean_32f_x2_generic(float* stddev, float* mean,
- const float* inputBuffer,
- unsigned int num_points)
+static inline void volk_32f_stddev_and_mean_32f_x2_generic(float* stddev,
+ float* mean,
+ const float* inputBuffer,
+ unsigned int num_points)
{
- float returnValue = 0;
- float newMean = 0;
- if(num_points > 0){
- const float* aPtr = inputBuffer;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- returnValue += (*aPtr) * (*aPtr);
- newMean += *aPtr++;
+ float returnValue = 0;
+ float newMean = 0;
+ if (num_points > 0) {
+ const float* aPtr = inputBuffer;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ returnValue += (*aPtr) * (*aPtr);
+ newMean += *aPtr++;
+ }
+ newMean /= num_points;
+ returnValue /= num_points;
+ returnValue -= (newMean * newMean);
+ returnValue = sqrtf(returnValue);
}
- newMean /= num_points;
- returnValue /= num_points;
- returnValue -= (newMean * newMean);
- returnValue = sqrtf(returnValue);
- }
- *stddev = returnValue;
- *mean = newMean;
+ *stddev = returnValue;
+ *mean = newMean;
}
#endif /* LV_HAVE_GENERIC */
-
-
#endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H */
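Editorial aside (not part of the generated patch): every `volk_32f_stddev_and_mean_32f_x2` variant above computes the population statistics in a single pass as stddev = sqrt(E[x^2] - mean^2); the SIMD branches only vectorize the two running sums. A scalar worked example of the same formula:

/* Illustrative only */
#include <math.h>
#include <stdio.h>

int main(void)
{
    const float x[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
    float sum = 0.0f, sum_sq = 0.0f;
    for (int i = 0; i < 4; i++) {
        sum += x[i];
        sum_sq += x[i] * x[i];
    }
    float mean = sum / 4.0f;                             /* 2.5 */
    float stddev = sqrtf(sum_sq / 4.0f - mean * mean);   /* sqrt(7.5 - 6.25) ~= 1.118 */
    printf("mean=%f stddev=%f\n", mean, stddev);
    return 0;
}

Note that this single-pass form can lose precision when mean^2 dwarfs the variance; that trade-off is inherited from the original kernels, not introduced by the formatting change.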
* \endcode
*/
-#include <stdio.h>
-#include <math.h>
#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
#ifndef INCLUDED_volk_32f_tan_32f_a_H
#define INCLUDED_volk_32f_tan_32f_a_H
#include <immintrin.h>
static inline void
-volk_32f_tan_32f_a_avx2_fma(float* bVector, const float* aVector,
- unsigned int num_points)
+volk_32f_tan_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int eighthPoints = num_points / 8;
- unsigned int i = 0;
-
- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
- __m256 sine, cosine, tangent, condition1, condition2, condition3;
- __m256i q, r, ones, twos, fours;
-
- m4pi = _mm256_set1_ps(1.273239545);
- pio4A = _mm256_set1_ps(0.78515625);
- pio4B = _mm256_set1_ps(0.241876e-3);
- ffours = _mm256_set1_ps(4.0);
- ftwos = _mm256_set1_ps(2.0);
- fones = _mm256_set1_ps(1.0);
- fzeroes = _mm256_setzero_ps();
- ones = _mm256_set1_epi32(1);
- twos = _mm256_set1_epi32(2);
- fours = _mm256_set1_epi32(4);
-
- cp1 = _mm256_set1_ps(1.0);
- cp2 = _mm256_set1_ps(0.83333333e-1);
- cp3 = _mm256_set1_ps(0.2777778e-2);
- cp4 = _mm256_set1_ps(0.49603e-4);
- cp5 = _mm256_set1_ps(0.551e-6);
-
- for(;number < eighthPoints; number++){
- aVal = _mm256_load_ps(aPtr);
- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
-
- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
-
- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
- s = _mm256_mul_ps(s, s);
- // Evaluate Taylor series
- s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s);
-
- for(i = 0; i < 3; i++){
- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ unsigned int i = 0;
+
+ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+ fzeroes;
+ __m256 sine, cosine, tangent, condition1, condition2, condition3;
+ __m256i q, r, ones, twos, fours;
+
+ m4pi = _mm256_set1_ps(1.273239545);
+ pio4A = _mm256_set1_ps(0.78515625);
+ pio4B = _mm256_set1_ps(0.241876e-3);
+ ffours = _mm256_set1_ps(4.0);
+ ftwos = _mm256_set1_ps(2.0);
+ fones = _mm256_set1_ps(1.0);
+ fzeroes = _mm256_setzero_ps();
+ ones = _mm256_set1_epi32(1);
+ twos = _mm256_set1_epi32(2);
+ fours = _mm256_set1_epi32(4);
+
+ cp1 = _mm256_set1_ps(1.0);
+ cp2 = _mm256_set1_ps(0.83333333e-1);
+ cp3 = _mm256_set1_ps(0.2777778e-2);
+ cp4 = _mm256_set1_ps(0.49603e-4);
+ cp5 = _mm256_set1_ps(0.551e-6);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ s = _mm256_sub_ps(aVal,
+ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+
+ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
+ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
+
+ s = _mm256_div_ps(
+ s,
+ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm256_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm256_mul_ps(
+ _mm256_fmadd_ps(
+ _mm256_fmsub_ps(
+ _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
+ s,
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++) {
+ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ }
+ s = _mm256_div_ps(s, ftwos);
+
+ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+ cosine = _mm256_sub_ps(fones, s);
+
+ condition1 = _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
+ fzeroes,
+ _CMP_NEQ_UQ);
+ condition2 = _mm256_cmp_ps(
+ _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
+ _CMP_NEQ_UQ);
+ condition3 = _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
+ fzeroes,
+ _CMP_NEQ_UQ);
+
+ __m256 temp = cosine;
+ cosine =
+ _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
+ sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
+ sine = _mm256_sub_ps(
+ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+ cosine = _mm256_sub_ps(
+ cosine,
+ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
+ tangent = _mm256_div_ps(sine, cosine);
+ _mm256_store_ps(bPtr, tangent);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = tan(*aPtr++);
}
- s = _mm256_div_ps(s, ftwos);
-
- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
- cosine = _mm256_sub_ps(fones, s);
-
- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ);
- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ);
- condition3 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), fzeroes, _CMP_NEQ_UQ);
-
- __m256 temp = cosine;
- cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
- tangent = _mm256_div_ps(sine, cosine);
- _mm256_store_ps(bPtr, tangent);
- aPtr += 8;
- bPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *bPtr++ = tan(*aPtr++);
- }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
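Editorial aside (not part of the generated patch): the division of `s` by 8, the short polynomial, and the three passes of `s = s * (4 - s)` in the tan kernels implement an angle-halving scheme. The polynomial approximates 2*(1 - cos t) for the reduced, thrice-halved angle, and each `s * (4 - s)` pass doubles the angle, since 2*(1 - cos t) maps to 2*(1 - cos 2t). A tiny scalar check of that identity (illustrative only):

#include <math.h>
#include <stdio.h>

int main(void)
{
    float t = 0.6f;                            /* some reduced angle */
    float s = 2.0f * (1.0f - cosf(t / 8.0f));  /* what the polynomial approximates */
    for (int i = 0; i < 3; i++)
        s = s * (4.0f - s);                    /* each pass doubles the angle */
    s *= 0.5f;                                 /* s = 1 - cos(t) */
    printf("cos: %f vs %f\n", 1.0f - s, cosf(t));
    printf("sin: %f vs %f\n", sqrtf((2.0f - s) * s), sinf(t));
    return 0;
}

The quadrant index `q` then decides whether sine and cosine are swapped or negated before the final `tangent = sine / cosine` division.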
#include <immintrin.h>
static inline void
-volk_32f_tan_32f_a_avx2(float* bVector, const float* aVector,
- unsigned int num_points)
+volk_32f_tan_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int eighthPoints = num_points / 8;
- unsigned int i = 0;
-
- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
- __m256 sine, cosine, tangent, condition1, condition2, condition3;
- __m256i q, r, ones, twos, fours;
-
- m4pi = _mm256_set1_ps(1.273239545);
- pio4A = _mm256_set1_ps(0.78515625);
- pio4B = _mm256_set1_ps(0.241876e-3);
- ffours = _mm256_set1_ps(4.0);
- ftwos = _mm256_set1_ps(2.0);
- fones = _mm256_set1_ps(1.0);
- fzeroes = _mm256_setzero_ps();
- ones = _mm256_set1_epi32(1);
- twos = _mm256_set1_epi32(2);
- fours = _mm256_set1_epi32(4);
-
- cp1 = _mm256_set1_ps(1.0);
- cp2 = _mm256_set1_ps(0.83333333e-1);
- cp3 = _mm256_set1_ps(0.2777778e-2);
- cp4 = _mm256_set1_ps(0.49603e-4);
- cp5 = _mm256_set1_ps(0.551e-6);
-
- for(;number < eighthPoints; number++){
- aVal = _mm256_load_ps(aPtr);
- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
-
- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
-
- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
- s = _mm256_mul_ps(s, s);
- // Evaluate Taylor series
- s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-
- for(i = 0; i < 3; i++){
- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ unsigned int i = 0;
+
+ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+ fzeroes;
+ __m256 sine, cosine, tangent, condition1, condition2, condition3;
+ __m256i q, r, ones, twos, fours;
+
+ m4pi = _mm256_set1_ps(1.273239545);
+ pio4A = _mm256_set1_ps(0.78515625);
+ pio4B = _mm256_set1_ps(0.241876e-3);
+ ffours = _mm256_set1_ps(4.0);
+ ftwos = _mm256_set1_ps(2.0);
+ fones = _mm256_set1_ps(1.0);
+ fzeroes = _mm256_setzero_ps();
+ ones = _mm256_set1_epi32(1);
+ twos = _mm256_set1_epi32(2);
+ fours = _mm256_set1_epi32(4);
+
+ cp1 = _mm256_set1_ps(1.0);
+ cp2 = _mm256_set1_ps(0.83333333e-1);
+ cp3 = _mm256_set1_ps(0.2777778e-2);
+ cp4 = _mm256_set1_ps(0.49603e-4);
+ cp5 = _mm256_set1_ps(0.551e-6);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ s = _mm256_sub_ps(aVal,
+ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+
+ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
+ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
+
+ s = _mm256_div_ps(
+ s,
+ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm256_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm256_mul_ps(
+ _mm256_add_ps(
+ _mm256_mul_ps(
+ _mm256_sub_ps(
+ _mm256_mul_ps(
+ _mm256_add_ps(
+ _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
+ s),
+ cp3),
+ s),
+ cp2),
+ s),
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++) {
+ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ }
+ s = _mm256_div_ps(s, ftwos);
+
+ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+ cosine = _mm256_sub_ps(fones, s);
+
+ condition1 = _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
+ fzeroes,
+ _CMP_NEQ_UQ);
+ condition2 = _mm256_cmp_ps(
+ _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
+ _CMP_NEQ_UQ);
+ condition3 = _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
+ fzeroes,
+ _CMP_NEQ_UQ);
+
+ __m256 temp = cosine;
+ cosine =
+ _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
+ sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
+ sine = _mm256_sub_ps(
+ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+ cosine = _mm256_sub_ps(
+ cosine,
+ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
+ tangent = _mm256_div_ps(sine, cosine);
+ _mm256_store_ps(bPtr, tangent);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = tan(*aPtr++);
}
- s = _mm256_div_ps(s, ftwos);
-
- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
- cosine = _mm256_sub_ps(fones, s);
-
- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ);
- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ);
- condition3 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), fzeroes, _CMP_NEQ_UQ);
-
- __m256 temp = cosine;
- cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
- tangent = _mm256_div_ps(sine, cosine);
- _mm256_store_ps(bPtr, tangent);
- aPtr += 8;
- bPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *bPtr++ = tan(*aPtr++);
- }
}
#endif /* LV_HAVE_AVX2 for aligned */
#include <smmintrin.h>
static inline void
-volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector,
- unsigned int num_points)
+volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int quarterPoints = num_points / 4;
- unsigned int i = 0;
-
- __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
- __m128 sine, cosine, tangent, condition1, condition2, condition3;
- __m128i q, r, ones, twos, fours;
-
- m4pi = _mm_set1_ps(1.273239545);
- pio4A = _mm_set1_ps(0.78515625);
- pio4B = _mm_set1_ps(0.241876e-3);
- ffours = _mm_set1_ps(4.0);
- ftwos = _mm_set1_ps(2.0);
- fones = _mm_set1_ps(1.0);
- fzeroes = _mm_setzero_ps();
- ones = _mm_set1_epi32(1);
- twos = _mm_set1_epi32(2);
- fours = _mm_set1_epi32(4);
-
- cp1 = _mm_set1_ps(1.0);
- cp2 = _mm_set1_ps(0.83333333e-1);
- cp3 = _mm_set1_ps(0.2777778e-2);
- cp4 = _mm_set1_ps(0.49603e-4);
- cp5 = _mm_set1_ps(0.551e-6);
-
- for(;number < quarterPoints; number++){
- aVal = _mm_load_ps(aPtr);
- s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
- q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
- r = _mm_add_epi32(q, _mm_and_si128(q, ones));
-
- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
-
- s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
- s = _mm_mul_ps(s, s);
- // Evaluate Taylor series
- s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-
- for(i = 0; i < 3; i++){
- s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ unsigned int i = 0;
+
+ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+ fzeroes;
+ __m128 sine, cosine, tangent, condition1, condition2, condition3;
+ __m128i q, r, ones, twos, fours;
+
+ m4pi = _mm_set1_ps(1.273239545);
+ pio4A = _mm_set1_ps(0.78515625);
+ pio4B = _mm_set1_ps(0.241876e-3);
+ ffours = _mm_set1_ps(4.0);
+ ftwos = _mm_set1_ps(2.0);
+ fones = _mm_set1_ps(1.0);
+ fzeroes = _mm_setzero_ps();
+ ones = _mm_set1_epi32(1);
+ twos = _mm_set1_epi32(2);
+ fours = _mm_set1_epi32(4);
+
+ cp1 = _mm_set1_ps(1.0);
+ cp2 = _mm_set1_ps(0.83333333e-1);
+ cp3 = _mm_set1_ps(0.2777778e-2);
+ cp4 = _mm_set1_ps(0.49603e-4);
+ cp5 = _mm_set1_ps(0.551e-6);
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
+ s = _mm_sub_ps(aVal,
+ _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
+ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
+ r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+ s = _mm_div_ps(
+ s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm_mul_ps(
+ _mm_add_ps(
+ _mm_mul_ps(
+ _mm_sub_ps(
+ _mm_mul_ps(
+ _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
+ cp3),
+ s),
+ cp2),
+ s),
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++) {
+ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+ }
+ s = _mm_div_ps(s, ftwos);
+
+ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+ cosine = _mm_sub_ps(fones, s);
+
+ condition1 = _mm_cmpneq_ps(
+ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
+ condition2 = _mm_cmpneq_ps(
+ _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
+ _mm_cmplt_ps(aVal, fzeroes));
+ condition3 = _mm_cmpneq_ps(
+ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
+
+ __m128 temp = cosine;
+ cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
+ sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
+ sine =
+ _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
+ cosine = _mm_sub_ps(
+ cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
+ tangent = _mm_div_ps(sine, cosine);
+ _mm_store_ps(bPtr, tangent);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *bPtr++ = tanf(*aPtr++);
}
- s = _mm_div_ps(s, ftwos);
-
- sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
- cosine = _mm_sub_ps(fones, s);
-
- condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
- condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
- condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
-
- __m128 temp = cosine;
- cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
- sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
- sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
- cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
- tangent = _mm_div_ps(sine, cosine);
- _mm_store_ps(bPtr, tangent);
- aPtr += 4;
- bPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *bPtr++ = tanf(*aPtr++);
- }
}
#endif /* LV_HAVE_SSE4_1 for aligned */
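Editorial aside (not part of the generated patch): the paired constants `pio4A = 0.78515625` and `pio4B = 0.241876e-3` used in these kernels are a two-term, Cody-Waite style split of pi/4. The first part is exactly representable in a handful of mantissa bits (201/256) and the second carries the remainder, so subtracting `r * pio4A` and then `r * pio4B` cancels less precision than subtracting `r * (pi/4)` in one step. A quick numeric check:

/* Illustrative only */
#include <stdio.h>

int main(void)
{
    const double pio4A = 0.78515625;   /* high part, exactly representable */
    const double pio4B = 0.241876e-3;  /* low-order correction */
    printf("pio4A + pio4B = %.10f\n", pio4A + pio4B);                /* 0.7853981260 */
    printf("pi/4          = %.10f\n", 3.14159265358979323846 / 4.0); /* 0.7853981634 */
    return 0;
}

The split matches pi/4 to roughly single-precision accuracy, which is the target of these float kernels.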
#include <immintrin.h>
static inline void
-volk_32f_tan_32f_u_avx2_fma(float* bVector, const float* aVector,
- unsigned int num_points)
+volk_32f_tan_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int eighthPoints = num_points / 8;
- unsigned int i = 0;
-
- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
- __m256 sine, cosine, tangent, condition1, condition2, condition3;
- __m256i q, r, ones, twos, fours;
-
- m4pi = _mm256_set1_ps(1.273239545);
- pio4A = _mm256_set1_ps(0.78515625);
- pio4B = _mm256_set1_ps(0.241876e-3);
- ffours = _mm256_set1_ps(4.0);
- ftwos = _mm256_set1_ps(2.0);
- fones = _mm256_set1_ps(1.0);
- fzeroes = _mm256_setzero_ps();
- ones = _mm256_set1_epi32(1);
- twos = _mm256_set1_epi32(2);
- fours = _mm256_set1_epi32(4);
-
- cp1 = _mm256_set1_ps(1.0);
- cp2 = _mm256_set1_ps(0.83333333e-1);
- cp3 = _mm256_set1_ps(0.2777778e-2);
- cp4 = _mm256_set1_ps(0.49603e-4);
- cp5 = _mm256_set1_ps(0.551e-6);
-
- for(;number < eighthPoints; number++){
- aVal = _mm256_loadu_ps(aPtr);
- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
-
- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
-
- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
- s = _mm256_mul_ps(s, s);
- // Evaluate Taylor series
- s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s);
-
- for(i = 0; i < 3; i++){
- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ unsigned int i = 0;
+
+ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+ fzeroes;
+ __m256 sine, cosine, tangent, condition1, condition2, condition3;
+ __m256i q, r, ones, twos, fours;
+
+ m4pi = _mm256_set1_ps(1.273239545);
+ pio4A = _mm256_set1_ps(0.78515625);
+ pio4B = _mm256_set1_ps(0.241876e-3);
+ ffours = _mm256_set1_ps(4.0);
+ ftwos = _mm256_set1_ps(2.0);
+ fones = _mm256_set1_ps(1.0);
+ fzeroes = _mm256_setzero_ps();
+ ones = _mm256_set1_epi32(1);
+ twos = _mm256_set1_epi32(2);
+ fours = _mm256_set1_epi32(4);
+
+ cp1 = _mm256_set1_ps(1.0);
+ cp2 = _mm256_set1_ps(0.83333333e-1);
+ cp3 = _mm256_set1_ps(0.2777778e-2);
+ cp4 = _mm256_set1_ps(0.49603e-4);
+ cp5 = _mm256_set1_ps(0.551e-6);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ s = _mm256_sub_ps(aVal,
+ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+
+ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
+ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
+
+ s = _mm256_div_ps(
+ s,
+ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm256_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm256_mul_ps(
+ _mm256_fmadd_ps(
+ _mm256_fmsub_ps(
+ _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
+ s,
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++) {
+ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ }
+ s = _mm256_div_ps(s, ftwos);
+
+ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+ cosine = _mm256_sub_ps(fones, s);
+
+ condition1 = _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
+ fzeroes,
+ _CMP_NEQ_UQ);
+ condition2 = _mm256_cmp_ps(
+ _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
+ _CMP_NEQ_UQ);
+ condition3 = _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
+ fzeroes,
+ _CMP_NEQ_UQ);
+
+ __m256 temp = cosine;
+ cosine =
+ _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
+ sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
+ sine = _mm256_sub_ps(
+ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+ cosine = _mm256_sub_ps(
+ cosine,
+ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
+ tangent = _mm256_div_ps(sine, cosine);
+ _mm256_storeu_ps(bPtr, tangent);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = tan(*aPtr++);
}
- s = _mm256_div_ps(s, ftwos);
-
- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
- cosine = _mm256_sub_ps(fones, s);
-
- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ);
- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ);
- condition3 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), fzeroes, _CMP_NEQ_UQ);
-
- __m256 temp = cosine;
- cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
- tangent = _mm256_div_ps(sine, cosine);
- _mm256_storeu_ps(bPtr, tangent);
- aPtr += 8;
- bPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *bPtr++ = tan(*aPtr++);
- }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
#include <immintrin.h>
static inline void
-volk_32f_tan_32f_u_avx2(float* bVector, const float* aVector,
- unsigned int num_points)
+volk_32f_tan_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int eighthPoints = num_points / 8;
- unsigned int i = 0;
-
- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
- __m256 sine, cosine, tangent, condition1, condition2, condition3;
- __m256i q, r, ones, twos, fours;
-
- m4pi = _mm256_set1_ps(1.273239545);
- pio4A = _mm256_set1_ps(0.78515625);
- pio4B = _mm256_set1_ps(0.241876e-3);
- ffours = _mm256_set1_ps(4.0);
- ftwos = _mm256_set1_ps(2.0);
- fones = _mm256_set1_ps(1.0);
- fzeroes = _mm256_setzero_ps();
- ones = _mm256_set1_epi32(1);
- twos = _mm256_set1_epi32(2);
- fours = _mm256_set1_epi32(4);
-
- cp1 = _mm256_set1_ps(1.0);
- cp2 = _mm256_set1_ps(0.83333333e-1);
- cp3 = _mm256_set1_ps(0.2777778e-2);
- cp4 = _mm256_set1_ps(0.49603e-4);
- cp5 = _mm256_set1_ps(0.551e-6);
-
- for(;number < eighthPoints; number++){
- aVal = _mm256_loadu_ps(aPtr);
- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
-
- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
-
- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
- s = _mm256_mul_ps(s, s);
- // Evaluate Taylor series
- s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-
- for(i = 0; i < 3; i++){
- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ unsigned int i = 0;
+
+ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+ fzeroes;
+ __m256 sine, cosine, tangent, condition1, condition2, condition3;
+ __m256i q, r, ones, twos, fours;
+
+ m4pi = _mm256_set1_ps(1.273239545);
+ pio4A = _mm256_set1_ps(0.78515625);
+ pio4B = _mm256_set1_ps(0.241876e-3);
+ ffours = _mm256_set1_ps(4.0);
+ ftwos = _mm256_set1_ps(2.0);
+ fones = _mm256_set1_ps(1.0);
+ fzeroes = _mm256_setzero_ps();
+ ones = _mm256_set1_epi32(1);
+ twos = _mm256_set1_epi32(2);
+ fours = _mm256_set1_epi32(4);
+
+ cp1 = _mm256_set1_ps(1.0);
+ cp2 = _mm256_set1_ps(0.83333333e-1);
+ cp3 = _mm256_set1_ps(0.2777778e-2);
+ cp4 = _mm256_set1_ps(0.49603e-4);
+ cp5 = _mm256_set1_ps(0.551e-6);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ s = _mm256_sub_ps(aVal,
+ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+
+ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
+ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
+
+ s = _mm256_div_ps(
+ s,
+ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm256_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm256_mul_ps(
+ _mm256_add_ps(
+ _mm256_mul_ps(
+ _mm256_sub_ps(
+ _mm256_mul_ps(
+ _mm256_add_ps(
+ _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
+ s),
+ cp3),
+ s),
+ cp2),
+ s),
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++) {
+ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ }
+ s = _mm256_div_ps(s, ftwos);
+
+ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+ cosine = _mm256_sub_ps(fones, s);
+
+ condition1 = _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
+ fzeroes,
+ _CMP_NEQ_UQ);
+ condition2 = _mm256_cmp_ps(
+ _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
+ _CMP_NEQ_UQ);
+ condition3 = _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
+ fzeroes,
+ _CMP_NEQ_UQ);
+
+ __m256 temp = cosine;
+ cosine =
+ _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
+ sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
+ sine = _mm256_sub_ps(
+ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+ cosine = _mm256_sub_ps(
+ cosine,
+ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
+ tangent = _mm256_div_ps(sine, cosine);
+ _mm256_storeu_ps(bPtr, tangent);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = tan(*aPtr++);
}
- s = _mm256_div_ps(s, ftwos);
-
- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
- cosine = _mm256_sub_ps(fones, s);
-
- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ);
- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ);
- condition3 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), fzeroes, _CMP_NEQ_UQ);
-
- __m256 temp = cosine;
- cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
- tangent = _mm256_div_ps(sine, cosine);
- _mm256_storeu_ps(bPtr, tangent);
- aPtr += 8;
- bPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *bPtr++ = tan(*aPtr++);
- }
}
#endif /* LV_HAVE_AVX2 for unaligned */
static inline void
volk_32f_tan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int quarterPoints = num_points / 4;
- unsigned int i = 0;
-
- __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
- __m128 sine, cosine, tangent, condition1, condition2, condition3;
- __m128i q, r, ones, twos, fours;
-
- m4pi = _mm_set1_ps(1.273239545);
- pio4A = _mm_set1_ps(0.78515625);
- pio4B = _mm_set1_ps(0.241876e-3);
- ffours = _mm_set1_ps(4.0);
- ftwos = _mm_set1_ps(2.0);
- fones = _mm_set1_ps(1.0);
- fzeroes = _mm_setzero_ps();
- ones = _mm_set1_epi32(1);
- twos = _mm_set1_epi32(2);
- fours = _mm_set1_epi32(4);
-
- cp1 = _mm_set1_ps(1.0);
- cp2 = _mm_set1_ps(0.83333333e-1);
- cp3 = _mm_set1_ps(0.2777778e-2);
- cp4 = _mm_set1_ps(0.49603e-4);
- cp5 = _mm_set1_ps(0.551e-6);
-
- for(;number < quarterPoints; number++){
- aVal = _mm_loadu_ps(aPtr);
- s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
- q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
- r = _mm_add_epi32(q, _mm_and_si128(q, ones));
-
- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
-
- s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
- s = _mm_mul_ps(s, s);
- // Evaluate Taylor series
- s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-
- for(i = 0; i < 3; i++){
- s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ unsigned int i = 0;
+
+ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+ fzeroes;
+ __m128 sine, cosine, tangent, condition1, condition2, condition3;
+ __m128i q, r, ones, twos, fours;
+
+ m4pi = _mm_set1_ps(1.273239545);
+ pio4A = _mm_set1_ps(0.78515625);
+ pio4B = _mm_set1_ps(0.241876e-3);
+ ffours = _mm_set1_ps(4.0);
+ ftwos = _mm_set1_ps(2.0);
+ fones = _mm_set1_ps(1.0);
+ fzeroes = _mm_setzero_ps();
+ ones = _mm_set1_epi32(1);
+ twos = _mm_set1_epi32(2);
+ fours = _mm_set1_epi32(4);
+
+ cp1 = _mm_set1_ps(1.0);
+ cp2 = _mm_set1_ps(0.83333333e-1);
+ cp3 = _mm_set1_ps(0.2777778e-2);
+ cp4 = _mm_set1_ps(0.49603e-4);
+ cp5 = _mm_set1_ps(0.551e-6);
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_loadu_ps(aPtr);
+ s = _mm_sub_ps(aVal,
+ _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
+ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
+ r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+ s = _mm_div_ps(
+ s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm_mul_ps(
+ _mm_add_ps(
+ _mm_mul_ps(
+ _mm_sub_ps(
+ _mm_mul_ps(
+ _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
+ cp3),
+ s),
+ cp2),
+ s),
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++) {
+ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+ }
+ s = _mm_div_ps(s, ftwos);
+
+ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+ cosine = _mm_sub_ps(fones, s);
+
+ condition1 = _mm_cmpneq_ps(
+ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
+ condition2 = _mm_cmpneq_ps(
+ _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
+ _mm_cmplt_ps(aVal, fzeroes));
+ condition3 = _mm_cmpneq_ps(
+ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
+
+ __m128 temp = cosine;
+ cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
+ sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
+ sine =
+ _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
+ cosine = _mm_sub_ps(
+ cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
+ tangent = _mm_div_ps(sine, cosine);
+ _mm_storeu_ps(bPtr, tangent);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *bPtr++ = tanf(*aPtr++);
}
- s = _mm_div_ps(s, ftwos);
-
- sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
- cosine = _mm_sub_ps(fones, s);
-
- condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
- condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
- condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
-
- __m128 temp = cosine;
- cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
- sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
- sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
- cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
- tangent = _mm_div_ps(sine, cosine);
- _mm_storeu_ps(bPtr, tangent);
- aPtr += 4;
- bPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *bPtr++ = tanf(*aPtr++);
- }
}
#endif /* LV_HAVE_SSE4_1 for unaligned */
#ifdef LV_HAVE_GENERIC
static inline void
-volk_32f_tan_32f_generic(float* bVector, const float* aVector,
- unsigned int num_points)
+volk_32f_tan_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
{
- float* bPtr = bVector;
- const float* aPtr = aVector;
- unsigned int number = 0;
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
- for(; number < num_points; number++){
- *bPtr++ = tanf(*aPtr++);
- }
+ for (; number < num_points; number++) {
+ *bPtr++ = tanf(*aPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
#include <volk/volk_neon_intrinsics.h>
static inline void
-volk_32f_tan_32f_neon(float* bVector, const float* aVector,
- unsigned int num_points)
+volk_32f_tan_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
{
unsigned int number = 0;
unsigned int quarter_points = num_points / 4;
float* bVectorPtr = bVector;
const float* aVectorPtr = aVector;
-
+
float32x4_t b_vec;
float32x4_t a_vec;
-
- for(number = 0; number < quarter_points; number++) {
+
+ for (number = 0; number < quarter_points; number++) {
a_vec = vld1q_f32(aVectorPtr);
// Prefetch next one, speeds things up
- __VOLK_PREFETCH(aVectorPtr+4);
+ __VOLK_PREFETCH(aVectorPtr + 4);
b_vec = _vtanq_f32(a_vec);
vst1q_f32(bVectorPtr, b_vec);
// move pointers ahead
- bVectorPtr+=4;
- aVectorPtr+=4;
+ bVectorPtr += 4;
+ aVectorPtr += 4;
}
-
+
// Deal with the rest
- for(number = quarter_points * 4; number < num_points; number++) {
+ for (number = quarter_points * 4; number < num_points; number++) {
*bVectorPtr++ = tanf(*aVectorPtr++);
}
}
#define INCLUDED_volk_32f_tanh_32f_a_H
#include <inttypes.h>
-#include <stdio.h>
#include <math.h>
+#include <stdio.h>
#include <string.h>
#ifdef LV_HAVE_GENERIC
static inline void
-volk_32f_tanh_32f_generic(float* cVector, const float* aVector,
- unsigned int num_points)
+volk_32f_tanh_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
{
- unsigned int number = 0;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- for(; number < num_points; number++) {
- *cPtr++ = tanhf(*aPtr++);
- }
+ unsigned int number = 0;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ for (; number < num_points; number++) {
+ *cPtr++ = tanhf(*aPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_GENERIC
static inline void
-volk_32f_tanh_32f_series(float* cVector, const float* aVector,
- unsigned int num_points)
+volk_32f_tanh_32f_series(float* cVector, const float* aVector, unsigned int num_points)
{
- unsigned int number = 0;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- for(; number < num_points; number++) {
- if(*aPtr > 4.97)
- *cPtr++ = 1;
- else if(*aPtr <= -4.97)
- *cPtr++ = -1;
- else {
- float x2 = (*aPtr) * (*aPtr);
- float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
- float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
- *cPtr++ = a / b;
- aPtr++;
+ unsigned int number = 0;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ for (; number < num_points; number++) {
+ if (*aPtr > 4.97)
+ *cPtr++ = 1;
+ else if (*aPtr <= -4.97)
+ *cPtr++ = -1;
+ else {
+ float x2 = (*aPtr) * (*aPtr);
+ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+ *cPtr++ = a / b;
+ aPtr++;
+ }
}
- }
}
#endif /* LV_HAVE_GENERIC */
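Editorial aside (not part of the generated patch): the 135135/17325/378 and 62370/3150/28 constants in `volk_32f_tanh_32f_series` and the SIMD variants form an odd-over-even rational approximation of tanh (they appear to come from a truncation of tanh's continued-fraction expansion). It is accurate for moderate arguments, which is why inputs beyond +/-4.97 are clamped to +/-1. A small scalar comparison against `tanhf` (illustrative only):

#include <math.h>
#include <stdio.h>

static float tanh_rational(float x)
{
    float x2 = x * x;
    float a = x * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
    float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
    return a / b;
}

int main(void)
{
    for (int i = 0; i <= 4; i++) {
        float x = (float)i;
        printf("x=%d  rational=%.7f  tanhf=%.7f\n", i, tanh_rational(x), tanhf(x));
    }
    return 0;
}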
-
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
static inline void
-volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector,
- unsigned int num_points)
+volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- float* cPtr = cVector;
- const float* aPtr = aVector;
-
- __m128 aVal, cVal, x2, a, b;
- __m128 const1, const2, const3, const4, const5, const6;
- const1 = _mm_set_ps1(135135.0f);
- const2 = _mm_set_ps1(17325.0f);
- const3 = _mm_set_ps1(378.0f);
- const4 = _mm_set_ps1(62370.0f);
- const5 = _mm_set_ps1(3150.0f);
- const6 = _mm_set_ps1(28.0f);
- for(;number < quarterPoints; number++){
-
- aVal = _mm_load_ps(aPtr);
- x2 = _mm_mul_ps(aVal, aVal);
- a = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
- b = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
-
- cVal = _mm_div_ps(a, b);
-
- _mm_store_ps(cPtr, cVal); // Store the results back into the C container
-
- aPtr += 4;
- cPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++) {
- if(*aPtr > 4.97)
- *cPtr++ = 1;
- else if(*aPtr <= -4.97)
- *cPtr++ = -1;
- else {
- float x2 = (*aPtr) * (*aPtr);
- float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
- float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
- *cPtr++ = a / b;
- aPtr++;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m128 aVal, cVal, x2, a, b;
+ __m128 const1, const2, const3, const4, const5, const6;
+ const1 = _mm_set_ps1(135135.0f);
+ const2 = _mm_set_ps1(17325.0f);
+ const3 = _mm_set_ps1(378.0f);
+ const4 = _mm_set_ps1(62370.0f);
+ const5 = _mm_set_ps1(3150.0f);
+ const6 = _mm_set_ps1(28.0f);
+ for (; number < quarterPoints; number++) {
+
+ aVal = _mm_load_ps(aPtr);
+ x2 = _mm_mul_ps(aVal, aVal);
+ a = _mm_mul_ps(
+ aVal,
+ _mm_add_ps(
+ const1,
+ _mm_mul_ps(x2,
+ _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
+ b = _mm_add_ps(
+ const1,
+ _mm_mul_ps(
+ x2,
+ _mm_add_ps(const4,
+ _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
+
+ cVal = _mm_div_ps(a, b);
+
+ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ if (*aPtr > 4.97)
+ *cPtr++ = 1;
+ else if (*aPtr <= -4.97)
+ *cPtr++ = -1;
+ else {
+ float x2 = (*aPtr) * (*aPtr);
+ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+ *cPtr++ = a / b;
+ aPtr++;
+ }
}
- }
}
#endif /* LV_HAVE_SSE */
#include <immintrin.h>
static inline void
-volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector,
- unsigned int num_points)
+volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- float* cPtr = cVector;
- const float* aPtr = aVector;
-
- __m256 aVal, cVal, x2, a, b;
- __m256 const1, const2, const3, const4, const5, const6;
- const1 = _mm256_set1_ps(135135.0f);
- const2 = _mm256_set1_ps(17325.0f);
- const3 = _mm256_set1_ps(378.0f);
- const4 = _mm256_set1_ps(62370.0f);
- const5 = _mm256_set1_ps(3150.0f);
- const6 = _mm256_set1_ps(28.0f);
- for(;number < eighthPoints; number++){
-
- aVal = _mm256_load_ps(aPtr);
- x2 = _mm256_mul_ps(aVal, aVal);
- a = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
- b = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
-
- cVal = _mm256_div_ps(a, b);
-
- _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
-
- aPtr += 8;
- cPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++) {
- if(*aPtr > 4.97)
- *cPtr++ = 1;
- else if(*aPtr <= -4.97)
- *cPtr++ = -1;
- else {
- float x2 = (*aPtr) * (*aPtr);
- float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
- float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
- *cPtr++ = a / b;
- aPtr++;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m256 aVal, cVal, x2, a, b;
+ __m256 const1, const2, const3, const4, const5, const6;
+ const1 = _mm256_set1_ps(135135.0f);
+ const2 = _mm256_set1_ps(17325.0f);
+ const3 = _mm256_set1_ps(378.0f);
+ const4 = _mm256_set1_ps(62370.0f);
+ const5 = _mm256_set1_ps(3150.0f);
+ const6 = _mm256_set1_ps(28.0f);
+ for (; number < eighthPoints; number++) {
+
+ aVal = _mm256_load_ps(aPtr);
+ x2 = _mm256_mul_ps(aVal, aVal);
+ a = _mm256_mul_ps(
+ aVal,
+ _mm256_add_ps(
+ const1,
+ _mm256_mul_ps(
+ x2,
+ _mm256_add_ps(const2,
+ _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
+ b = _mm256_add_ps(
+ const1,
+ _mm256_mul_ps(
+ x2,
+ _mm256_add_ps(
+ const4,
+ _mm256_mul_ps(x2,
+ _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
+
+ cVal = _mm256_div_ps(a, b);
+
+ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ if (*aPtr > 4.97)
+ *cPtr++ = 1;
+ else if (*aPtr <= -4.97)
+ *cPtr++ = -1;
+ else {
+ float x2 = (*aPtr) * (*aPtr);
+ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+ *cPtr++ = a / b;
+ aPtr++;
+ }
}
- }
}
#endif /* LV_HAVE_AVX */
#include <immintrin.h>
static inline void
-volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector,
- unsigned int num_points)
+volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector, unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- float* cPtr = cVector;
- const float* aPtr = aVector;
-
- __m256 aVal, cVal, x2, a, b;
- __m256 const1, const2, const3, const4, const5, const6;
- const1 = _mm256_set1_ps(135135.0f);
- const2 = _mm256_set1_ps(17325.0f);
- const3 = _mm256_set1_ps(378.0f);
- const4 = _mm256_set1_ps(62370.0f);
- const5 = _mm256_set1_ps(3150.0f);
- const6 = _mm256_set1_ps(28.0f);
- for(;number < eighthPoints; number++){
-
- aVal = _mm256_load_ps(aPtr);
- x2 = _mm256_mul_ps(aVal, aVal);
- a = _mm256_mul_ps(aVal, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2),const1));
- b = _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);
-
- cVal = _mm256_div_ps(a, b);
-
- _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
-
- aPtr += 8;
- cPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++) {
- if(*aPtr > 4.97)
- *cPtr++ = 1;
- else if(*aPtr <= -4.97)
- *cPtr++ = -1;
- else {
- float x2 = (*aPtr) * (*aPtr);
- float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
- float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
- *cPtr++ = a / b;
- aPtr++;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m256 aVal, cVal, x2, a, b;
+ __m256 const1, const2, const3, const4, const5, const6;
+ const1 = _mm256_set1_ps(135135.0f);
+ const2 = _mm256_set1_ps(17325.0f);
+ const3 = _mm256_set1_ps(378.0f);
+ const4 = _mm256_set1_ps(62370.0f);
+ const5 = _mm256_set1_ps(3150.0f);
+ const6 = _mm256_set1_ps(28.0f);
+ for (; number < eighthPoints; number++) {
+
+ aVal = _mm256_load_ps(aPtr);
+ x2 = _mm256_mul_ps(aVal, aVal);
+ a = _mm256_mul_ps(
+ aVal,
+ _mm256_fmadd_ps(
+ x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
+ b = _mm256_fmadd_ps(
+ x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);
+
+ cVal = _mm256_div_ps(a, b);
+
+ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ if (*aPtr > 4.97)
+ *cPtr++ = 1;
+ else if (*aPtr <= -4.97)
+ *cPtr++ = -1;
+ else {
+ float x2 = (*aPtr) * (*aPtr);
+ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+ *cPtr++ = a / b;
+ aPtr++;
+ }
}
- }
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
#define INCLUDED_volk_32f_tanh_32f_u_H
#include <inttypes.h>
-#include <stdio.h>
#include <math.h>
+#include <stdio.h>
#include <string.h>
#include <xmmintrin.h>
static inline void
-volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector,
- unsigned int num_points)
+volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector, unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- float* cPtr = cVector;
- const float* aPtr = aVector;
-
- __m128 aVal, cVal, x2, a, b;
- __m128 const1, const2, const3, const4, const5, const6;
- const1 = _mm_set_ps1(135135.0f);
- const2 = _mm_set_ps1(17325.0f);
- const3 = _mm_set_ps1(378.0f);
- const4 = _mm_set_ps1(62370.0f);
- const5 = _mm_set_ps1(3150.0f);
- const6 = _mm_set_ps1(28.0f);
- for(;number < quarterPoints; number++){
-
- aVal = _mm_loadu_ps(aPtr);
- x2 = _mm_mul_ps(aVal, aVal);
- a = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
- b = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
-
- cVal = _mm_div_ps(a, b);
-
- _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
-
- aPtr += 4;
- cPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++) {
- if(*aPtr > 4.97)
- *cPtr++ = 1;
- else if(*aPtr <= -4.97)
- *cPtr++ = -1;
- else {
- float x2 = (*aPtr) * (*aPtr);
- float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
- float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
- *cPtr++ = a / b;
- aPtr++;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m128 aVal, cVal, x2, a, b;
+ __m128 const1, const2, const3, const4, const5, const6;
+ const1 = _mm_set_ps1(135135.0f);
+ const2 = _mm_set_ps1(17325.0f);
+ const3 = _mm_set_ps1(378.0f);
+ const4 = _mm_set_ps1(62370.0f);
+ const5 = _mm_set_ps1(3150.0f);
+ const6 = _mm_set_ps1(28.0f);
+ for (; number < quarterPoints; number++) {
+
+ aVal = _mm_loadu_ps(aPtr);
+ x2 = _mm_mul_ps(aVal, aVal);
+ a = _mm_mul_ps(
+ aVal,
+ _mm_add_ps(
+ const1,
+ _mm_mul_ps(x2,
+ _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
+ b = _mm_add_ps(
+ const1,
+ _mm_mul_ps(
+ x2,
+ _mm_add_ps(const4,
+ _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
+
+ cVal = _mm_div_ps(a, b);
+
+ _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ if (*aPtr > 4.97)
+ *cPtr++ = 1;
+ else if (*aPtr <= -4.97)
+ *cPtr++ = -1;
+ else {
+ float x2 = (*aPtr) * (*aPtr);
+ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+ *cPtr++ = a / b;
+ aPtr++;
+ }
}
- }
}
#endif /* LV_HAVE_SSE */
#include <immintrin.h>
static inline void
-volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector,
- unsigned int num_points)
+volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- float* cPtr = cVector;
- const float* aPtr = aVector;
-
- __m256 aVal, cVal, x2, a, b;
- __m256 const1, const2, const3, const4, const5, const6;
- const1 = _mm256_set1_ps(135135.0f);
- const2 = _mm256_set1_ps(17325.0f);
- const3 = _mm256_set1_ps(378.0f);
- const4 = _mm256_set1_ps(62370.0f);
- const5 = _mm256_set1_ps(3150.0f);
- const6 = _mm256_set1_ps(28.0f);
- for(;number < eighthPoints; number++){
-
- aVal = _mm256_loadu_ps(aPtr);
- x2 = _mm256_mul_ps(aVal, aVal);
- a = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
- b = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
-
- cVal = _mm256_div_ps(a, b);
-
- _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
-
- aPtr += 8;
- cPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++) {
- if(*aPtr > 4.97)
- *cPtr++ = 1;
- else if(*aPtr <= -4.97)
- *cPtr++ = -1;
- else {
- float x2 = (*aPtr) * (*aPtr);
- float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
- float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
- *cPtr++ = a / b;
- aPtr++;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m256 aVal, cVal, x2, a, b;
+ __m256 const1, const2, const3, const4, const5, const6;
+ const1 = _mm256_set1_ps(135135.0f);
+ const2 = _mm256_set1_ps(17325.0f);
+ const3 = _mm256_set1_ps(378.0f);
+ const4 = _mm256_set1_ps(62370.0f);
+ const5 = _mm256_set1_ps(3150.0f);
+ const6 = _mm256_set1_ps(28.0f);
+ for (; number < eighthPoints; number++) {
+
+ aVal = _mm256_loadu_ps(aPtr);
+ x2 = _mm256_mul_ps(aVal, aVal);
+ a = _mm256_mul_ps(
+ aVal,
+ _mm256_add_ps(
+ const1,
+ _mm256_mul_ps(
+ x2,
+ _mm256_add_ps(const2,
+ _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
+ b = _mm256_add_ps(
+ const1,
+ _mm256_mul_ps(
+ x2,
+ _mm256_add_ps(
+ const4,
+ _mm256_mul_ps(x2,
+ _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
+
+ cVal = _mm256_div_ps(a, b);
+
+ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ if (*aPtr > 4.97)
+ *cPtr++ = 1;
+ else if (*aPtr <= -4.97)
+ *cPtr++ = -1;
+ else {
+ float x2 = (*aPtr) * (*aPtr);
+ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+ *cPtr++ = a / b;
+ aPtr++;
+ }
}
- }
}
#endif /* LV_HAVE_AVX */
#include <immintrin.h>
static inline void
-volk_32f_tanh_32f_u_avx_fma(float* cVector, const float* aVector,
- unsigned int num_points)
+volk_32f_tanh_32f_u_avx_fma(float* cVector, const float* aVector, unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- float* cPtr = cVector;
- const float* aPtr = aVector;
-
- __m256 aVal, cVal, x2, a, b;
- __m256 const1, const2, const3, const4, const5, const6;
- const1 = _mm256_set1_ps(135135.0f);
- const2 = _mm256_set1_ps(17325.0f);
- const3 = _mm256_set1_ps(378.0f);
- const4 = _mm256_set1_ps(62370.0f);
- const5 = _mm256_set1_ps(3150.0f);
- const6 = _mm256_set1_ps(28.0f);
- for(;number < eighthPoints; number++){
-
- aVal = _mm256_loadu_ps(aPtr);
- x2 = _mm256_mul_ps(aVal, aVal);
- a = _mm256_mul_ps(aVal, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2),const1));
- b = _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);
-
- cVal = _mm256_div_ps(a, b);
-
- _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
-
- aPtr += 8;
- cPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++) {
- if(*aPtr > 4.97)
- *cPtr++ = 1;
- else if(*aPtr <= -4.97)
- *cPtr++ = -1;
- else {
- float x2 = (*aPtr) * (*aPtr);
- float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
- float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
- *cPtr++ = a / b;
- aPtr++;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m256 aVal, cVal, x2, a, b;
+ __m256 const1, const2, const3, const4, const5, const6;
+ const1 = _mm256_set1_ps(135135.0f);
+ const2 = _mm256_set1_ps(17325.0f);
+ const3 = _mm256_set1_ps(378.0f);
+ const4 = _mm256_set1_ps(62370.0f);
+ const5 = _mm256_set1_ps(3150.0f);
+ const6 = _mm256_set1_ps(28.0f);
+ for (; number < eighthPoints; number++) {
+
+ aVal = _mm256_loadu_ps(aPtr);
+ x2 = _mm256_mul_ps(aVal, aVal);
+ a = _mm256_mul_ps(
+ aVal,
+ _mm256_fmadd_ps(
+ x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
+ b = _mm256_fmadd_ps(
+ x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);
+
+ cVal = _mm256_div_ps(a, b);
+
+ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ if (*aPtr > 4.97)
+ *cPtr++ = 1;
+ else if (*aPtr <= -4.97)
+ *cPtr++ = -1;
+ else {
+ float x2 = (*aPtr) * (*aPtr);
+ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+ *cPtr++ = a / b;
+ aPtr++;
+ }
}
- }
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
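For orientation, every tanh kernel above (the SIMD loops and the scalar tails alike) evaluates the same rational approximation; reading the coefficients off the scalar tail gives

\[
  \tanh(x) \approx \frac{x\,(135135 + 17325 x^2 + 378 x^4 + x^6)}{135135 + 62370 x^2 + 3150 x^4 + 28 x^6}
\]

with the output clamped to +1 or -1 once |x| exceeds 4.97.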
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_x2_add_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
- * \endcode
+ * void volk_32f_x2_add_32f(float* cVector, const float* aVector, const float* bVector,
+ * unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li aVector: First vector of input points.
*
* \b Example
*
- * The follow example adds the increasing and decreasing vectors such that the result of every summation pair is 10
+ * The following example adds the increasing and decreasing vectors such that the
+ * result of every summation pair is 10.
*
* \code
* int N = 10;
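Since the Doxygen example above is cut off by the hunk, here is a minimal, self-contained sketch of the same usage pattern. It assumes only the public VOLK API (volk_get_alignment, volk_malloc/volk_free, and the volk_32f_x2_add_32f dispatcher) and is an illustration rather than text from the patched file:

#include <volk/volk.h>
#include <stdio.h>

int main(void)
{
    const unsigned int N = 10;
    const size_t alignment = volk_get_alignment();

    /* volk_malloc returns buffers aligned for the "_a_" kernels above */
    float* increasing = (float*)volk_malloc(sizeof(float) * N, alignment);
    float* decreasing = (float*)volk_malloc(sizeof(float) * N, alignment);
    float* out = (float*)volk_malloc(sizeof(float) * N, alignment);

    for (unsigned int ii = 0; ii < N; ++ii) {
        increasing[ii] = (float)ii;
        decreasing[ii] = 10.f - (float)ii;
    }

    /* the dispatcher picks the best available implementation at run time */
    volk_32f_x2_add_32f(out, increasing, decreasing, N);

    for (unsigned int ii = 0; ii < N; ++ii)
        printf("out[%u] = %1.2f\n", ii, out[ii]); /* every pair sums to 10 */

    volk_free(increasing);
    volk_free(decreasing);
    volk_free(out);
    return 0;
}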
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
-static inline void
-volk_32f_x2_add_32f_u_avx512f(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_add_32f_u_avx512f(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
- __m512 aVal, bVal, cVal;
- for(;number < sixteenthPoints; number++){
+ __m512 aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
- aVal = _mm512_loadu_ps(aPtr);
- bVal = _mm512_loadu_ps(bPtr);
+ aVal = _mm512_loadu_ps(aPtr);
+ bVal = _mm512_loadu_ps(bPtr);
- cVal = _mm512_add_ps(aVal, bVal);
+ cVal = _mm512_add_ps(aVal, bVal);
- _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container
+ _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 16;
- bPtr += 16;
- cPtr += 16;
- }
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
- number = sixteenthPoints * 16;
+ number = sixteenthPoints * 16;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) + (*bPtr++);
- }
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32f_x2_add_32f_u_avx(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_add_32f_u_avx(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
- __m256 aVal, bVal, cVal;
- for(;number < eighthPoints; number++){
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+ __m256 aVal, bVal, cVal;
+ for (; number < eighthPoints; number++) {
- aVal = _mm256_loadu_ps(aPtr);
- bVal = _mm256_loadu_ps(bPtr);
+ aVal = _mm256_loadu_ps(aPtr);
+ bVal = _mm256_loadu_ps(bPtr);
- cVal = _mm256_add_ps(aVal, bVal);
+ cVal = _mm256_add_ps(aVal, bVal);
- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
+ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 8;
- bPtr += 8;
- cPtr += 8;
- }
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
- number = eighthPoints * 8;
+ number = eighthPoints * 8;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) + (*bPtr++);
- }
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_32f_x2_add_32f_u_sse(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_add_32f_u_sse(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
- __m128 aVal, bVal, cVal;
- for(;number < quarterPoints; number++){
+ __m128 aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
- aVal = _mm_loadu_ps(aPtr);
- bVal = _mm_loadu_ps(bPtr);
+ aVal = _mm_loadu_ps(aPtr);
+ bVal = _mm_loadu_ps(bPtr);
- cVal = _mm_add_ps(aVal, bVal);
+ cVal = _mm_add_ps(aVal, bVal);
- _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container
+ _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) + (*bPtr++);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_x2_add_32f_generic(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_add_32f_generic(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *cPtr++ = (*aPtr++) + (*bPtr++);
- }
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
-static inline void
-volk_32f_x2_add_32f_a_avx512f(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_add_32f_a_avx512f(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
- __m512 aVal, bVal, cVal;
- for(;number < sixteenthPoints; number++){
+ __m512 aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
- aVal = _mm512_load_ps(aPtr);
- bVal = _mm512_load_ps(bPtr);
+ aVal = _mm512_load_ps(aPtr);
+ bVal = _mm512_load_ps(bPtr);
- cVal = _mm512_add_ps(aVal, bVal);
+ cVal = _mm512_add_ps(aVal, bVal);
- _mm512_store_ps(cPtr,cVal); // Store the results back into the C container
+ _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 16;
- bPtr += 16;
- cPtr += 16;
- }
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
- number = sixteenthPoints * 16;
+ number = sixteenthPoints * 16;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) + (*bPtr++);
- }
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32f_x2_add_32f_a_avx(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_add_32f_a_avx(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
- __m256 aVal, bVal, cVal;
- for(;number < eighthPoints; number++){
+ __m256 aVal, bVal, cVal;
+ for (; number < eighthPoints; number++) {
- aVal = _mm256_load_ps(aPtr);
- bVal = _mm256_load_ps(bPtr);
+ aVal = _mm256_load_ps(aPtr);
+ bVal = _mm256_load_ps(bPtr);
- cVal = _mm256_add_ps(aVal, bVal);
+ cVal = _mm256_add_ps(aVal, bVal);
- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container
+ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 8;
- bPtr += 8;
- cPtr += 8;
- }
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) + (*bPtr++);
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_32f_x2_add_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_add_32f_a_sse(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
- __m128 aVal, bVal, cVal;
- for(;number < quarterPoints; number++){
- aVal = _mm_load_ps(aPtr);
- bVal = _mm_load_ps(bPtr);
+ __m128 aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
- cVal = _mm_add_ps(aVal, bVal);
+ cVal = _mm_add_ps(aVal, bVal);
- _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) + (*bPtr++);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_32f_x2_add_32f_u_neon(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_add_32f_u_neon(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
- float32x4_t aVal, bVal, cVal;
- for(number=0; number < quarterPoints; number++){
- // Load in to NEON registers
- aVal = vld1q_f32(aPtr);
- bVal = vld1q_f32(bPtr);
- __VOLK_PREFETCH(aPtr+4);
- __VOLK_PREFETCH(bPtr+4);
-
- // vector add
- cVal = vaddq_f32(aVal, bVal);
- // Store the results back into the C container
- vst1q_f32(cPtr,cVal);
-
- aPtr += 4; // q uses quadwords, 4 floats per vadd
- bPtr += 4;
- cPtr += 4;
- }
-
- number = quarterPoints * 4; // should be = num_points
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) + (*bPtr++);
- }
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+ float32x4_t aVal, bVal, cVal;
+ for (number = 0; number < quarterPoints; number++) {
+ // Load in to NEON registers
+ aVal = vld1q_f32(aPtr);
+ bVal = vld1q_f32(bPtr);
+ __VOLK_PREFETCH(aPtr + 4);
+ __VOLK_PREFETCH(bPtr + 4);
+
+ // vector add
+ cVal = vaddq_f32(aVal, bVal);
+ // Store the results back into the C container
+ vst1q_f32(cPtr, cVal);
+
+ aPtr += 4; // q uses quadwords, 4 floats per vadd
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4; // should be = num_points
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_NEONV7
-extern void volk_32f_x2_add_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+extern void volk_32f_x2_add_32f_a_neonasm(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points);
#endif /* LV_HAVE_NEONV7 */
#ifdef LV_HAVE_NEONV7
-extern void volk_32f_x2_add_32f_a_neonpipeline(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+extern void volk_32f_x2_add_32f_a_neonpipeline(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points);
#endif /* LV_HAVE_NEONV7 */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_x2_add_32f_a_generic(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_add_32f_a_generic(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *cPtr++ = (*aPtr++) + (*bPtr++);
- }
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_ORC
-extern void
-volk_32f_x2_add_32f_a_orc_impl(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points);
+extern void volk_32f_x2_add_32f_a_orc_impl(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points);
-static inline void
-volk_32f_x2_add_32f_u_orc(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points){
- volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+static inline void volk_32f_x2_add_32f_u_orc(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_x2_divide_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
- * \endcode
+ * void volk_32f_x2_divide_32f(float* cVector, const float* aVector, const float* bVector,
+ * unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li aVector: First vector of input points.
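The divide kernel shares the (out, a, b, num_points) calling convention shown in the prototype above. A small hedged sketch, assuming only the volk_32f_x2_divide_32f dispatcher from the public API, normalizing one vector by another:

#include <volk/volk.h>

/* Illustrative only: out[i] = measured[i] / reference[i].
 * "measured" and "reference" are caller-provided buffers of length num_points. */
static void normalize_gain(float* out,
                           const float* measured,
                           const float* reference,
                           unsigned int num_points)
{
    volk_32f_x2_divide_32f(out, measured, reference, num_points);
}

As in the scalar fallback, none of the code paths above guard against zero divisors, so callers are expected to pass well-conditioned data.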
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
-static inline void
-volk_32f_x2_divide_32f_a_avx512f(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_divide_32f_a_avx512f(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
- __m512 aVal, bVal, cVal;
- for(;number < sixteenthPoints; number++){
- aVal = _mm512_load_ps(aPtr);
- bVal = _mm512_load_ps(bPtr);
+ __m512 aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
+ aVal = _mm512_load_ps(aPtr);
+ bVal = _mm512_load_ps(bPtr);
- cVal = _mm512_div_ps(aVal, bVal);
+ cVal = _mm512_div_ps(aVal, bVal);
- _mm512_store_ps(cPtr,cVal); // Store the results back into the C container
+ _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 16;
- bPtr += 16;
- cPtr += 16;
- }
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
- number = sixteenthPoints * 16;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) / (*bPtr++);
- }
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) / (*bPtr++);
+ }
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32f_x2_divide_32f_a_avx(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_divide_32f_a_avx(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
- __m256 aVal, bVal, cVal;
- for(;number < eighthPoints; number++){
- aVal = _mm256_load_ps(aPtr);
- bVal = _mm256_load_ps(bPtr);
+ __m256 aVal, bVal, cVal;
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ bVal = _mm256_load_ps(bPtr);
- cVal = _mm256_div_ps(aVal, bVal);
+ cVal = _mm256_div_ps(aVal, bVal);
- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container
+ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 8;
- bPtr += 8;
- cPtr += 8;
- }
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) / (*bPtr++);
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) / (*bPtr++);
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_32f_x2_divide_32f_a_sse(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_divide_32f_a_sse(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
- __m128 aVal, bVal, cVal;
- for(;number < quarterPoints; number++){
- aVal = _mm_load_ps(aPtr);
- bVal = _mm_load_ps(bPtr);
+ __m128 aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
- cVal = _mm_div_ps(aVal, bVal);
+ cVal = _mm_div_ps(aVal, bVal);
- _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) / (*bPtr++);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) / (*bPtr++);
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_32f_x2_divide_32f_neon(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_divide_32f_neon(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr = bVector;
-
- float32x4x4_t aVal, bVal, bInv, cVal;
-
- const unsigned int eighthPoints = num_points / 16;
- unsigned int number = 0;
- for(; number < eighthPoints; number++){
- aVal = vld4q_f32(aPtr);
- aPtr += 16;
- bVal = vld4q_f32(bPtr);
- bPtr += 16;
-
- __VOLK_PREFETCH(aPtr+16);
- __VOLK_PREFETCH(bPtr+16);
-
- bInv.val[0] = vrecpeq_f32(bVal.val[0]);
- bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0]));
- bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0]));
- cVal.val[0] = vmulq_f32(aVal.val[0], bInv.val[0]);
-
- bInv.val[1] = vrecpeq_f32(bVal.val[1]);
- bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1]));
- bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1]));
- cVal.val[1] = vmulq_f32(aVal.val[1], bInv.val[1]);
-
- bInv.val[2] = vrecpeq_f32(bVal.val[2]);
- bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2]));
- bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2]));
- cVal.val[2] = vmulq_f32(aVal.val[2], bInv.val[2]);
-
- bInv.val[3] = vrecpeq_f32(bVal.val[3]);
- bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3]));
- bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3]));
- cVal.val[3] = vmulq_f32(aVal.val[3], bInv.val[3]);
-
- vst4q_f32(cPtr, cVal);
- cPtr += 16;
- }
-
- for(number = eighthPoints * 16; number < num_points; number++){
- *cPtr++ = (*aPtr++) / (*bPtr++);
- }
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ float32x4x4_t aVal, bVal, bInv, cVal;
+
+ const unsigned int eighthPoints = num_points / 16;
+ unsigned int number = 0;
+ for (; number < eighthPoints; number++) {
+ aVal = vld4q_f32(aPtr);
+ aPtr += 16;
+ bVal = vld4q_f32(bPtr);
+ bPtr += 16;
+
+ __VOLK_PREFETCH(aPtr + 16);
+ __VOLK_PREFETCH(bPtr + 16);
+
+ bInv.val[0] = vrecpeq_f32(bVal.val[0]);
+ bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0]));
+ bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0]));
+ cVal.val[0] = vmulq_f32(aVal.val[0], bInv.val[0]);
+
+ bInv.val[1] = vrecpeq_f32(bVal.val[1]);
+ bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1]));
+ bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1]));
+ cVal.val[1] = vmulq_f32(aVal.val[1], bInv.val[1]);
+
+ bInv.val[2] = vrecpeq_f32(bVal.val[2]);
+ bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2]));
+ bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2]));
+ cVal.val[2] = vmulq_f32(aVal.val[2], bInv.val[2]);
+
+ bInv.val[3] = vrecpeq_f32(bVal.val[3]);
+ bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3]));
+ bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3]));
+ cVal.val[3] = vmulq_f32(aVal.val[3], bInv.val[3]);
+
+ vst4q_f32(cPtr, cVal);
+ cPtr += 16;
+ }
+
+ for (number = eighthPoints * 16; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) / (*bPtr++);
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_x2_divide_32f_generic(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_divide_32f_generic(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *cPtr++ = (*aPtr++) / (*bPtr++);
- }
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) / (*bPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_ORC
-extern void
-volk_32f_x2_divide_32f_a_orc_impl(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points);
+extern void volk_32f_x2_divide_32f_a_orc_impl(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points);
-static inline void
-volk_32f_x2_divide_32f_u_orc(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_divide_32f_u_orc(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- volk_32f_x2_divide_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+ volk_32f_x2_divide_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */
-
#endif /* INCLUDED_volk_32f_x2_divide_32f_a_H */
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
-static inline void
-volk_32f_x2_divide_32f_u_avx512f(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_divide_32f_u_avx512f(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
- __m512 aVal, bVal, cVal;
- for(;number < sixteenthPoints; number++){
- aVal = _mm512_loadu_ps(aPtr);
- bVal = _mm512_loadu_ps(bPtr);
+ __m512 aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
+ aVal = _mm512_loadu_ps(aPtr);
+ bVal = _mm512_loadu_ps(bPtr);
- cVal = _mm512_div_ps(aVal, bVal);
+ cVal = _mm512_div_ps(aVal, bVal);
- _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container
+ _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 16;
- bPtr += 16;
- cPtr += 16;
- }
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
- number = sixteenthPoints * 16;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) / (*bPtr++);
- }
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) / (*bPtr++);
+ }
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32f_x2_divide_32f_u_avx(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_divide_32f_u_avx(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
- __m256 aVal, bVal, cVal;
- for(;number < eighthPoints; number++){
- aVal = _mm256_loadu_ps(aPtr);
- bVal = _mm256_loadu_ps(bPtr);
+ __m256 aVal, bVal, cVal;
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ bVal = _mm256_loadu_ps(bPtr);
- cVal = _mm256_div_ps(aVal, bVal);
+ cVal = _mm256_div_ps(aVal, bVal);
- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
+ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 8;
- bPtr += 8;
- cPtr += 8;
- }
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) / (*bPtr++);
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) / (*bPtr++);
+ }
}
#endif /* LV_HAVE_AVX */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_x2_dot_prod_16i(int16_t* result, const float* input, const float* taps, unsigned int num_points)
- * \endcode
+ * void volk_32f_x2_dot_prod_16i(int16_t* result, const float* input, const float* taps,
+ * unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li input: vector of floats.
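A hedged usage sketch for the dot-product kernel documented above; it assumes only the dispatcher signature shown in the prototype (int16_t* result, const float* input, const float* taps, unsigned int num_points), for example to produce one output sample of a short FIR filter:

#include <volk/volk.h>
#include <stdint.h>

/* Illustrative only: one filtered output sample as the dot product of a window
 * of input samples with the filter taps, truncated to int16_t exactly as the
 * kernels above do with their final (short) cast. */
static int16_t fir_output_sample(const float* window,
                                 const float* taps,
                                 unsigned int ntaps)
{
    int16_t result = 0;
    volk_32f_x2_dot_prod_16i(&result, window, taps, ntaps);
    return result;
}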
#ifndef INCLUDED_volk_32f_x2_dot_prod_16i_H
#define INCLUDED_volk_32f_x2_dot_prod_16i_H
-#include <volk/volk_common.h>
#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_GENERIC
-static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr= taps;
- unsigned int number = 0;
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+ unsigned int number = 0;
- for(number = 0; number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
+ for (number = 0; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
- *result = (int16_t)dotProduct;
+ *result = (int16_t)dotProduct;
}
#endif /*LV_HAVE_GENERIC*/
#ifdef LV_HAVE_SSE
-static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
-
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr = taps;
-
- __m128 a0Val, a1Val, a2Val, a3Val;
- __m128 b0Val, b1Val, b2Val, b3Val;
- __m128 c0Val, c1Val, c2Val, c3Val;
-
- __m128 dotProdVal0 = _mm_setzero_ps();
- __m128 dotProdVal1 = _mm_setzero_ps();
- __m128 dotProdVal2 = _mm_setzero_ps();
- __m128 dotProdVal3 = _mm_setzero_ps();
-
- for(;number < sixteenthPoints; number++){
-
- a0Val = _mm_load_ps(aPtr);
- a1Val = _mm_load_ps(aPtr+4);
- a2Val = _mm_load_ps(aPtr+8);
- a3Val = _mm_load_ps(aPtr+12);
- b0Val = _mm_load_ps(bPtr);
- b1Val = _mm_load_ps(bPtr+4);
- b2Val = _mm_load_ps(bPtr+8);
- b3Val = _mm_load_ps(bPtr+12);
-
- c0Val = _mm_mul_ps(a0Val, b0Val);
- c1Val = _mm_mul_ps(a1Val, b1Val);
- c2Val = _mm_mul_ps(a2Val, b2Val);
- c3Val = _mm_mul_ps(a3Val, b3Val);
-
- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
-
- aPtr += 16;
- bPtr += 16;
- }
-
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
-
- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
-
- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
- dotProduct = dotProductVector[0];
- dotProduct += dotProductVector[1];
- dotProduct += dotProductVector[2];
- dotProduct += dotProductVector[3];
-
- number = sixteenthPoints*16;
- for(;number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
-
- *result = (short)dotProduct;
+static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ a0Val = _mm_load_ps(aPtr);
+ a1Val = _mm_load_ps(aPtr + 4);
+ a2Val = _mm_load_ps(aPtr + 8);
+ a3Val = _mm_load_ps(aPtr + 12);
+ b0Val = _mm_load_ps(bPtr);
+ b1Val = _mm_load_ps(bPtr + 4);
+ b2Val = _mm_load_ps(bPtr + 8);
+ b3Val = _mm_load_ps(bPtr + 12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+ _mm_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = (short)dotProduct;
}
#endif /*LV_HAVE_SSE*/
#if LV_HAVE_AVX2 && LV_HAVE_FMA
-static inline void volk_32f_x2_dot_prod_16i_a_avx2_fma(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
-
- unsigned int number = 0;
- const unsigned int thirtysecondPoints = num_points / 32;
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr = taps;
-
- __m256 a0Val, a1Val, a2Val, a3Val;
- __m256 b0Val, b1Val, b2Val, b3Val;
-
- __m256 dotProdVal0 = _mm256_setzero_ps();
- __m256 dotProdVal1 = _mm256_setzero_ps();
- __m256 dotProdVal2 = _mm256_setzero_ps();
- __m256 dotProdVal3 = _mm256_setzero_ps();
-
- for(;number < thirtysecondPoints; number++){
-
- a0Val = _mm256_load_ps(aPtr);
- a1Val = _mm256_load_ps(aPtr+8);
- a2Val = _mm256_load_ps(aPtr+16);
- a3Val = _mm256_load_ps(aPtr+24);
- b0Val = _mm256_load_ps(bPtr);
- b1Val = _mm256_load_ps(bPtr+8);
- b2Val = _mm256_load_ps(bPtr+16);
- b3Val = _mm256_load_ps(bPtr+24);
-
- dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
- dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
- dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
- dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
-
- aPtr += 32;
- bPtr += 32;
- }
-
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
-
- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
-
- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
- dotProduct = dotProductVector[0];
- dotProduct += dotProductVector[1];
- dotProduct += dotProductVector[2];
- dotProduct += dotProductVector[3];
- dotProduct += dotProductVector[4];
- dotProduct += dotProductVector[5];
- dotProduct += dotProductVector[6];
- dotProduct += dotProductVector[7];
-
- number = thirtysecondPoints*32;
- for(;number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
-
- *result = (short)dotProduct;
+static inline void volk_32f_x2_dot_prod_16i_a_avx2_fma(int16_t* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int thirtysecondPoints = num_points / 32;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val, a2Val, a3Val;
+ __m256 b0Val, b1Val, b2Val, b3Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 dotProdVal2 = _mm256_setzero_ps();
+ __m256 dotProdVal3 = _mm256_setzero_ps();
+
+ for (; number < thirtysecondPoints; number++) {
+
+ a0Val = _mm256_load_ps(aPtr);
+ a1Val = _mm256_load_ps(aPtr + 8);
+ a2Val = _mm256_load_ps(aPtr + 16);
+ a3Val = _mm256_load_ps(aPtr + 24);
+ b0Val = _mm256_load_ps(bPtr);
+ b1Val = _mm256_load_ps(bPtr + 8);
+ b2Val = _mm256_load_ps(bPtr + 16);
+ b3Val = _mm256_load_ps(bPtr + 24);
+
+ dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
+ dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
+ dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
+ dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
+
+ aPtr += 32;
+ bPtr += 32;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ dotProduct += dotProductVector[4];
+ dotProduct += dotProductVector[5];
+ dotProduct += dotProductVector[6];
+ dotProduct += dotProductVector[7];
+
+ number = thirtysecondPoints * 32;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = (short)dotProduct;
}
#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
#ifdef LV_HAVE_AVX
-static inline void volk_32f_x2_dot_prod_16i_a_avx(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
-
- unsigned int number = 0;
- const unsigned int thirtysecondPoints = num_points / 32;
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr = taps;
-
- __m256 a0Val, a1Val, a2Val, a3Val;
- __m256 b0Val, b1Val, b2Val, b3Val;
- __m256 c0Val, c1Val, c2Val, c3Val;
-
- __m256 dotProdVal0 = _mm256_setzero_ps();
- __m256 dotProdVal1 = _mm256_setzero_ps();
- __m256 dotProdVal2 = _mm256_setzero_ps();
- __m256 dotProdVal3 = _mm256_setzero_ps();
-
- for(;number < thirtysecondPoints; number++){
-
- a0Val = _mm256_load_ps(aPtr);
- a1Val = _mm256_load_ps(aPtr+8);
- a2Val = _mm256_load_ps(aPtr+16);
- a3Val = _mm256_load_ps(aPtr+24);
- b0Val = _mm256_load_ps(bPtr);
- b1Val = _mm256_load_ps(bPtr+8);
- b2Val = _mm256_load_ps(bPtr+16);
- b3Val = _mm256_load_ps(bPtr+24);
-
- c0Val = _mm256_mul_ps(a0Val, b0Val);
- c1Val = _mm256_mul_ps(a1Val, b1Val);
- c2Val = _mm256_mul_ps(a2Val, b2Val);
- c3Val = _mm256_mul_ps(a3Val, b3Val);
-
- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
- dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
- dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
-
- aPtr += 32;
- bPtr += 32;
- }
-
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
-
- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
-
- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
- dotProduct = dotProductVector[0];
- dotProduct += dotProductVector[1];
- dotProduct += dotProductVector[2];
- dotProduct += dotProductVector[3];
- dotProduct += dotProductVector[4];
- dotProduct += dotProductVector[5];
- dotProduct += dotProductVector[6];
- dotProduct += dotProductVector[7];
-
- number = thirtysecondPoints*32;
- for(;number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
-
- *result = (short)dotProduct;
+static inline void volk_32f_x2_dot_prod_16i_a_avx(int16_t* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int thirtysecondPoints = num_points / 32;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val, a2Val, a3Val;
+ __m256 b0Val, b1Val, b2Val, b3Val;
+ __m256 c0Val, c1Val, c2Val, c3Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 dotProdVal2 = _mm256_setzero_ps();
+ __m256 dotProdVal3 = _mm256_setzero_ps();
+
+ for (; number < thirtysecondPoints; number++) {
+
+ a0Val = _mm256_load_ps(aPtr);
+ a1Val = _mm256_load_ps(aPtr + 8);
+ a2Val = _mm256_load_ps(aPtr + 16);
+ a3Val = _mm256_load_ps(aPtr + 24);
+ b0Val = _mm256_load_ps(bPtr);
+ b1Val = _mm256_load_ps(bPtr + 8);
+ b2Val = _mm256_load_ps(bPtr + 16);
+ b3Val = _mm256_load_ps(bPtr + 24);
+
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
+ c2Val = _mm256_mul_ps(a2Val, b2Val);
+ c3Val = _mm256_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 32;
+ bPtr += 32;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ dotProduct += dotProductVector[4];
+ dotProduct += dotProductVector[5];
+ dotProduct += dotProductVector[6];
+ dotProduct += dotProductVector[7];
+
+ number = thirtysecondPoints * 32;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = (short)dotProduct;
}
#endif /*LV_HAVE_AVX*/
#ifdef LV_HAVE_AVX512F
-static inline void volk_32f_x2_dot_prod_16i_a_avx512f(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
-
- unsigned int number = 0;
- const unsigned int sixtyfourthPoints = num_points / 64;
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr = taps;
-
- __m512 a0Val, a1Val, a2Val, a3Val;
- __m512 b0Val, b1Val, b2Val, b3Val;
-
- __m512 dotProdVal0 = _mm512_setzero_ps();
- __m512 dotProdVal1 = _mm512_setzero_ps();
- __m512 dotProdVal2 = _mm512_setzero_ps();
- __m512 dotProdVal3 = _mm512_setzero_ps();
-
- for(;number < sixtyfourthPoints; number++){
-
- a0Val = _mm512_load_ps(aPtr);
- a1Val = _mm512_load_ps(aPtr+16);
- a2Val = _mm512_load_ps(aPtr+32);
- a3Val = _mm512_load_ps(aPtr+48);
- b0Val = _mm512_load_ps(bPtr);
- b1Val = _mm512_load_ps(bPtr+16);
- b2Val = _mm512_load_ps(bPtr+32);
- b3Val = _mm512_load_ps(bPtr+48);
-
- dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
- dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
- dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
- dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
-
- aPtr += 64;
- bPtr += 64;
- }
-
- dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
- dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
- dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
-
- __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
-
- _mm512_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
- dotProduct = dotProductVector[0];
- dotProduct += dotProductVector[1];
- dotProduct += dotProductVector[2];
- dotProduct += dotProductVector[3];
- dotProduct += dotProductVector[4];
- dotProduct += dotProductVector[5];
- dotProduct += dotProductVector[6];
- dotProduct += dotProductVector[7];
- dotProduct += dotProductVector[8];
- dotProduct += dotProductVector[9];
- dotProduct += dotProductVector[10];
- dotProduct += dotProductVector[11];
- dotProduct += dotProductVector[12];
- dotProduct += dotProductVector[13];
- dotProduct += dotProductVector[14];
- dotProduct += dotProductVector[15];
-
- number = sixtyfourthPoints*64;
- for(;number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
-
- *result = (short)dotProduct;
+static inline void volk_32f_x2_dot_prod_16i_a_avx512f(int16_t* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixtyfourthPoints = num_points / 64;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m512 a0Val, a1Val, a2Val, a3Val;
+ __m512 b0Val, b1Val, b2Val, b3Val;
+
+ __m512 dotProdVal0 = _mm512_setzero_ps();
+ __m512 dotProdVal1 = _mm512_setzero_ps();
+ __m512 dotProdVal2 = _mm512_setzero_ps();
+ __m512 dotProdVal3 = _mm512_setzero_ps();
+
+ for (; number < sixtyfourthPoints; number++) {
+
+ a0Val = _mm512_load_ps(aPtr);
+ a1Val = _mm512_load_ps(aPtr + 16);
+ a2Val = _mm512_load_ps(aPtr + 32);
+ a3Val = _mm512_load_ps(aPtr + 48);
+ b0Val = _mm512_load_ps(bPtr);
+ b1Val = _mm512_load_ps(bPtr + 16);
+ b2Val = _mm512_load_ps(bPtr + 32);
+ b3Val = _mm512_load_ps(bPtr + 48);
+
+ dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
+ dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
+ dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
+ dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
+
+ aPtr += 64;
+ bPtr += 64;
+ }
+
+ dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
+
+ _mm512_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ dotProduct += dotProductVector[4];
+ dotProduct += dotProductVector[5];
+ dotProduct += dotProductVector[6];
+ dotProduct += dotProductVector[7];
+ dotProduct += dotProductVector[8];
+ dotProduct += dotProductVector[9];
+ dotProduct += dotProductVector[10];
+ dotProduct += dotProductVector[11];
+ dotProduct += dotProductVector[12];
+ dotProduct += dotProductVector[13];
+ dotProduct += dotProductVector[14];
+ dotProduct += dotProductVector[15];
+
+ number = sixtyfourthPoints * 64;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = (short)dotProduct;
}
#endif /*LV_HAVE_AVX512F*/
#ifdef LV_HAVE_SSE
-static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
-
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr = taps;
-
- __m128 a0Val, a1Val, a2Val, a3Val;
- __m128 b0Val, b1Val, b2Val, b3Val;
- __m128 c0Val, c1Val, c2Val, c3Val;
-
- __m128 dotProdVal0 = _mm_setzero_ps();
- __m128 dotProdVal1 = _mm_setzero_ps();
- __m128 dotProdVal2 = _mm_setzero_ps();
- __m128 dotProdVal3 = _mm_setzero_ps();
-
- for(;number < sixteenthPoints; number++){
-
- a0Val = _mm_loadu_ps(aPtr);
- a1Val = _mm_loadu_ps(aPtr+4);
- a2Val = _mm_loadu_ps(aPtr+8);
- a3Val = _mm_loadu_ps(aPtr+12);
- b0Val = _mm_loadu_ps(bPtr);
- b1Val = _mm_loadu_ps(bPtr+4);
- b2Val = _mm_loadu_ps(bPtr+8);
- b3Val = _mm_loadu_ps(bPtr+12);
-
- c0Val = _mm_mul_ps(a0Val, b0Val);
- c1Val = _mm_mul_ps(a1Val, b1Val);
- c2Val = _mm_mul_ps(a2Val, b2Val);
- c3Val = _mm_mul_ps(a3Val, b3Val);
-
- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
-
- aPtr += 16;
- bPtr += 16;
- }
-
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
-
- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
-
- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
- dotProduct = dotProductVector[0];
- dotProduct += dotProductVector[1];
- dotProduct += dotProductVector[2];
- dotProduct += dotProductVector[3];
-
- number = sixteenthPoints*16;
- for(;number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
-
- *result = (short)dotProduct;
+static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ a0Val = _mm_loadu_ps(aPtr);
+ a1Val = _mm_loadu_ps(aPtr + 4);
+ a2Val = _mm_loadu_ps(aPtr + 8);
+ a3Val = _mm_loadu_ps(aPtr + 12);
+ b0Val = _mm_loadu_ps(bPtr);
+ b1Val = _mm_loadu_ps(bPtr + 4);
+ b2Val = _mm_loadu_ps(bPtr + 8);
+ b3Val = _mm_loadu_ps(bPtr + 12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+ _mm_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = (short)dotProduct;
}
#endif /*LV_HAVE_SSE*/
#if LV_HAVE_AVX2 && LV_HAVE_FMA
-static inline void volk_32f_x2_dot_prod_16i_u_avx2_fma(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
-
- unsigned int number = 0;
- const unsigned int thirtysecondPoints = num_points / 32;
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr = taps;
-
- __m256 a0Val, a1Val, a2Val, a3Val;
- __m256 b0Val, b1Val, b2Val, b3Val;
-
- __m256 dotProdVal0 = _mm256_setzero_ps();
- __m256 dotProdVal1 = _mm256_setzero_ps();
- __m256 dotProdVal2 = _mm256_setzero_ps();
- __m256 dotProdVal3 = _mm256_setzero_ps();
-
- for(;number < thirtysecondPoints; number++){
-
- a0Val = _mm256_loadu_ps(aPtr);
- a1Val = _mm256_loadu_ps(aPtr+8);
- a2Val = _mm256_loadu_ps(aPtr+16);
- a3Val = _mm256_loadu_ps(aPtr+24);
- b0Val = _mm256_loadu_ps(bPtr);
- b1Val = _mm256_loadu_ps(bPtr+8);
- b2Val = _mm256_loadu_ps(bPtr+16);
- b3Val = _mm256_loadu_ps(bPtr+24);
-
- dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
- dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
- dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
- dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
-
- aPtr += 32;
- bPtr += 32;
- }
-
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
-
- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
-
- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
- dotProduct = dotProductVector[0];
- dotProduct += dotProductVector[1];
- dotProduct += dotProductVector[2];
- dotProduct += dotProductVector[3];
- dotProduct += dotProductVector[4];
- dotProduct += dotProductVector[5];
- dotProduct += dotProductVector[6];
- dotProduct += dotProductVector[7];
-
- number = thirtysecondPoints*32;
- for(;number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
-
- *result = (short)dotProduct;
+static inline void volk_32f_x2_dot_prod_16i_u_avx2_fma(int16_t* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int thirtysecondPoints = num_points / 32;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val, a2Val, a3Val;
+ __m256 b0Val, b1Val, b2Val, b3Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 dotProdVal2 = _mm256_setzero_ps();
+ __m256 dotProdVal3 = _mm256_setzero_ps();
+
+ for (; number < thirtysecondPoints; number++) {
+
+ a0Val = _mm256_loadu_ps(aPtr);
+ a1Val = _mm256_loadu_ps(aPtr + 8);
+ a2Val = _mm256_loadu_ps(aPtr + 16);
+ a3Val = _mm256_loadu_ps(aPtr + 24);
+ b0Val = _mm256_loadu_ps(bPtr);
+ b1Val = _mm256_loadu_ps(bPtr + 8);
+ b2Val = _mm256_loadu_ps(bPtr + 16);
+ b3Val = _mm256_loadu_ps(bPtr + 24);
+
+ dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
+ dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
+ dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
+ dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
+
+ aPtr += 32;
+ bPtr += 32;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ dotProduct += dotProductVector[4];
+ dotProduct += dotProductVector[5];
+ dotProduct += dotProductVector[6];
+ dotProduct += dotProductVector[7];
+
+ number = thirtysecondPoints * 32;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = (short)dotProduct;
}
#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
#ifdef LV_HAVE_AVX
-static inline void volk_32f_x2_dot_prod_16i_u_avx(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
-
- unsigned int number = 0;
- const unsigned int thirtysecondPoints = num_points / 32;
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr = taps;
-
- __m256 a0Val, a1Val, a2Val, a3Val;
- __m256 b0Val, b1Val, b2Val, b3Val;
- __m256 c0Val, c1Val, c2Val, c3Val;
-
- __m256 dotProdVal0 = _mm256_setzero_ps();
- __m256 dotProdVal1 = _mm256_setzero_ps();
- __m256 dotProdVal2 = _mm256_setzero_ps();
- __m256 dotProdVal3 = _mm256_setzero_ps();
-
- for(;number < thirtysecondPoints; number++){
-
- a0Val = _mm256_loadu_ps(aPtr);
- a1Val = _mm256_loadu_ps(aPtr+8);
- a2Val = _mm256_loadu_ps(aPtr+16);
- a3Val = _mm256_loadu_ps(aPtr+24);
- b0Val = _mm256_loadu_ps(bPtr);
- b1Val = _mm256_loadu_ps(bPtr+8);
- b2Val = _mm256_loadu_ps(bPtr+16);
- b3Val = _mm256_loadu_ps(bPtr+24);
-
- c0Val = _mm256_mul_ps(a0Val, b0Val);
- c1Val = _mm256_mul_ps(a1Val, b1Val);
- c2Val = _mm256_mul_ps(a2Val, b2Val);
- c3Val = _mm256_mul_ps(a3Val, b3Val);
-
- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
- dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
- dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
-
- aPtr += 32;
- bPtr += 32;
- }
-
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
-
- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
-
- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
- dotProduct = dotProductVector[0];
- dotProduct += dotProductVector[1];
- dotProduct += dotProductVector[2];
- dotProduct += dotProductVector[3];
- dotProduct += dotProductVector[4];
- dotProduct += dotProductVector[5];
- dotProduct += dotProductVector[6];
- dotProduct += dotProductVector[7];
-
- number = thirtysecondPoints*32;
- for(;number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
-
- *result = (short)dotProduct;
+static inline void volk_32f_x2_dot_prod_16i_u_avx(int16_t* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int thirtysecondPoints = num_points / 32;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val, a2Val, a3Val;
+ __m256 b0Val, b1Val, b2Val, b3Val;
+ __m256 c0Val, c1Val, c2Val, c3Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 dotProdVal2 = _mm256_setzero_ps();
+ __m256 dotProdVal3 = _mm256_setzero_ps();
+
+ for (; number < thirtysecondPoints; number++) {
+
+ a0Val = _mm256_loadu_ps(aPtr);
+ a1Val = _mm256_loadu_ps(aPtr + 8);
+ a2Val = _mm256_loadu_ps(aPtr + 16);
+ a3Val = _mm256_loadu_ps(aPtr + 24);
+ b0Val = _mm256_loadu_ps(bPtr);
+ b1Val = _mm256_loadu_ps(bPtr + 8);
+ b2Val = _mm256_loadu_ps(bPtr + 16);
+ b3Val = _mm256_loadu_ps(bPtr + 24);
+
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
+ c2Val = _mm256_mul_ps(a2Val, b2Val);
+ c3Val = _mm256_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 32;
+ bPtr += 32;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ dotProduct += dotProductVector[4];
+ dotProduct += dotProductVector[5];
+ dotProduct += dotProductVector[6];
+ dotProduct += dotProductVector[7];
+
+ number = thirtysecondPoints * 32;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = (short)dotProduct;
}
#endif /*LV_HAVE_AVX*/
#ifdef LV_HAVE_AVX512F
-static inline void volk_32f_x2_dot_prod_16i_u_avx512f(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
-
- unsigned int number = 0;
- const unsigned int sixtyfourthPoints = num_points / 64;
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr = taps;
-
- __m512 a0Val, a1Val, a2Val, a3Val;
- __m512 b0Val, b1Val, b2Val, b3Val;
-
- __m512 dotProdVal0 = _mm512_setzero_ps();
- __m512 dotProdVal1 = _mm512_setzero_ps();
- __m512 dotProdVal2 = _mm512_setzero_ps();
- __m512 dotProdVal3 = _mm512_setzero_ps();
-
- for(;number < sixtyfourthPoints; number++){
-
- a0Val = _mm512_loadu_ps(aPtr);
- a1Val = _mm512_loadu_ps(aPtr+16);
- a2Val = _mm512_loadu_ps(aPtr+32);
- a3Val = _mm512_loadu_ps(aPtr+48);
- b0Val = _mm512_loadu_ps(bPtr);
- b1Val = _mm512_loadu_ps(bPtr+16);
- b2Val = _mm512_loadu_ps(bPtr+32);
- b3Val = _mm512_loadu_ps(bPtr+48);
-
- dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
- dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
- dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
- dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
-
- aPtr += 64;
- bPtr += 64;
- }
-
- dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
- dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
- dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
-
- __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
-
- _mm512_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
- dotProduct = dotProductVector[0];
- dotProduct += dotProductVector[1];
- dotProduct += dotProductVector[2];
- dotProduct += dotProductVector[3];
- dotProduct += dotProductVector[4];
- dotProduct += dotProductVector[5];
- dotProduct += dotProductVector[6];
- dotProduct += dotProductVector[7];
- dotProduct += dotProductVector[8];
- dotProduct += dotProductVector[9];
- dotProduct += dotProductVector[10];
- dotProduct += dotProductVector[11];
- dotProduct += dotProductVector[12];
- dotProduct += dotProductVector[13];
- dotProduct += dotProductVector[14];
- dotProduct += dotProductVector[15];
-
- number = sixtyfourthPoints*64;
- for(;number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
-
- *result = (short)dotProduct;
+static inline void volk_32f_x2_dot_prod_16i_u_avx512f(int16_t* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixtyfourthPoints = num_points / 64;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m512 a0Val, a1Val, a2Val, a3Val;
+ __m512 b0Val, b1Val, b2Val, b3Val;
+
+ __m512 dotProdVal0 = _mm512_setzero_ps();
+ __m512 dotProdVal1 = _mm512_setzero_ps();
+ __m512 dotProdVal2 = _mm512_setzero_ps();
+ __m512 dotProdVal3 = _mm512_setzero_ps();
+
+ for (; number < sixtyfourthPoints; number++) {
+
+ a0Val = _mm512_loadu_ps(aPtr);
+ a1Val = _mm512_loadu_ps(aPtr + 16);
+ a2Val = _mm512_loadu_ps(aPtr + 32);
+ a3Val = _mm512_loadu_ps(aPtr + 48);
+ b0Val = _mm512_loadu_ps(bPtr);
+ b1Val = _mm512_loadu_ps(bPtr + 16);
+ b2Val = _mm512_loadu_ps(bPtr + 32);
+ b3Val = _mm512_loadu_ps(bPtr + 48);
+
+ dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
+ dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
+ dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
+ dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
+
+ aPtr += 64;
+ bPtr += 64;
+ }
+
+ dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
+
+ _mm512_storeu_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ dotProduct += dotProductVector[4];
+ dotProduct += dotProductVector[5];
+ dotProduct += dotProductVector[6];
+ dotProduct += dotProductVector[7];
+ dotProduct += dotProductVector[8];
+ dotProduct += dotProductVector[9];
+ dotProduct += dotProductVector[10];
+ dotProduct += dotProductVector[11];
+ dotProduct += dotProductVector[12];
+ dotProduct += dotProductVector[13];
+ dotProduct += dotProductVector[14];
+ dotProduct += dotProductVector[15];
+
+ number = sixtyfourthPoints * 64;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = (short)dotProduct;
}
#endif /*LV_HAVE_AVX512F*/
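Like the other variants, the AVX-512 kernel reduces its vector accumulators by storing to a stack array and summing the sixteen lanes. A hedged alternative sketch, assuming the compiler ships the AVX-512F helper _mm512_reduce_add_ps (GCC, Clang and ICC do); note the lane-summation order differs, so the last few bits of the float result may change:

#include <immintrin.h>

/* Horizontal sum of all 16 float lanes in one call; could stand in for the
 * store-then-add epilogue above where the intrinsic is available. */
static inline float horizontal_sum_m512(__m512 v) { return _mm512_reduce_add_ps(v); }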
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_x2_dot_prod_32f(float* result, const float* input, const float* taps, unsigned int num_points)
- * \endcode
+ * void volk_32f_x2_dot_prod_32f(float* result, const float* input, const float* taps,
+ * unsigned int num_points) \endcode
*
* \b Inputs
* \li input: vector of floats.
* \li result: pointer to a float value to hold the dot product result.
*
* \b Example
- * Take the dot product of an increasing vector and a vector of ones. The result is the sum of integers (0,9).
- * \code
- * int N = 10;
- * unsigned int alignment = volk_get_alignment();
+ * Take the dot product of an increasing vector and a vector of ones. The result is the
+ * sum of integers (0,9). \code int N = 10; unsigned int alignment = volk_get_alignment();
* float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
* float* ones = (float*)volk_malloc(sizeof(float)*N, alignment);
* float* out = (float*)volk_malloc(sizeof(float)*1, alignment);
#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
#define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
+#include <stdio.h>
#include <volk/volk_common.h>
-#include<stdio.h>
#ifdef LV_HAVE_GENERIC
-static inline void volk_32f_x2_dot_prod_32f_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_generic(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr= taps;
- unsigned int number = 0;
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+ unsigned int number = 0;
- for(number = 0; number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
+ for (number = 0; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
- *result = dotProduct;
+ *result = dotProduct;
}
#endif /*LV_HAVE_GENERIC*/
#ifdef LV_HAVE_SSE
-static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
-
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr = taps;
+static inline void volk_32f_x2_dot_prod_32f_u_sse(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
- __m128 a0Val, a1Val, a2Val, a3Val;
- __m128 b0Val, b1Val, b2Val, b3Val;
- __m128 c0Val, c1Val, c2Val, c3Val;
-
- __m128 dotProdVal0 = _mm_setzero_ps();
- __m128 dotProdVal1 = _mm_setzero_ps();
- __m128 dotProdVal2 = _mm_setzero_ps();
- __m128 dotProdVal3 = _mm_setzero_ps();
-
- for(;number < sixteenthPoints; number++){
-
- a0Val = _mm_loadu_ps(aPtr);
- a1Val = _mm_loadu_ps(aPtr+4);
- a2Val = _mm_loadu_ps(aPtr+8);
- a3Val = _mm_loadu_ps(aPtr+12);
- b0Val = _mm_loadu_ps(bPtr);
- b1Val = _mm_loadu_ps(bPtr+4);
- b2Val = _mm_loadu_ps(bPtr+8);
- b3Val = _mm_loadu_ps(bPtr+12);
-
- c0Val = _mm_mul_ps(a0Val, b0Val);
- c1Val = _mm_mul_ps(a1Val, b1Val);
- c2Val = _mm_mul_ps(a2Val, b2Val);
- c3Val = _mm_mul_ps(a3Val, b3Val);
-
- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
- aPtr += 16;
- bPtr += 16;
- }
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ a0Val = _mm_loadu_ps(aPtr);
+ a1Val = _mm_loadu_ps(aPtr + 4);
+ a2Val = _mm_loadu_ps(aPtr + 8);
+ a3Val = _mm_loadu_ps(aPtr + 12);
+ b0Val = _mm_loadu_ps(bPtr);
+ b1Val = _mm_loadu_ps(bPtr + 4);
+ b2Val = _mm_loadu_ps(bPtr + 8);
+ b3Val = _mm_loadu_ps(bPtr + 12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+ aPtr += 16;
+ bPtr += 16;
+ }
- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
- dotProduct = dotProductVector[0];
- dotProduct += dotProductVector[1];
- dotProduct += dotProductVector[2];
- dotProduct += dotProductVector[3];
+ _mm_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
- number = sixteenthPoints*16;
- for(;number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
- *result = dotProduct;
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+ *result = dotProduct;
}
#endif /*LV_HAVE_SSE*/
#include <pmmintrin.h>
-static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr = taps;
-
- __m128 a0Val, a1Val, a2Val, a3Val;
- __m128 b0Val, b1Val, b2Val, b3Val;
- __m128 c0Val, c1Val, c2Val, c3Val;
-
- __m128 dotProdVal0 = _mm_setzero_ps();
- __m128 dotProdVal1 = _mm_setzero_ps();
- __m128 dotProdVal2 = _mm_setzero_ps();
- __m128 dotProdVal3 = _mm_setzero_ps();
-
- for(;number < sixteenthPoints; number++){
-
- a0Val = _mm_loadu_ps(aPtr);
- a1Val = _mm_loadu_ps(aPtr+4);
- a2Val = _mm_loadu_ps(aPtr+8);
- a3Val = _mm_loadu_ps(aPtr+12);
- b0Val = _mm_loadu_ps(bPtr);
- b1Val = _mm_loadu_ps(bPtr+4);
- b2Val = _mm_loadu_ps(bPtr+8);
- b3Val = _mm_loadu_ps(bPtr+12);
-
- c0Val = _mm_mul_ps(a0Val, b0Val);
- c1Val = _mm_mul_ps(a1Val, b1Val);
- c2Val = _mm_mul_ps(a2Val, b2Val);
- c3Val = _mm_mul_ps(a3Val, b3Val);
-
- dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
- dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
- dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
- dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
-
- aPtr += 16;
- bPtr += 16;
- }
-
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
-
- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
- dotProduct = dotProductVector[0];
- dotProduct += dotProductVector[1];
- dotProduct += dotProductVector[2];
- dotProduct += dotProductVector[3];
-
- number = sixteenthPoints*16;
- for(;number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
-
- *result = dotProduct;
-}
-
-#endif /*LV_HAVE_SSE3*/
+static inline void volk_32f_x2_dot_prod_32f_u_sse3(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
-#ifdef LV_HAVE_SSE4_1
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ a0Val = _mm_loadu_ps(aPtr);
+ a1Val = _mm_loadu_ps(aPtr + 4);
+ a2Val = _mm_loadu_ps(aPtr + 8);
+ a3Val = _mm_loadu_ps(aPtr + 12);
+ b0Val = _mm_loadu_ps(bPtr);
+ b1Val = _mm_loadu_ps(bPtr + 4);
+ b2Val = _mm_loadu_ps(bPtr + 8);
+ b3Val = _mm_loadu_ps(bPtr + 12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
+ dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
+ dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
+ dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
-#include <smmintrin.h>
+ aPtr += 16;
+ bPtr += 16;
+ }
-static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr = taps;
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+ _mm_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
- __m128 aVal1, bVal1, cVal1;
- __m128 aVal2, bVal2, cVal2;
- __m128 aVal3, bVal3, cVal3;
- __m128 aVal4, bVal4, cVal4;
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
- __m128 dotProdVal = _mm_setzero_ps();
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
- for(;number < sixteenthPoints; number++){
+ *result = dotProduct;
+}
- aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
- aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
- aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
- aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
+#endif /*LV_HAVE_SSE3*/
- bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
- bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
- bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
- bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
+#ifdef LV_HAVE_SSE4_1
- cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
- cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
- cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
- cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
+#include <smmintrin.h>
- cVal1 = _mm_or_ps(cVal1, cVal2);
- cVal3 = _mm_or_ps(cVal3, cVal4);
- cVal1 = _mm_or_ps(cVal1, cVal3);
+static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
- dotProdVal = _mm_add_ps(dotProdVal, cVal1);
- }
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 aVal1, bVal1, cVal1;
+ __m128 aVal2, bVal2, cVal2;
+ __m128 aVal3, bVal3, cVal3;
+ __m128 aVal4, bVal4, cVal4;
+
+ __m128 dotProdVal = _mm_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ aVal1 = _mm_loadu_ps(aPtr);
+ aPtr += 4;
+ aVal2 = _mm_loadu_ps(aPtr);
+ aPtr += 4;
+ aVal3 = _mm_loadu_ps(aPtr);
+ aPtr += 4;
+ aVal4 = _mm_loadu_ps(aPtr);
+ aPtr += 4;
+
+ bVal1 = _mm_loadu_ps(bPtr);
+ bPtr += 4;
+ bVal2 = _mm_loadu_ps(bPtr);
+ bPtr += 4;
+ bVal3 = _mm_loadu_ps(bPtr);
+ bPtr += 4;
+ bVal4 = _mm_loadu_ps(bPtr);
+ bPtr += 4;
+
+ cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
+ cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
+ cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
+ cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
+
+ cVal1 = _mm_or_ps(cVal1, cVal2);
+ cVal3 = _mm_or_ps(cVal3, cVal4);
+ cVal1 = _mm_or_ps(cVal1, cVal3);
+
+ dotProdVal = _mm_add_ps(dotProdVal, cVal1);
+ }
- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
- _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+ _mm_store_ps(dotProductVector,
+ dotProdVal); // Store the results back into the dot product vector
- dotProduct = dotProductVector[0];
- dotProduct += dotProductVector[1];
- dotProduct += dotProductVector[2];
- dotProduct += dotProductVector[3];
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
- number = sixteenthPoints * 16;
- for(;number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
- *result = dotProduct;
+ *result = dotProduct;
}
#endif /*LV_HAVE_SSE4_1*/
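The SSE4.1 variant leans on the immediate mask of _mm_dp_ps: the high nibble selects which input lanes enter the multiply-and-sum, and the low nibble selects which output lane receives the sum (the other lanes are zeroed). With masks 0xF1, 0xF2, 0xF4 and 0xF8 the four partial dot products land in four distinct lanes, so plain bitwise ORs merge them into one register. A small sketch of that idea (helper name invented for illustration):

#include <smmintrin.h>

/* Each _mm_dp_ps sums all four products of its inputs and writes the sum into
 * a different lane; OR-ing the results packs the four partial sums together. */
static inline __m128 four_partial_dots(__m128 a1, __m128 b1,
                                       __m128 a2, __m128 b2,
                                       __m128 a3, __m128 b3,
                                       __m128 a4, __m128 b4)
{
    __m128 d1 = _mm_dp_ps(a1, b1, 0xF1); /* sum -> lane 0 */
    __m128 d2 = _mm_dp_ps(a2, b2, 0xF2); /* sum -> lane 1 */
    __m128 d3 = _mm_dp_ps(a3, b3, 0xF4); /* sum -> lane 2 */
    __m128 d4 = _mm_dp_ps(a4, b4, 0xF8); /* sum -> lane 3 */
    return _mm_or_ps(_mm_or_ps(d1, d2), _mm_or_ps(d3, d4));
}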
#include <immintrin.h>
-static inline void volk_32f_x2_dot_prod_32f_u_avx( float* result, const float* input, const float* taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_u_avx(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr = taps;
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
- __m256 a0Val, a1Val;
- __m256 b0Val, b1Val;
- __m256 c0Val, c1Val;
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
- __m256 dotProdVal0 = _mm256_setzero_ps();
- __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 a0Val, a1Val;
+ __m256 b0Val, b1Val;
+ __m256 c0Val, c1Val;
- for(;number < sixteenthPoints; number++){
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
- a0Val = _mm256_loadu_ps(aPtr);
- a1Val = _mm256_loadu_ps(aPtr+8);
- b0Val = _mm256_loadu_ps(bPtr);
- b1Val = _mm256_loadu_ps(bPtr+8);
+ for (; number < sixteenthPoints; number++) {
- c0Val = _mm256_mul_ps(a0Val, b0Val);
- c1Val = _mm256_mul_ps(a1Val, b1Val);
+ a0Val = _mm256_loadu_ps(aPtr);
+ a1Val = _mm256_loadu_ps(aPtr + 8);
+ b0Val = _mm256_loadu_ps(bPtr);
+ b1Val = _mm256_loadu_ps(bPtr + 8);
- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
- aPtr += 16;
- bPtr += 16;
- }
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ aPtr += 16;
+ bPtr += 16;
+ }
- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
- _mm256_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
- dotProduct = dotProductVector[0];
- dotProduct += dotProductVector[1];
- dotProduct += dotProductVector[2];
- dotProduct += dotProductVector[3];
- dotProduct += dotProductVector[4];
- dotProduct += dotProductVector[5];
- dotProduct += dotProductVector[6];
- dotProduct += dotProductVector[7];
+ _mm256_storeu_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
- number = sixteenthPoints*16;
- for(;number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ dotProduct += dotProductVector[4];
+ dotProduct += dotProductVector[5];
+ dotProduct += dotProductVector[6];
+ dotProduct += dotProductVector[7];
- *result = dotProduct;
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+ *result = dotProduct;
}
#endif /*LV_HAVE_AVX*/
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
-static inline void volk_32f_x2_dot_prod_32f_u_avx2_fma(float * result, const float * input, const float* taps, unsigned int num_points){
- unsigned int number;
- const unsigned int eighthPoints = num_points / 8;
+static inline void volk_32f_x2_dot_prod_32f_u_avx2_fma(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+ unsigned int number;
+ const unsigned int eighthPoints = num_points / 8;
- const float* aPtr = input;
- const float* bPtr = taps;
-
- __m256 dotProdVal = _mm256_setzero_ps();
- __m256 aVal1, bVal1;
+ const float* aPtr = input;
+ const float* bPtr = taps;
- for (number = 0; number < eighthPoints; number++ ) {
+ __m256 dotProdVal = _mm256_setzero_ps();
+ __m256 aVal1, bVal1;
- aVal1 = _mm256_loadu_ps(aPtr);
- bVal1 = _mm256_loadu_ps(bPtr);
- aPtr += 8;
- bPtr += 8;
+ for (number = 0; number < eighthPoints; number++) {
- dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
- }
+ aVal1 = _mm256_loadu_ps(aPtr);
+ bVal1 = _mm256_loadu_ps(bPtr);
+ aPtr += 8;
+ bPtr += 8;
- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
- _mm256_storeu_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
- _mm256_zeroupper();
+ dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
+ }
- float dotProduct =
- dotProductVector[0] + dotProductVector[1] +
- dotProductVector[2] + dotProductVector[3] +
- dotProductVector[4] + dotProductVector[5] +
- dotProductVector[6] + dotProductVector[7];
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+ _mm256_storeu_ps(dotProductVector,
+ dotProdVal); // Store the results back into the dot product vector
+ _mm256_zeroupper();
- for(number = eighthPoints * 8; number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
+ float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
+ dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
+ dotProductVector[6] + dotProductVector[7];
- *result = dotProduct;
+ for (number = eighthPoints * 8; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+ *result = dotProduct;
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
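The FMA variant folds each multiply and add into _mm256_fmadd_ps, and calls _mm256_zeroupper() before falling back to scalar code, presumably to sidestep AVX-to-SSE transition stalls on older microarchitectures. A scalar sketch of the same accumulation, using the C99 fused multiply-add (illustrative only; not part of the kernel sources):

#include <math.h>

/* fmaf(a, b, acc) computes a*b + acc with a single rounding, which is what
 * _mm256_fmadd_ps does independently in each of its eight lanes. */
static inline float dot_prod_fma_scalar(const float* a, const float* b, unsigned int n)
{
    float acc = 0.0f;
    for (unsigned int i = 0; i < n; i++) {
        acc = fmaf(a[i], b[i], acc);
    }
    return acc;
}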
#if LV_HAVE_AVX512F
#include <immintrin.h>
-static inline void volk_32f_x2_dot_prod_32f_u_avx512f(float * result, const float * input, const float* taps, unsigned int num_points){
- unsigned int number;
- const unsigned int sixteenthPoints = num_points / 16;
-
- const float* aPtr = input;
- const float* bPtr = taps;
+static inline void volk_32f_x2_dot_prod_32f_u_avx512f(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+ unsigned int number;
+ const unsigned int sixteenthPoints = num_points / 16;
- __m512 dotProdVal = _mm512_setzero_ps();
- __m512 aVal1, bVal1;
+ const float* aPtr = input;
+ const float* bPtr = taps;
- for (number = 0; number < sixteenthPoints; number++ ) {
+ __m512 dotProdVal = _mm512_setzero_ps();
+ __m512 aVal1, bVal1;
- aVal1 = _mm512_loadu_ps(aPtr);
- bVal1 = _mm512_loadu_ps(bPtr);
- aPtr += 16;
- bPtr += 16;
+ for (number = 0; number < sixteenthPoints; number++) {
- dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
- }
+ aVal1 = _mm512_loadu_ps(aPtr);
+ bVal1 = _mm512_loadu_ps(bPtr);
+ aPtr += 16;
+ bPtr += 16;
- __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
- _mm512_storeu_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
+ dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
+ }
- float dotProduct =
- dotProductVector[0] + dotProductVector[1] +
- dotProductVector[2] + dotProductVector[3] +
- dotProductVector[4] + dotProductVector[5] +
- dotProductVector[6] + dotProductVector[7] +
- dotProductVector[8] + dotProductVector[9] +
- dotProductVector[10] + dotProductVector[11] +
- dotProductVector[12] + dotProductVector[13] +
- dotProductVector[14] + dotProductVector[15];
+ __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
+ _mm512_storeu_ps(dotProductVector,
+ dotProdVal); // Store the results back into the dot product vector
- for(number = sixteenthPoints * 16; number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
+ float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
+ dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
+ dotProductVector[6] + dotProductVector[7] + dotProductVector[8] +
+ dotProductVector[9] + dotProductVector[10] + dotProductVector[11] +
+ dotProductVector[12] + dotProductVector[13] +
+ dotProductVector[14] + dotProductVector[15];
- *result = dotProduct;
+ for (number = sixteenthPoints * 16; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+ *result = dotProduct;
}
#endif /* LV_HAVE_AVX512F */
#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
#define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
+#include <stdio.h>
#include <volk/volk_common.h>
-#include<stdio.h>
#ifdef LV_HAVE_GENERIC
-static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_a_generic(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr= taps;
- unsigned int number = 0;
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+ unsigned int number = 0;
- for(number = 0; number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
+ for (number = 0; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
- *result = dotProduct;
+ *result = dotProduct;
}
#endif /*LV_HAVE_GENERIC*/
#ifdef LV_HAVE_SSE
-static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
-
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr = taps;
-
- __m128 a0Val, a1Val, a2Val, a3Val;
- __m128 b0Val, b1Val, b2Val, b3Val;
- __m128 c0Val, c1Val, c2Val, c3Val;
-
- __m128 dotProdVal0 = _mm_setzero_ps();
- __m128 dotProdVal1 = _mm_setzero_ps();
- __m128 dotProdVal2 = _mm_setzero_ps();
- __m128 dotProdVal3 = _mm_setzero_ps();
+static inline void volk_32f_x2_dot_prod_32f_a_sse(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
- for(;number < sixteenthPoints; number++){
-
- a0Val = _mm_load_ps(aPtr);
- a1Val = _mm_load_ps(aPtr+4);
- a2Val = _mm_load_ps(aPtr+8);
- a3Val = _mm_load_ps(aPtr+12);
- b0Val = _mm_load_ps(bPtr);
- b1Val = _mm_load_ps(bPtr+4);
- b2Val = _mm_load_ps(bPtr+8);
- b3Val = _mm_load_ps(bPtr+12);
-
- c0Val = _mm_mul_ps(a0Val, b0Val);
- c1Val = _mm_mul_ps(a1Val, b1Val);
- c2Val = _mm_mul_ps(a2Val, b2Val);
- c3Val = _mm_mul_ps(a3Val, b3Val);
-
- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
- aPtr += 16;
- bPtr += 16;
- }
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ a0Val = _mm_load_ps(aPtr);
+ a1Val = _mm_load_ps(aPtr + 4);
+ a2Val = _mm_load_ps(aPtr + 8);
+ a3Val = _mm_load_ps(aPtr + 12);
+ b0Val = _mm_load_ps(bPtr);
+ b1Val = _mm_load_ps(bPtr + 4);
+ b2Val = _mm_load_ps(bPtr + 8);
+ b3Val = _mm_load_ps(bPtr + 12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+ aPtr += 16;
+ bPtr += 16;
+ }
- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
- dotProduct = dotProductVector[0];
- dotProduct += dotProductVector[1];
- dotProduct += dotProductVector[2];
- dotProduct += dotProductVector[3];
+ _mm_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
- number = sixteenthPoints*16;
- for(;number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
- *result = dotProduct;
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+ *result = dotProduct;
}
#endif /*LV_HAVE_SSE*/
#include <pmmintrin.h>
-static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr = taps;
-
- __m128 a0Val, a1Val, a2Val, a3Val;
- __m128 b0Val, b1Val, b2Val, b3Val;
- __m128 c0Val, c1Val, c2Val, c3Val;
-
- __m128 dotProdVal0 = _mm_setzero_ps();
- __m128 dotProdVal1 = _mm_setzero_ps();
- __m128 dotProdVal2 = _mm_setzero_ps();
- __m128 dotProdVal3 = _mm_setzero_ps();
-
- for(;number < sixteenthPoints; number++){
-
- a0Val = _mm_load_ps(aPtr);
- a1Val = _mm_load_ps(aPtr+4);
- a2Val = _mm_load_ps(aPtr+8);
- a3Val = _mm_load_ps(aPtr+12);
- b0Val = _mm_load_ps(bPtr);
- b1Val = _mm_load_ps(bPtr+4);
- b2Val = _mm_load_ps(bPtr+8);
- b3Val = _mm_load_ps(bPtr+12);
-
- c0Val = _mm_mul_ps(a0Val, b0Val);
- c1Val = _mm_mul_ps(a1Val, b1Val);
- c2Val = _mm_mul_ps(a2Val, b2Val);
- c3Val = _mm_mul_ps(a3Val, b3Val);
-
- dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
- dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
- dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
- dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
-
- aPtr += 16;
- bPtr += 16;
- }
-
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
-
- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
- dotProduct = dotProductVector[0];
- dotProduct += dotProductVector[1];
- dotProduct += dotProductVector[2];
- dotProduct += dotProductVector[3];
-
- number = sixteenthPoints*16;
- for(;number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
-
- *result = dotProduct;
-}
-
-#endif /*LV_HAVE_SSE3*/
+static inline void volk_32f_x2_dot_prod_32f_a_sse3(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
-#ifdef LV_HAVE_SSE4_1
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ a0Val = _mm_load_ps(aPtr);
+ a1Val = _mm_load_ps(aPtr + 4);
+ a2Val = _mm_load_ps(aPtr + 8);
+ a3Val = _mm_load_ps(aPtr + 12);
+ b0Val = _mm_load_ps(bPtr);
+ b1Val = _mm_load_ps(bPtr + 4);
+ b2Val = _mm_load_ps(bPtr + 8);
+ b3Val = _mm_load_ps(bPtr + 12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
+ dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
+ dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
+ dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
-#include <smmintrin.h>
+ aPtr += 16;
+ bPtr += 16;
+ }
-static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr = taps;
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+ _mm_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
- __m128 aVal1, bVal1, cVal1;
- __m128 aVal2, bVal2, cVal2;
- __m128 aVal3, bVal3, cVal3;
- __m128 aVal4, bVal4, cVal4;
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
- __m128 dotProdVal = _mm_setzero_ps();
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
- for(;number < sixteenthPoints; number++){
+ *result = dotProduct;
+}
- aVal1 = _mm_load_ps(aPtr); aPtr += 4;
- aVal2 = _mm_load_ps(aPtr); aPtr += 4;
- aVal3 = _mm_load_ps(aPtr); aPtr += 4;
- aVal4 = _mm_load_ps(aPtr); aPtr += 4;
+#endif /*LV_HAVE_SSE3*/
- bVal1 = _mm_load_ps(bPtr); bPtr += 4;
- bVal2 = _mm_load_ps(bPtr); bPtr += 4;
- bVal3 = _mm_load_ps(bPtr); bPtr += 4;
- bVal4 = _mm_load_ps(bPtr); bPtr += 4;
+#ifdef LV_HAVE_SSE4_1
- cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
- cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
- cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
- cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
+#include <smmintrin.h>
- cVal1 = _mm_or_ps(cVal1, cVal2);
- cVal3 = _mm_or_ps(cVal3, cVal4);
- cVal1 = _mm_or_ps(cVal1, cVal3);
+static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
- dotProdVal = _mm_add_ps(dotProdVal, cVal1);
- }
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 aVal1, bVal1, cVal1;
+ __m128 aVal2, bVal2, cVal2;
+ __m128 aVal3, bVal3, cVal3;
+ __m128 aVal4, bVal4, cVal4;
+
+ __m128 dotProdVal = _mm_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ aVal1 = _mm_load_ps(aPtr);
+ aPtr += 4;
+ aVal2 = _mm_load_ps(aPtr);
+ aPtr += 4;
+ aVal3 = _mm_load_ps(aPtr);
+ aPtr += 4;
+ aVal4 = _mm_load_ps(aPtr);
+ aPtr += 4;
+
+ bVal1 = _mm_load_ps(bPtr);
+ bPtr += 4;
+ bVal2 = _mm_load_ps(bPtr);
+ bPtr += 4;
+ bVal3 = _mm_load_ps(bPtr);
+ bPtr += 4;
+ bVal4 = _mm_load_ps(bPtr);
+ bPtr += 4;
+
+ cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
+ cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
+ cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
+ cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
+
+ cVal1 = _mm_or_ps(cVal1, cVal2);
+ cVal3 = _mm_or_ps(cVal3, cVal4);
+ cVal1 = _mm_or_ps(cVal1, cVal3);
+
+ dotProdVal = _mm_add_ps(dotProdVal, cVal1);
+ }
- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
- _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+ _mm_store_ps(dotProductVector,
+ dotProdVal); // Store the results back into the dot product vector
- dotProduct = dotProductVector[0];
- dotProduct += dotProductVector[1];
- dotProduct += dotProductVector[2];
- dotProduct += dotProductVector[3];
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
- number = sixteenthPoints * 16;
- for(;number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
- *result = dotProduct;
+ *result = dotProduct;
}
#endif /*LV_HAVE_SSE4_1*/
#include <immintrin.h>
-static inline void volk_32f_x2_dot_prod_32f_a_avx( float* result, const float* input, const float* taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_a_avx(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr = taps;
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
- __m256 a0Val, a1Val;
- __m256 b0Val, b1Val;
- __m256 c0Val, c1Val;
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
- __m256 dotProdVal0 = _mm256_setzero_ps();
- __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 a0Val, a1Val;
+ __m256 b0Val, b1Val;
+ __m256 c0Val, c1Val;
- for(;number < sixteenthPoints; number++){
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
- a0Val = _mm256_load_ps(aPtr);
- a1Val = _mm256_load_ps(aPtr+8);
- b0Val = _mm256_load_ps(bPtr);
- b1Val = _mm256_load_ps(bPtr+8);
+ for (; number < sixteenthPoints; number++) {
- c0Val = _mm256_mul_ps(a0Val, b0Val);
- c1Val = _mm256_mul_ps(a1Val, b1Val);
+ a0Val = _mm256_load_ps(aPtr);
+ a1Val = _mm256_load_ps(aPtr + 8);
+ b0Val = _mm256_load_ps(bPtr);
+ b1Val = _mm256_load_ps(bPtr + 8);
- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
- aPtr += 16;
- bPtr += 16;
- }
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ aPtr += 16;
+ bPtr += 16;
+ }
- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
- dotProduct = dotProductVector[0];
- dotProduct += dotProductVector[1];
- dotProduct += dotProductVector[2];
- dotProduct += dotProductVector[3];
- dotProduct += dotProductVector[4];
- dotProduct += dotProductVector[5];
- dotProduct += dotProductVector[6];
- dotProduct += dotProductVector[7];
+ _mm256_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
- number = sixteenthPoints*16;
- for(;number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ dotProduct += dotProductVector[4];
+ dotProduct += dotProductVector[5];
+ dotProduct += dotProductVector[6];
+ dotProduct += dotProductVector[7];
- *result = dotProduct;
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+ *result = dotProduct;
}
#endif /*LV_HAVE_AVX*/
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
-static inline void volk_32f_x2_dot_prod_32f_a_avx2_fma(float * result, const float * input, const float* taps, unsigned int num_points){
- unsigned int number;
- const unsigned int eighthPoints = num_points / 8;
-
- const float* aPtr = input;
- const float* bPtr = taps;
+static inline void volk_32f_x2_dot_prod_32f_a_avx2_fma(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+ unsigned int number;
+ const unsigned int eighthPoints = num_points / 8;
- __m256 dotProdVal = _mm256_setzero_ps();
- __m256 aVal1, bVal1;
+ const float* aPtr = input;
+ const float* bPtr = taps;
- for (number = 0; number < eighthPoints; number++ ) {
+ __m256 dotProdVal = _mm256_setzero_ps();
+ __m256 aVal1, bVal1;
- aVal1 = _mm256_load_ps(aPtr);
- bVal1 = _mm256_load_ps(bPtr);
- aPtr += 8;
- bPtr += 8;
+ for (number = 0; number < eighthPoints; number++) {
- dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
- }
+ aVal1 = _mm256_load_ps(aPtr);
+ bVal1 = _mm256_load_ps(bPtr);
+ aPtr += 8;
+ bPtr += 8;
- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
- _mm256_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
- _mm256_zeroupper();
+ dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
+ }
- float dotProduct =
- dotProductVector[0] + dotProductVector[1] +
- dotProductVector[2] + dotProductVector[3] +
- dotProductVector[4] + dotProductVector[5] +
- dotProductVector[6] + dotProductVector[7];
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+ _mm256_store_ps(dotProductVector,
+ dotProdVal); // Store the results back into the dot product vector
+ _mm256_zeroupper();
- for(number = eighthPoints * 8; number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
+ float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
+ dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
+ dotProductVector[6] + dotProductVector[7];
- *result = dotProduct;
+ for (number = eighthPoints * 8; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+ *result = dotProduct;
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
#if LV_HAVE_AVX512F
#include <immintrin.h>
-static inline void volk_32f_x2_dot_prod_32f_a_avx512f(float * result, const float * input, const float* taps, unsigned int num_points){
- unsigned int number;
- const unsigned int sixteenthPoints = num_points / 16;
-
- const float* aPtr = input;
- const float* bPtr = taps;
+static inline void volk_32f_x2_dot_prod_32f_a_avx512f(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+ unsigned int number;
+ const unsigned int sixteenthPoints = num_points / 16;
- __m512 dotProdVal = _mm512_setzero_ps();
- __m512 aVal1, bVal1;
+ const float* aPtr = input;
+ const float* bPtr = taps;
- for (number = 0; number < sixteenthPoints; number++ ) {
+ __m512 dotProdVal = _mm512_setzero_ps();
+ __m512 aVal1, bVal1;
- aVal1 = _mm512_load_ps(aPtr);
- bVal1 = _mm512_load_ps(bPtr);
- aPtr += 16;
- bPtr += 16;
+ for (number = 0; number < sixteenthPoints; number++) {
- dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
- }
+ aVal1 = _mm512_load_ps(aPtr);
+ bVal1 = _mm512_load_ps(bPtr);
+ aPtr += 16;
+ bPtr += 16;
- __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
- _mm512_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
+ dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
+ }
- float dotProduct =
- dotProductVector[0] + dotProductVector[1] +
- dotProductVector[2] + dotProductVector[3] +
- dotProductVector[4] + dotProductVector[5] +
- dotProductVector[6] + dotProductVector[7] +
- dotProductVector[8] + dotProductVector[9] +
- dotProductVector[10] + dotProductVector[11] +
- dotProductVector[12] + dotProductVector[13] +
- dotProductVector[14] + dotProductVector[15];
+ __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
+ _mm512_store_ps(dotProductVector,
+ dotProdVal); // Store the results back into the dot product vector
- for(number = sixteenthPoints * 16; number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
+ float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
+ dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
+ dotProductVector[6] + dotProductVector[7] + dotProductVector[8] +
+ dotProductVector[9] + dotProductVector[10] + dotProductVector[11] +
+ dotProductVector[12] + dotProductVector[13] +
+ dotProductVector[14] + dotProductVector[15];
- *result = dotProduct;
+ for (number = sixteenthPoints * 16; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+ *result = dotProduct;
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void volk_32f_x2_dot_prod_32f_neonopts(float * result, const float * input, const float * taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_neonopts(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
unsigned int quarter_points = num_points / 16;
float dotProduct = 0;
const float* aPtr = input;
- const float* bPtr= taps;
+ const float* bPtr = taps;
unsigned int number = 0;
float32x4x4_t a_val, b_val, accumulator0;
accumulator0.val[3] = vdupq_n_f32(0);
// factor of 4 loop unroll with independent accumulators
// uses 12 out of 16 neon q registers
- for( number = 0; number < quarter_points; ++number) {
+ for (number = 0; number < quarter_points; ++number) {
a_val = vld4q_f32(aPtr);
b_val = vld4q_f32(bPtr);
accumulator0.val[0] = vmlaq_f32(accumulator0.val[0], a_val.val[0], b_val.val[0]);
vst1q_f32(accumulator, accumulator0.val[0]);
dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
- for(number = quarter_points*16; number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
+ for (number = quarter_points * 16; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
}
*result = dotProduct;
#endif
-
-
#ifdef LV_HAVE_NEON
-static inline void volk_32f_x2_dot_prod_32f_neon(float * result, const float * input, const float * taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_neon(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
unsigned int quarter_points = num_points / 8;
float dotProduct = 0;
const float* aPtr = input;
- const float* bPtr= taps;
+ const float* bPtr = taps;
unsigned int number = 0;
float32x4x2_t a_val, b_val, accumulator_val;
accumulator_val.val[0] = vdupq_n_f32(0);
accumulator_val.val[1] = vdupq_n_f32(0);
// factor of 2 loop unroll with independent accumulators
- for( number = 0; number < quarter_points; ++number) {
+ for (number = 0; number < quarter_points; ++number) {
a_val = vld2q_f32(aPtr);
b_val = vld2q_f32(bPtr);
- accumulator_val.val[0] = vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]);
- accumulator_val.val[1] = vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]);
+ accumulator_val.val[0] =
+ vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]);
+ accumulator_val.val[1] =
+ vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]);
aPtr += 8;
bPtr += 8;
}
vst1q_f32(accumulator, accumulator_val.val[0]);
dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
- for(number = quarter_points*8; number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
+ for (number = quarter_points * 8; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
}
*result = dotProduct;
#endif /* LV_HAVE_NEON */
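The NEON kernels use the structure loads vld4q_f32 and vld2q_f32, which de-interleave their input across several q registers per instruction. Because a dot product sums every element-wise product, the lane shuffling has no effect on the result; the structure loads are simply a compact way to feed the unrolled accumulators. A plain, non-unrolled sketch for comparison (illustrative only, helper name invented):

#include <arm_neon.h>

static inline float dot_prod_neon_simple(const float* a, const float* b, unsigned int n)
{
    float32x4_t acc = vdupq_n_f32(0.0f);
    unsigned int quarters = n / 4;
    for (unsigned int i = 0; i < quarters; i++) {
        /* vmlaq_f32(acc, x, y) == acc + x * y, lane by lane */
        acc = vmlaq_f32(acc, vld1q_f32(a + 4 * i), vld1q_f32(b + 4 * i));
    }
    float lanes[4];
    vst1q_f32(lanes, acc);
    float dot = lanes[0] + lanes[1] + lanes[2] + lanes[3];
    for (unsigned int i = quarters * 4; i < n; i++) {
        dot += a[i] * b[i]; /* scalar tail, same as the kernels above */
    }
    return dot;
}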
#ifdef LV_HAVE_NEONV7
-extern void volk_32f_x2_dot_prod_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+extern void volk_32f_x2_dot_prod_32f_a_neonasm(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points);
#endif /* LV_HAVE_NEONV7 */
#ifdef LV_HAVE_NEONV7
-extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points);
#endif /* LV_HAVE_NEONV7 */
#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/
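Application code normally does not call the _a_/_u_ variants above directly; the dispatcher picks the best implementation at runtime, and volk_malloc guarantees the alignment that the _a_ kernels assume. A usage sketch along the lines of the Doxygen example earlier in this header (assumes VOLK is installed and <volk/volk.h> is on the include path):

#include <volk/volk.h>

void dot_prod_example(unsigned int N)
{
    size_t alignment = volk_get_alignment();
    float* a = (float*)volk_malloc(sizeof(float) * N, alignment);
    float* b = (float*)volk_malloc(sizeof(float) * N, alignment);
    float result = 0.0f;

    /* ... fill a and b ... */

    volk_32f_x2_dot_prod_32f(&result, a, b, N); /* dispatcher selects a kernel */

    volk_free(a);
    volk_free(b);
}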
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void volk_32f_x2_fm_detectpuppet_32f_a_avx(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points)
+static inline void volk_32f_x2_fm_detectpuppet_32f_a_avx(float* outputVector,
+ const float* inputVector,
+ float* saveValue,
+ unsigned int num_points)
{
- const float bound = 1.0f;
+ const float bound = 1.0f;
- volk_32f_s32f_32f_fm_detect_32f_a_avx(outputVector, inputVector, bound, saveValue, num_points);
+ volk_32f_s32f_32f_fm_detect_32f_a_avx(
+ outputVector, inputVector, bound, saveValue, num_points);
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void volk_32f_x2_fm_detectpuppet_32f_a_sse(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points)
+static inline void volk_32f_x2_fm_detectpuppet_32f_a_sse(float* outputVector,
+ const float* inputVector,
+ float* saveValue,
+ unsigned int num_points)
{
- const float bound = 1.0f;
+ const float bound = 1.0f;
- volk_32f_s32f_32f_fm_detect_32f_a_sse(outputVector, inputVector, bound, saveValue, num_points);
+ volk_32f_s32f_32f_fm_detect_32f_a_sse(
+ outputVector, inputVector, bound, saveValue, num_points);
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void volk_32f_x2_fm_detectpuppet_32f_generic(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points)
+static inline void volk_32f_x2_fm_detectpuppet_32f_generic(float* outputVector,
+ const float* inputVector,
+ float* saveValue,
+ unsigned int num_points)
{
- const float bound = 1.0f;
+ const float bound = 1.0f;
- volk_32f_s32f_32f_fm_detect_32f_generic(outputVector, inputVector, bound, saveValue, num_points);
+ volk_32f_s32f_32f_fm_detect_32f_generic(
+ outputVector, inputVector, bound, saveValue, num_points);
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void volk_32f_x2_fm_detectpuppet_32f_u_avx(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points)
+static inline void volk_32f_x2_fm_detectpuppet_32f_u_avx(float* outputVector,
+ const float* inputVector,
+ float* saveValue,
+ unsigned int num_points)
{
- const float bound = 1.0f;
+ const float bound = 1.0f;
- volk_32f_s32f_32f_fm_detect_32f_u_avx(outputVector, inputVector, bound, saveValue, num_points);
+ volk_32f_s32f_32f_fm_detect_32f_u_avx(
+ outputVector, inputVector, bound, saveValue, num_points);
}
#endif /* LV_HAVE_AVX */
#endif /* INCLUDED_volk_32f_x2_fm_detectpuppet_32f_u_H */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_x2_interleave_32fc(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points)
- * \endcode
+ * void volk_32f_x2_interleave_32fc(lv_32fc_t* complexVector, const float* iBuffer, const
+ * float* qBuffer, unsigned int num_points) \endcode
*
* \b Inputs
* \li iBuffer: Input vector of samples for the real part.
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32f_x2_interleave_32fc_a_avx(lv_32fc_t* complexVector, const float* iBuffer,
- const float* qBuffer, unsigned int num_points)
+static inline void volk_32f_x2_interleave_32fc_a_avx(lv_32fc_t* complexVector,
+ const float* iBuffer,
+ const float* qBuffer,
+ unsigned int num_points)
{
- unsigned int number = 0;
- float* complexVectorPtr = (float*)complexVector;
- const float* iBufferPtr = iBuffer;
- const float* qBufferPtr = qBuffer;
-
- const uint64_t eighthPoints = num_points / 8;
-
- __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue;
- for(;number < eighthPoints; number++){
- iValue = _mm256_load_ps(iBufferPtr);
- qValue = _mm256_load_ps(qBufferPtr);
-
- // Interleaves the lower two values in the i and q variables into one buffer
- cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
- // Interleaves the upper two values in the i and q variables into one buffer
- cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
-
- cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
- _mm256_store_ps(complexVectorPtr, cplxValue);
- complexVectorPtr += 8;
-
- cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
- _mm256_store_ps(complexVectorPtr, cplxValue);
- complexVectorPtr += 8;
-
- iBufferPtr += 8;
- qBufferPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- *complexVectorPtr++ = *iBufferPtr++;
- *complexVectorPtr++ = *qBufferPtr++;
- }
+ unsigned int number = 0;
+ float* complexVectorPtr = (float*)complexVector;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+
+ const uint64_t eighthPoints = num_points / 8;
+
+ __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue;
+ for (; number < eighthPoints; number++) {
+ iValue = _mm256_load_ps(iBufferPtr);
+ qValue = _mm256_load_ps(qBufferPtr);
+
+ // Interleaves the lower two values in the i and q variables into one buffer
+ cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
+ // Interleaves the upper two values in the i and q variables into one buffer
+ cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
+
+ cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+ _mm256_store_ps(complexVectorPtr, cplxValue);
+ complexVectorPtr += 8;
+
+ cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+ _mm256_store_ps(complexVectorPtr, cplxValue);
+ complexVectorPtr += 8;
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *complexVectorPtr++ = *iBufferPtr++;
+ *complexVectorPtr++ = *qBufferPtr++;
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector, const float* iBuffer,
- const float* qBuffer, unsigned int num_points)
+static inline void volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector,
+ const float* iBuffer,
+ const float* qBuffer,
+ unsigned int num_points)
{
- unsigned int number = 0;
- float* complexVectorPtr = (float*)complexVector;
- const float* iBufferPtr = iBuffer;
- const float* qBufferPtr = qBuffer;
-
- const uint64_t quarterPoints = num_points / 4;
-
- __m128 iValue, qValue, cplxValue;
- for(;number < quarterPoints; number++){
- iValue = _mm_load_ps(iBufferPtr);
- qValue = _mm_load_ps(qBufferPtr);
-
- // Interleaves the lower two values in the i and q variables into one buffer
- cplxValue = _mm_unpacklo_ps(iValue, qValue);
- _mm_store_ps(complexVectorPtr, cplxValue);
- complexVectorPtr += 4;
-
- // Interleaves the upper two values in the i and q variables into one buffer
- cplxValue = _mm_unpackhi_ps(iValue, qValue);
- _mm_store_ps(complexVectorPtr, cplxValue);
- complexVectorPtr += 4;
-
- iBufferPtr += 4;
- qBufferPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- *complexVectorPtr++ = *iBufferPtr++;
- *complexVectorPtr++ = *qBufferPtr++;
- }
+ unsigned int number = 0;
+ float* complexVectorPtr = (float*)complexVector;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+
+ const uint64_t quarterPoints = num_points / 4;
+
+ __m128 iValue, qValue, cplxValue;
+ for (; number < quarterPoints; number++) {
+ iValue = _mm_load_ps(iBufferPtr);
+ qValue = _mm_load_ps(qBufferPtr);
+
+ // Interleaves the lower two values in the i and q variables into one buffer
+ cplxValue = _mm_unpacklo_ps(iValue, qValue);
+ _mm_store_ps(complexVectorPtr, cplxValue);
+ complexVectorPtr += 4;
+
+ // Interleaves the upper two values in the i and q variables into one buffer
+ cplxValue = _mm_unpackhi_ps(iValue, qValue);
+ _mm_store_ps(complexVectorPtr, cplxValue);
+ complexVectorPtr += 4;
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *complexVectorPtr++ = *iBufferPtr++;
+ *complexVectorPtr++ = *qBufferPtr++;
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_32f_x2_interleave_32fc_neon(lv_32fc_t* complexVector, const float* iBuffer,
- const float* qBuffer, unsigned int num_points)
+static inline void volk_32f_x2_interleave_32fc_neon(lv_32fc_t* complexVector,
+ const float* iBuffer,
+ const float* qBuffer,
+ unsigned int num_points)
{
- unsigned int quarter_points = num_points / 4;
- unsigned int number;
- float* complexVectorPtr = (float*) complexVector;
-
- float32x4x2_t complex_vec;
- for(number=0; number < quarter_points; ++number) {
- complex_vec.val[0] = vld1q_f32(iBuffer);
- complex_vec.val[1] = vld1q_f32(qBuffer);
- vst2q_f32(complexVectorPtr, complex_vec);
- iBuffer += 4;
- qBuffer += 4;
- complexVectorPtr += 8;
- }
-
- for(number=quarter_points * 4; number < num_points; ++number) {
- *complexVectorPtr++ = *iBuffer++;
- *complexVectorPtr++ = *qBuffer++;
- }
+ unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+ float* complexVectorPtr = (float*)complexVector;
+
+ float32x4x2_t complex_vec;
+ for (number = 0; number < quarter_points; ++number) {
+ complex_vec.val[0] = vld1q_f32(iBuffer);
+ complex_vec.val[1] = vld1q_f32(qBuffer);
+ vst2q_f32(complexVectorPtr, complex_vec);
+ iBuffer += 4;
+ qBuffer += 4;
+ complexVectorPtr += 8;
+ }
+
+ for (number = quarter_points * 4; number < num_points; ++number) {
+ *complexVectorPtr++ = *iBuffer++;
+ *complexVectorPtr++ = *qBuffer++;
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_x2_interleave_32fc_generic(lv_32fc_t* complexVector, const float* iBuffer,
- const float* qBuffer, unsigned int num_points)
+static inline void volk_32f_x2_interleave_32fc_generic(lv_32fc_t* complexVector,
+ const float* iBuffer,
+ const float* qBuffer,
+ unsigned int num_points)
{
- float* complexVectorPtr = (float*)complexVector;
- const float* iBufferPtr = iBuffer;
- const float* qBufferPtr = qBuffer;
- unsigned int number;
-
- for(number = 0; number < num_points; number++){
- *complexVectorPtr++ = *iBufferPtr++;
- *complexVectorPtr++ = *qBufferPtr++;
- }
+ float* complexVectorPtr = (float*)complexVector;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+ unsigned int number;
+
+ for (number = 0; number < num_points; number++) {
+ *complexVectorPtr++ = *iBufferPtr++;
+ *complexVectorPtr++ = *qBufferPtr++;
+ }
}
#endif /* LV_HAVE_GENERIC */
-
#endif /* INCLUDED_volk_32f_x2_interleave_32fc_a_H */
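The interleave kernels above all implement the dispatcher documented earlier: they zip a real (I) buffer and an imaginary (Q) buffer into one complex output vector. A minimal usage sketch calling the generic kernel defined above (illustrative only, not part of this patch):

    void interleave_example(void)
    {
        float i_buf[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
        float q_buf[4] = { -1.0f, -2.0f, -3.0f, -4.0f };
        lv_32fc_t out[4];

        volk_32f_x2_interleave_32fc_generic(out, i_buf, q_buf, 4);
        /* out now holds (1,-1), (2,-2), (3,-3), (4,-4) as interleaved floats */
    }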
#ifndef INCLUDED_volk_32f_x2_interleave_32fc_u_H
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32f_x2_interleave_32fc_u_avx(lv_32fc_t* complexVector, const float* iBuffer,
- const float* qBuffer, unsigned int num_points)
+static inline void volk_32f_x2_interleave_32fc_u_avx(lv_32fc_t* complexVector,
+ const float* iBuffer,
+ const float* qBuffer,
+ unsigned int num_points)
{
- unsigned int number = 0;
- float* complexVectorPtr = (float*)complexVector;
- const float* iBufferPtr = iBuffer;
- const float* qBufferPtr = qBuffer;
-
- const uint64_t eighthPoints = num_points / 8;
-
- __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue;
- for(;number < eighthPoints; number++){
- iValue = _mm256_loadu_ps(iBufferPtr);
- qValue = _mm256_loadu_ps(qBufferPtr);
-
- // Interleaves the lower two values in the i and q variables into one buffer
- cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
- // Interleaves the upper two values in the i and q variables into one buffer
- cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
-
- cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
- _mm256_storeu_ps(complexVectorPtr, cplxValue);
- complexVectorPtr += 8;
-
- cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
- _mm256_storeu_ps(complexVectorPtr, cplxValue);
- complexVectorPtr += 8;
-
- iBufferPtr += 8;
- qBufferPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- *complexVectorPtr++ = *iBufferPtr++;
- *complexVectorPtr++ = *qBufferPtr++;
- }
+ unsigned int number = 0;
+ float* complexVectorPtr = (float*)complexVector;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+
+ const uint64_t eighthPoints = num_points / 8;
+
+ __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue;
+ for (; number < eighthPoints; number++) {
+ iValue = _mm256_loadu_ps(iBufferPtr);
+ qValue = _mm256_loadu_ps(qBufferPtr);
+
+ // Interleaves the lower two values in the i and q variables into one buffer
+ cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
+ // Interleaves the upper two values in the i and q variables into one buffer
+ cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
+
+ cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+ _mm256_storeu_ps(complexVectorPtr, cplxValue);
+ complexVectorPtr += 8;
+
+ cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+ _mm256_storeu_ps(complexVectorPtr, cplxValue);
+ complexVectorPtr += 8;
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *complexVectorPtr++ = *iBufferPtr++;
+ *complexVectorPtr++ = *qBufferPtr++;
+ }
}
#endif /* LV_HAVE_AVX */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_x2_max_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
- * \endcode
+ * void volk_32f_x2_max_32f(float* cVector, const float* aVector, const float* bVector,
+ * unsigned int num_points) \endcode
*
* \b Inputs
* \li aVector: First input vector.
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
-static inline void
-volk_32f_x2_max_32f_a_avx512f(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_max_32f_a_avx512f(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
- __m512 aVal, bVal, cVal;
- for(;number < sixteenthPoints; number++){
- aVal = _mm512_load_ps(aPtr);
- bVal = _mm512_load_ps(bPtr);
+ __m512 aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
+ aVal = _mm512_load_ps(aPtr);
+ bVal = _mm512_load_ps(bPtr);
- cVal = _mm512_max_ps(aVal, bVal);
+ cVal = _mm512_max_ps(aVal, bVal);
- _mm512_store_ps(cPtr,cVal); // Store the results back into the C container
+ _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 16;
- bPtr += 16;
- cPtr += 16;
- }
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
- number = sixteenthPoints * 16;
- for(;number < num_points; number++){
- const float a = *aPtr++;
- const float b = *bPtr++;
- *cPtr++ = ( a > b ? a : b);
- }
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = (a > b ? a : b);
+ }
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_32f_x2_max_32f_a_sse(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_max_32f_a_sse(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
- __m128 aVal, bVal, cVal;
- for(;number < quarterPoints; number++){
- aVal = _mm_load_ps(aPtr);
- bVal = _mm_load_ps(bPtr);
+ __m128 aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
- cVal = _mm_max_ps(aVal, bVal);
+ cVal = _mm_max_ps(aVal, bVal);
- _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- const float a = *aPtr++;
- const float b = *bPtr++;
- *cPtr++ = ( a > b ? a : b);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = (a > b ? a : b);
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32f_x2_max_32f_a_avx(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_max_32f_a_avx(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
- __m256 aVal, bVal, cVal;
- for(;number < eighthPoints; number++){
- aVal = _mm256_load_ps(aPtr);
- bVal = _mm256_load_ps(bPtr);
+ __m256 aVal, bVal, cVal;
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ bVal = _mm256_load_ps(bPtr);
- cVal = _mm256_max_ps(aVal, bVal);
+ cVal = _mm256_max_ps(aVal, bVal);
- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container
+ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 8;
- bPtr += 8;
- cPtr += 8;
- }
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- const float a = *aPtr++;
- const float b = *bPtr++;
- *cPtr++ = ( a > b ? a : b);
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = (a > b ? a : b);
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_32f_x2_max_32f_neon(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_max_32f_neon(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int quarter_points = num_points / 4;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
- unsigned int number = 0;
-
- float32x4_t a_vec, b_vec, c_vec;
- for(number = 0; number < quarter_points; number++){
- a_vec = vld1q_f32(aPtr);
- b_vec = vld1q_f32(bPtr);
- c_vec = vmaxq_f32(a_vec, b_vec);
- vst1q_f32(cPtr, c_vec);
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
-
- for(number = quarter_points*4; number < num_points; number++){
- const float a = *aPtr++;
- const float b = *bPtr++;
- *cPtr++ = ( a > b ? a : b);
- }
+ unsigned int quarter_points = num_points / 4;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+ unsigned int number = 0;
+
+ float32x4_t a_vec, b_vec, c_vec;
+ for (number = 0; number < quarter_points; number++) {
+ a_vec = vld1q_f32(aPtr);
+ b_vec = vld1q_f32(bPtr);
+ c_vec = vmaxq_f32(a_vec, b_vec);
+ vst1q_f32(cPtr, c_vec);
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = (a > b ? a : b);
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_x2_max_32f_generic(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_max_32f_generic(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- const float a = *aPtr++;
- const float b = *bPtr++;
- *cPtr++ = ( a > b ? a : b);
- }
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = (a > b ? a : b);
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_ORC
-extern void
-volk_32f_x2_max_32f_a_orc_impl(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points);
-
-static inline void
-volk_32f_x2_max_32f_u_orc(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+extern void volk_32f_x2_max_32f_a_orc_impl(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points);
+
+static inline void volk_32f_x2_max_32f_u_orc(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- volk_32f_x2_max_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+ volk_32f_x2_max_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
-static inline void
-volk_32f_x2_max_32f_u_avx512f(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_max_32f_u_avx512f(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
- __m512 aVal, bVal, cVal;
- for(;number < sixteenthPoints; number++){
- aVal = _mm512_loadu_ps(aPtr);
- bVal = _mm512_loadu_ps(bPtr);
+ __m512 aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
+ aVal = _mm512_loadu_ps(aPtr);
+ bVal = _mm512_loadu_ps(bPtr);
- cVal = _mm512_max_ps(aVal, bVal);
+ cVal = _mm512_max_ps(aVal, bVal);
- _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container
+ _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 16;
- bPtr += 16;
- cPtr += 16;
- }
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
- number = sixteenthPoints * 16;
- for(;number < num_points; number++){
- const float a = *aPtr++;
- const float b = *bPtr++;
- *cPtr++ = ( a > b ? a : b);
- }
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = (a > b ? a : b);
+ }
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32f_x2_max_32f_u_avx(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_max_32f_u_avx(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
- __m256 aVal, bVal, cVal;
- for(;number < eighthPoints; number++){
- aVal = _mm256_loadu_ps(aPtr);
- bVal = _mm256_loadu_ps(bPtr);
+ __m256 aVal, bVal, cVal;
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ bVal = _mm256_loadu_ps(bPtr);
- cVal = _mm256_max_ps(aVal, bVal);
+ cVal = _mm256_max_ps(aVal, bVal);
- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
+ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 8;
- bPtr += 8;
- cPtr += 8;
- }
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- const float a = *aPtr++;
- const float b = *bPtr++;
- *cPtr++ = ( a > b ? a : b);
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = (a > b ? a : b);
+ }
}
#endif /* LV_HAVE_AVX */
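All of the max kernels above compute c[i] = max(a[i], b[i]); the SIMD variants differ only in vector width and in whether they assume aligned loads, and each ends with the same scalar tail loop. A minimal usage sketch with the generic kernel defined above (illustrative only, not part of this patch):

    void max_example(void)
    {
        const float a[4] = { 0.5f, 2.0f, -1.0f, 3.0f };
        const float b[4] = { 1.0f, 1.5f, -2.0f, 4.0f };
        float c[4];

        volk_32f_x2_max_32f_generic(c, a, b, 4);
        /* c == { 1.0f, 2.0f, -1.0f, 4.0f } */
    }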
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_x2_min_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
- * \endcode
+ * void volk_32f_x2_min_32f(float* cVector, const float* aVector, const float* bVector,
+ * unsigned int num_points) \endcode
*
* \b Inputs
* \li aVector: First input vector.
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_32f_x2_min_32f_a_sse(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_min_32f_a_sse(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
- __m128 aVal, bVal, cVal;
- for(;number < quarterPoints; number++){
- aVal = _mm_load_ps(aPtr);
- bVal = _mm_load_ps(bPtr);
+ __m128 aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
- cVal = _mm_min_ps(aVal, bVal);
+ cVal = _mm_min_ps(aVal, bVal);
- _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- const float a = *aPtr++;
- const float b = *bPtr++;
- *cPtr++ = ( a < b ? a : b);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = (a < b ? a : b);
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_32f_x2_min_32f_neon(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_min_32f_neon(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
- unsigned int number = 0;
- unsigned int quarter_points = num_points / 4;
-
- float32x4_t a_vec, b_vec, c_vec;
- for(number = 0; number < quarter_points; number++){
- a_vec = vld1q_f32(aPtr);
- b_vec = vld1q_f32(bPtr);
-
- c_vec = vminq_f32(a_vec, b_vec);
-
- vst1q_f32(cPtr, c_vec);
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
-
- for(number = quarter_points*4; number < num_points; number++){
- const float a = *aPtr++;
- const float b = *bPtr++;
- *cPtr++ = ( a < b ? a : b);
- }
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+
+ float32x4_t a_vec, b_vec, c_vec;
+ for (number = 0; number < quarter_points; number++) {
+ a_vec = vld1q_f32(aPtr);
+ b_vec = vld1q_f32(bPtr);
+
+ c_vec = vminq_f32(a_vec, b_vec);
+
+ vst1q_f32(cPtr, c_vec);
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = (a < b ? a : b);
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_x2_min_32f_generic(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_min_32f_generic(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- const float a = *aPtr++;
- const float b = *bPtr++;
- *cPtr++ = ( a < b ? a : b);
- }
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = (a < b ? a : b);
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_ORC
-extern void
-volk_32f_x2_min_32f_a_orc_impl(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points);
+extern void volk_32f_x2_min_32f_a_orc_impl(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points);
-static inline void
-volk_32f_x2_min_32f_u_orc(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_min_32f_u_orc(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- volk_32f_x2_min_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+ volk_32f_x2_min_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32f_x2_min_32f_a_avx(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_min_32f_a_avx(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
- __m256 aVal, bVal, cVal;
- for(;number < eighthPoints; number++){
- aVal = _mm256_load_ps(aPtr);
- bVal = _mm256_load_ps(bPtr);
+ __m256 aVal, bVal, cVal;
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ bVal = _mm256_load_ps(bPtr);
- cVal = _mm256_min_ps(aVal, bVal);
+ cVal = _mm256_min_ps(aVal, bVal);
- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container
+ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 8;
- bPtr += 8;
- cPtr += 8;
- }
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- const float a = *aPtr++;
- const float b = *bPtr++;
- *cPtr++ = ( a < b ? a : b);
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = (a < b ? a : b);
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
-static inline void
-volk_32f_x2_min_32f_a_avx512f(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_min_32f_a_avx512f(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
- __m512 aVal, bVal, cVal;
- for(;number < sixteenthPoints; number++){
- aVal = _mm512_load_ps(aPtr);
- bVal = _mm512_load_ps(bPtr);
+ __m512 aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
+ aVal = _mm512_load_ps(aPtr);
+ bVal = _mm512_load_ps(bPtr);
- cVal = _mm512_min_ps(aVal, bVal);
+ cVal = _mm512_min_ps(aVal, bVal);
- _mm512_store_ps(cPtr,cVal); // Store the results back into the C container
+ _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 16;
- bPtr += 16;
- cPtr += 16;
- }
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
- number = sixteenthPoints * 16;
- for(;number < num_points; number++){
- const float a = *aPtr++;
- const float b = *bPtr++;
- *cPtr++ = ( a < b ? a : b);
- }
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = (a < b ? a : b);
+ }
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
-static inline void
-volk_32f_x2_min_32f_u_avx512f(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_min_32f_u_avx512f(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
- __m512 aVal, bVal, cVal;
- for(;number < sixteenthPoints; number++){
- aVal = _mm512_loadu_ps(aPtr);
- bVal = _mm512_loadu_ps(bPtr);
+ __m512 aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
+ aVal = _mm512_loadu_ps(aPtr);
+ bVal = _mm512_loadu_ps(bPtr);
- cVal = _mm512_min_ps(aVal, bVal);
+ cVal = _mm512_min_ps(aVal, bVal);
- _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container
+ _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 16;
- bPtr += 16;
- cPtr += 16;
- }
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
- number = sixteenthPoints * 16;
- for(;number < num_points; number++){
- const float a = *aPtr++;
- const float b = *bPtr++;
- *cPtr++ = ( a < b ? a : b);
- }
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = (a < b ? a : b);
+ }
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32f_x2_min_32f_u_avx(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_min_32f_u_avx(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
- __m256 aVal, bVal, cVal;
- for(;number < eighthPoints; number++){
- aVal = _mm256_loadu_ps(aPtr);
- bVal = _mm256_loadu_ps(bPtr);
+ __m256 aVal, bVal, cVal;
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ bVal = _mm256_loadu_ps(bPtr);
- cVal = _mm256_min_ps(aVal, bVal);
+ cVal = _mm256_min_ps(aVal, bVal);
- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
+ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 8;
- bPtr += 8;
- cPtr += 8;
- }
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- const float a = *aPtr++;
- const float b = *bPtr++;
- *cPtr++ = ( a < b ? a : b);
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = (a < b ? a : b);
+ }
}
#endif /* LV_HAVE_AVX */
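The min kernels mirror the max kernels, using vminq_f32/_mm_min_ps/_mm256_min_ps/_mm512_min_ps instead. Application code would normally call the dispatcher from the documented prototype rather than a specific _a_/_u_ variant; a sketch using VOLK's aligned allocator so an aligned kernel can be selected at run time (illustrative only, not part of this patch; it assumes <volk/volk.h> exposes volk_malloc, volk_free and volk_get_alignment as in an installed VOLK):

    #include <volk/volk.h>

    void min_example(unsigned int n)
    {
        size_t alignment = volk_get_alignment();
        float* a = (float*)volk_malloc(n * sizeof(float), alignment);
        float* b = (float*)volk_malloc(n * sizeof(float), alignment);
        float* c = (float*)volk_malloc(n * sizeof(float), alignment);
        unsigned int i;

        for (i = 0; i < n; i++) {
            a[i] = (float)i;
            b[i] = (float)(n - i);
        }

        volk_32f_x2_min_32f(c, a, b, n); /* c[i] = min(a[i], b[i]) */

        volk_free(a);
        volk_free(b);
        volk_free(c);
    }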
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_x2_multiply_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
- * \endcode
+ * void volk_32f_x2_multiply_32f(float* cVector, const float* aVector, const float*
+ * bVector, unsigned int num_points) \endcode
*
* \b Inputs
* \li aVector: First input vector.
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_32f_x2_multiply_32f_u_sse(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
- __m128 aVal, bVal, cVal;
- for(;number < quarterPoints; number++){
+ __m128 aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
- aVal = _mm_loadu_ps(aPtr);
- bVal = _mm_loadu_ps(bPtr);
+ aVal = _mm_loadu_ps(aPtr);
+ bVal = _mm_loadu_ps(bPtr);
- cVal = _mm_mul_ps(aVal, bVal);
+ cVal = _mm_mul_ps(aVal, bVal);
- _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container
+ _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) * (*bPtr++);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
-static inline void
-volk_32f_x2_multiply_32f_u_avx512f(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_multiply_32f_u_avx512f(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
- __m512 aVal, bVal, cVal;
- for(;number < sixteenthPoints; number++){
+ __m512 aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
- aVal = _mm512_loadu_ps(aPtr);
- bVal = _mm512_loadu_ps(bPtr);
+ aVal = _mm512_loadu_ps(aPtr);
+ bVal = _mm512_loadu_ps(bPtr);
- cVal = _mm512_mul_ps(aVal, bVal);
+ cVal = _mm512_mul_ps(aVal, bVal);
- _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container
+ _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 16;
- bPtr += 16;
- cPtr += 16;
- }
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
- number = sixteenthPoints * 16;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) * (*bPtr++);
- }
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32f_x2_multiply_32f_u_avx(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
- __m256 aVal, bVal, cVal;
- for(;number < eighthPoints; number++){
+ __m256 aVal, bVal, cVal;
+ for (; number < eighthPoints; number++) {
- aVal = _mm256_loadu_ps(aPtr);
- bVal = _mm256_loadu_ps(bPtr);
+ aVal = _mm256_loadu_ps(aPtr);
+ bVal = _mm256_loadu_ps(bPtr);
- cVal = _mm256_mul_ps(aVal, bVal);
+ cVal = _mm256_mul_ps(aVal, bVal);
- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
+ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 8;
- bPtr += 8;
- cPtr += 8;
- }
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) * (*bPtr++);
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_x2_multiply_32f_generic(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_multiply_32f_generic(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *cPtr++ = (*aPtr++) * (*bPtr++);
- }
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_32f_x2_multiply_32f_a_sse(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_multiply_32f_a_sse(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
- __m128 aVal, bVal, cVal;
- for(;number < quarterPoints; number++){
+ __m128 aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
- aVal = _mm_load_ps(aPtr);
- bVal = _mm_load_ps(bPtr);
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
- cVal = _mm_mul_ps(aVal, bVal);
+ cVal = _mm_mul_ps(aVal, bVal);
- _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) * (*bPtr++);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
-static inline void
-volk_32f_x2_multiply_32f_a_avx512f(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_multiply_32f_a_avx512f(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
- __m512 aVal, bVal, cVal;
- for(;number < sixteenthPoints; number++){
+ __m512 aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
- aVal = _mm512_load_ps(aPtr);
- bVal = _mm512_load_ps(bPtr);
+ aVal = _mm512_load_ps(aPtr);
+ bVal = _mm512_load_ps(bPtr);
- cVal = _mm512_mul_ps(aVal, bVal);
+ cVal = _mm512_mul_ps(aVal, bVal);
- _mm512_store_ps(cPtr,cVal); // Store the results back into the C container
+ _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 16;
- bPtr += 16;
- cPtr += 16;
- }
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
- number = sixteenthPoints * 16;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) * (*bPtr++);
- }
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32f_x2_multiply_32f_a_avx(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_multiply_32f_a_avx(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
- __m256 aVal, bVal, cVal;
- for(;number < eighthPoints; number++){
+ __m256 aVal, bVal, cVal;
+ for (; number < eighthPoints; number++) {
- aVal = _mm256_load_ps(aPtr);
- bVal = _mm256_load_ps(bPtr);
+ aVal = _mm256_load_ps(aPtr);
+ bVal = _mm256_load_ps(bPtr);
- cVal = _mm256_mul_ps(aVal, bVal);
+ cVal = _mm256_mul_ps(aVal, bVal);
- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container
+ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 8;
- bPtr += 8;
- cPtr += 8;
- }
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) * (*bPtr++);
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_32f_x2_multiply_32f_neon(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_multiply_32f_neon(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- const unsigned int quarter_points = num_points / 4;
- unsigned int number;
- float32x4_t avec, bvec, cvec;
- for(number=0; number < quarter_points; ++number) {
- avec = vld1q_f32(aVector);
- bvec = vld1q_f32(bVector);
- cvec = vmulq_f32(avec, bvec);
- vst1q_f32(cVector, cvec);
- aVector += 4;
- bVector += 4;
- cVector += 4;
- }
- for(number=quarter_points*4; number < num_points; ++number) {
- *cVector++ = *aVector++ * *bVector++;
- }
+ const unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+ float32x4_t avec, bvec, cvec;
+ for (number = 0; number < quarter_points; ++number) {
+ avec = vld1q_f32(aVector);
+ bvec = vld1q_f32(bVector);
+ cvec = vmulq_f32(avec, bvec);
+ vst1q_f32(cVector, cvec);
+ aVector += 4;
+ bVector += 4;
+ cVector += 4;
+ }
+ for (number = quarter_points * 4; number < num_points; ++number) {
+ *cVector++ = *aVector++ * *bVector++;
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_x2_multiply_32f_a_generic(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_multiply_32f_a_generic(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *cPtr++ = (*aPtr++) * (*bPtr++);
- }
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_ORC
-extern void
-volk_32f_x2_multiply_32f_a_orc_impl(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points);
-
-static inline void
-volk_32f_x2_multiply_32f_u_orc(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+extern void volk_32f_x2_multiply_32f_a_orc_impl(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points);
+
+static inline void volk_32f_x2_multiply_32f_u_orc(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+ volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */
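As with max and min, every multiply kernel above computes the element-wise product c[i] = a[i] * b[i], with the ORC variant delegating to a generated implementation. A minimal usage sketch with the generic kernel defined above (illustrative only, not part of this patch):

    void multiply_example(void)
    {
        const float a[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
        const float b[4] = { 2.0f, 2.0f, 2.0f, 2.0f };
        float c[4];

        volk_32f_x2_multiply_32f_generic(c, a, b, 4);
        /* c == { 2.0f, 4.0f, 6.0f, 8.0f } */
    }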
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_x2_pow_32f(float* cVector, const float* bVector, const float* aVector, unsigned int num_points)
- * \endcode
+ * void volk_32f_x2_pow_32f(float* cVector, const float* bVector, const float* aVector,
+ * unsigned int num_points) \endcode
*
* \b Inputs
* \li bVector: The input vector of indices (power values).
#ifndef INCLUDED_volk_32f_x2_pow_32f_a_H
#define INCLUDED_volk_32f_x2_pow_32f_a_H
-#include <stdio.h>
-#include <stdlib.h>
#include <inttypes.h>
#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
#define POW_POLY_DEGREE 3
#include <immintrin.h>
#define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
-#define POLY1_AVX2_FMA(x, c0, c1) _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
-#define POLY2_AVX2_FMA(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
-#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
-#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
-#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
-
-static inline void
-volk_32f_x2_pow_32f_a_avx2_fma(float* cVector, const float* bVector,
- const float* aVector, unsigned int num_points)
+#define POLY1_AVX2_FMA(x, c0, c1) \
+ _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
+#define POLY2_AVX2_FMA(x, c0, c1, c2) \
+ _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
+#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
+ _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
+#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
+ _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
+#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
+ _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
+
+static inline void volk_32f_x2_pow_32f_a_avx2_fma(float* cVector,
+ const float* bVector,
+ const float* aVector,
+ unsigned int num_points)
{
- float* cPtr = cVector;
- const float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
- __m256 tmp, fx, mask, pow2n, z, y;
- __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
- __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
- __m256i bias, exp, emm0, pi32_0x7f;
-
- one = _mm256_set1_ps(1.0);
- exp_hi = _mm256_set1_ps(88.3762626647949);
- exp_lo = _mm256_set1_ps(-88.3762626647949);
- ln2 = _mm256_set1_ps(0.6931471805);
- log2EF = _mm256_set1_ps(1.44269504088896341);
- half = _mm256_set1_ps(0.5);
- exp_C1 = _mm256_set1_ps(0.693359375);
- exp_C2 = _mm256_set1_ps(-2.12194440e-4);
- pi32_0x7f = _mm256_set1_epi32(0x7f);
-
- exp_p0 = _mm256_set1_ps(1.9875691500e-4);
- exp_p1 = _mm256_set1_ps(1.3981999507e-3);
- exp_p2 = _mm256_set1_ps(8.3334519073e-3);
- exp_p3 = _mm256_set1_ps(4.1665795894e-2);
- exp_p4 = _mm256_set1_ps(1.6666665459e-1);
- exp_p5 = _mm256_set1_ps(5.0000001201e-1);
-
- for(;number < eighthPoints; number++){
- // First compute the logarithm
- aVal = _mm256_load_ps(aPtr);
- bias = _mm256_set1_epi32(127);
- leadingOne = _mm256_set1_ps(1.0f);
- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
- logarithm = _mm256_cvtepi32_ps(exp);
-
- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
+ float* cPtr = cVector;
+ const float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+ __m256 tmp, fx, mask, pow2n, z, y;
+ __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+ __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+ __m256i bias, exp, emm0, pi32_0x7f;
+
+ one = _mm256_set1_ps(1.0);
+ exp_hi = _mm256_set1_ps(88.3762626647949);
+ exp_lo = _mm256_set1_ps(-88.3762626647949);
+ ln2 = _mm256_set1_ps(0.6931471805);
+ log2EF = _mm256_set1_ps(1.44269504088896341);
+ half = _mm256_set1_ps(0.5);
+ exp_C1 = _mm256_set1_ps(0.693359375);
+ exp_C2 = _mm256_set1_ps(-2.12194440e-4);
+ pi32_0x7f = _mm256_set1_epi32(0x7f);
+
+ exp_p0 = _mm256_set1_ps(1.9875691500e-4);
+ exp_p1 = _mm256_set1_ps(1.3981999507e-3);
+ exp_p2 = _mm256_set1_ps(8.3334519073e-3);
+ exp_p3 = _mm256_set1_ps(4.1665795894e-2);
+ exp_p4 = _mm256_set1_ps(1.6666665459e-1);
+ exp_p5 = _mm256_set1_ps(5.0000001201e-1);
+
+ for (; number < eighthPoints; number++) {
+ // First compute the logarithm
+ aVal = _mm256_load_ps(aPtr);
+ bias = _mm256_set1_epi32(127);
+ leadingOne = _mm256_set1_ps(1.0f);
+ exp = _mm256_sub_epi32(
+ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
+ _mm256_set1_epi32(0x7f800000)),
+ 23),
+ bias);
+ logarithm = _mm256_cvtepi32_ps(exp);
+
+ frac = _mm256_or_ps(
+ leadingOne,
+ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
#if POW_POLY_DEGREE == 6
- mantissa = POLY5_AVX2_FMA( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
+ mantissa = POLY5_AVX2_FMA(frac,
+ 3.1157899f,
+ -3.3241990f,
+ 2.5988452f,
+ -1.2315303f,
+ 3.1821337e-1f,
+ -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
- mantissa = POLY4_AVX2_FMA( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+ mantissa = POLY4_AVX2_FMA(frac,
+ 2.8882704548164776201f,
+ -2.52074962577807006663f,
+ 1.48116647521213171641f,
+ -0.465725644288844778798f,
+ 0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
- mantissa = POLY3_AVX2_FMA( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+ mantissa = POLY3_AVX2_FMA(frac,
+ 2.61761038894603480148f,
+ -1.75647175389045657003f,
+ 0.688243882994381274313f,
+ -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
- mantissa = POLY2_AVX2_FMA( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+ mantissa = POLY2_AVX2_FMA(frac,
+ 2.28330284476918490682f,
+ -1.04913055217340124191f,
+ 0.204446009836232697516f);
#else
#error
#endif
- logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
- logarithm = _mm256_mul_ps(logarithm, ln2);
+ logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
+ logarithm = _mm256_mul_ps(logarithm, ln2);
- // Now calculate b*lna
- bVal = _mm256_load_ps(bPtr);
- bVal = _mm256_mul_ps(bVal, logarithm);
+ // Now calculate b*lna
+ bVal = _mm256_load_ps(bPtr);
+ bVal = _mm256_mul_ps(bVal, logarithm);
- // Now compute exp(b*lna)
- bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
+ // Now compute exp(b*lna)
+ bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
- fx = _mm256_fmadd_ps(bVal, log2EF, half);
+ fx = _mm256_fmadd_ps(bVal, log2EF, half);
- emm0 = _mm256_cvttps_epi32(fx);
- tmp = _mm256_cvtepi32_ps(emm0);
+ emm0 = _mm256_cvttps_epi32(fx);
+ tmp = _mm256_cvtepi32_ps(emm0);
- mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
- fx = _mm256_sub_ps(tmp, mask);
+ mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
+ fx = _mm256_sub_ps(tmp, mask);
- tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
- bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
- z = _mm256_mul_ps(bVal, bVal);
+ tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
+ bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
+ z = _mm256_mul_ps(bVal, bVal);
- y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
- y = _mm256_fmadd_ps(y, bVal, exp_p2);
- y = _mm256_fmadd_ps(y, bVal, exp_p3);
- y = _mm256_fmadd_ps(y, bVal, exp_p4);
- y = _mm256_fmadd_ps(y, bVal, exp_p5);
- y = _mm256_fmadd_ps(y, z, bVal);
- y = _mm256_add_ps(y, one);
+ y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
+ y = _mm256_fmadd_ps(y, bVal, exp_p2);
+ y = _mm256_fmadd_ps(y, bVal, exp_p3);
+ y = _mm256_fmadd_ps(y, bVal, exp_p4);
+ y = _mm256_fmadd_ps(y, bVal, exp_p5);
+ y = _mm256_fmadd_ps(y, z, bVal);
+ y = _mm256_add_ps(y, one);
- emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
+ emm0 =
+ _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
pow2n = _mm256_castsi256_ps(emm0);
cVal = _mm256_mul_ps(y, pow2n);
aPtr += 8;
bPtr += 8;
cPtr += 8;
- }
+ }
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *cPtr++ = pow(*aPtr++, *bPtr++);
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = pow(*aPtr++, *bPtr++);
+ }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
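The AVX2/FMA pow kernel above evaluates c = a^b per element as exp(b * ln(a)): it approximates log2(a) from the float exponent bits plus a polynomial in the mantissa (the degree chosen by POW_POLY_DEGREE), converts to ln(a) by multiplying with ln(2), multiplies by b, and reconstructs exp() from another polynomial times a power of two rebuilt in the exponent bits. A scalar reference of the same identity, using libm instead of the polynomial approximations (illustrative only, not part of this patch):

    #include <math.h>

    /* What the SIMD body computes per element (valid for a > 0); the kernel's
     * own scalar tail loop falls back to pow() for the leftover points. */
    static inline float pow_reference(float a, float b)
    {
        return expf(b * logf(a));
    }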
#include <immintrin.h>
#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
-#define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
-#define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
-#define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
-#define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
-#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
-
-static inline void
-volk_32f_x2_pow_32f_a_avx2(float* cVector, const float* bVector,
- const float* aVector, unsigned int num_points)
+#define POLY1_AVX2(x, c0, c1) \
+ _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
+#define POLY2_AVX2(x, c0, c1, c2) \
+ _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
+#define POLY3_AVX2(x, c0, c1, c2, c3) \
+ _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
+#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
+ _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
+#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
+ _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
+
+static inline void volk_32f_x2_pow_32f_a_avx2(float* cVector,
+ const float* bVector,
+ const float* aVector,
+ unsigned int num_points)
{
- float* cPtr = cVector;
- const float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
- __m256 tmp, fx, mask, pow2n, z, y;
- __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
- __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
- __m256i bias, exp, emm0, pi32_0x7f;
-
- one = _mm256_set1_ps(1.0);
- exp_hi = _mm256_set1_ps(88.3762626647949);
- exp_lo = _mm256_set1_ps(-88.3762626647949);
- ln2 = _mm256_set1_ps(0.6931471805);
- log2EF = _mm256_set1_ps(1.44269504088896341);
- half = _mm256_set1_ps(0.5);
- exp_C1 = _mm256_set1_ps(0.693359375);
- exp_C2 = _mm256_set1_ps(-2.12194440e-4);
- pi32_0x7f = _mm256_set1_epi32(0x7f);
-
- exp_p0 = _mm256_set1_ps(1.9875691500e-4);
- exp_p1 = _mm256_set1_ps(1.3981999507e-3);
- exp_p2 = _mm256_set1_ps(8.3334519073e-3);
- exp_p3 = _mm256_set1_ps(4.1665795894e-2);
- exp_p4 = _mm256_set1_ps(1.6666665459e-1);
- exp_p5 = _mm256_set1_ps(5.0000001201e-1);
-
- for(;number < eighthPoints; number++){
- // First compute the logarithm
- aVal = _mm256_load_ps(aPtr);
- bias = _mm256_set1_epi32(127);
- leadingOne = _mm256_set1_ps(1.0f);
- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
- logarithm = _mm256_cvtepi32_ps(exp);
-
- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
+ float* cPtr = cVector;
+ const float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+ __m256 tmp, fx, mask, pow2n, z, y;
+ __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+ __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+ __m256i bias, exp, emm0, pi32_0x7f;
+
+ one = _mm256_set1_ps(1.0);
+ exp_hi = _mm256_set1_ps(88.3762626647949);
+ exp_lo = _mm256_set1_ps(-88.3762626647949);
+ ln2 = _mm256_set1_ps(0.6931471805);
+ log2EF = _mm256_set1_ps(1.44269504088896341);
+ half = _mm256_set1_ps(0.5);
+ exp_C1 = _mm256_set1_ps(0.693359375);
+ exp_C2 = _mm256_set1_ps(-2.12194440e-4);
+ pi32_0x7f = _mm256_set1_epi32(0x7f);
+
+ exp_p0 = _mm256_set1_ps(1.9875691500e-4);
+ exp_p1 = _mm256_set1_ps(1.3981999507e-3);
+ exp_p2 = _mm256_set1_ps(8.3334519073e-3);
+ exp_p3 = _mm256_set1_ps(4.1665795894e-2);
+ exp_p4 = _mm256_set1_ps(1.6666665459e-1);
+ exp_p5 = _mm256_set1_ps(5.0000001201e-1);
+
+ for (; number < eighthPoints; number++) {
+ // First compute the logarithm
+ aVal = _mm256_load_ps(aPtr);
+ bias = _mm256_set1_epi32(127);
+ leadingOne = _mm256_set1_ps(1.0f);
+ exp = _mm256_sub_epi32(
+ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
+ _mm256_set1_epi32(0x7f800000)),
+ 23),
+ bias);
+ logarithm = _mm256_cvtepi32_ps(exp);
+
+ frac = _mm256_or_ps(
+ leadingOne,
+ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
#if POW_POLY_DEGREE == 6
- mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
+ mantissa = POLY5_AVX2(frac,
+ 3.1157899f,
+ -3.3241990f,
+ 2.5988452f,
+ -1.2315303f,
+ 3.1821337e-1f,
+ -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
- mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+ mantissa = POLY4_AVX2(frac,
+ 2.8882704548164776201f,
+ -2.52074962577807006663f,
+ 1.48116647521213171641f,
+ -0.465725644288844778798f,
+ 0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
- mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+ mantissa = POLY3_AVX2(frac,
+ 2.61761038894603480148f,
+ -1.75647175389045657003f,
+ 0.688243882994381274313f,
+ -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
- mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+ mantissa = POLY2_AVX2(frac,
+ 2.28330284476918490682f,
+ -1.04913055217340124191f,
+ 0.204446009836232697516f);
#else
#error
#endif
- logarithm = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
- logarithm = _mm256_mul_ps(logarithm, ln2);
+ logarithm = _mm256_add_ps(
+ _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
+ logarithm = _mm256_mul_ps(logarithm, ln2);
- // Now calculate b*lna
- bVal = _mm256_load_ps(bPtr);
- bVal = _mm256_mul_ps(bVal, logarithm);
+ // Now calculate b*lna
+ bVal = _mm256_load_ps(bPtr);
+ bVal = _mm256_mul_ps(bVal, logarithm);
- // Now compute exp(b*lna)
- bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
+ // Now compute exp(b*lna)
+ bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
- fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
+ fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
- emm0 = _mm256_cvttps_epi32(fx);
- tmp = _mm256_cvtepi32_ps(emm0);
+ emm0 = _mm256_cvttps_epi32(fx);
+ tmp = _mm256_cvtepi32_ps(emm0);
- mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
- fx = _mm256_sub_ps(tmp, mask);
+ mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
+ fx = _mm256_sub_ps(tmp, mask);
- tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
- bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
- z = _mm256_mul_ps(bVal, bVal);
+ tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
+ bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
+ z = _mm256_mul_ps(bVal, bVal);
- y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
- y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
- y = _mm256_add_ps(y, one);
+ y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
+ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
+ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
+ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
+ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
+ y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
+ y = _mm256_add_ps(y, one);
- emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
+ emm0 =
+ _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
pow2n = _mm256_castsi256_ps(emm0);
cVal = _mm256_mul_ps(y, pow2n);
aPtr += 8;
bPtr += 8;
cPtr += 8;
- }
+ }
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *cPtr++ = pow(*aPtr++, *bPtr++);
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = pow(*aPtr++, *bPtr++);
+ }
}
#endif /* LV_HAVE_AVX2 for aligned */
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
-#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
-#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
-#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
-
-static inline void
-volk_32f_x2_pow_32f_a_sse4_1(float* cVector, const float* bVector,
- const float* aVector, unsigned int num_points)
+#define POLY3(x, c0, c1, c2, c3) \
+ _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
+#define POLY4(x, c0, c1, c2, c3, c4) \
+ _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) \
+ _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
+
+static inline void volk_32f_x2_pow_32f_a_sse4_1(float* cVector,
+ const float* bVector,
+ const float* aVector,
+ unsigned int num_points)
{
- float* cPtr = cVector;
- const float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
- __m128 tmp, fx, mask, pow2n, z, y;
- __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
- __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
- __m128i bias, exp, emm0, pi32_0x7f;
-
- one = _mm_set1_ps(1.0);
- exp_hi = _mm_set1_ps(88.3762626647949);
- exp_lo = _mm_set1_ps(-88.3762626647949);
- ln2 = _mm_set1_ps(0.6931471805);
- log2EF = _mm_set1_ps(1.44269504088896341);
- half = _mm_set1_ps(0.5);
- exp_C1 = _mm_set1_ps(0.693359375);
- exp_C2 = _mm_set1_ps(-2.12194440e-4);
- pi32_0x7f = _mm_set1_epi32(0x7f);
-
- exp_p0 = _mm_set1_ps(1.9875691500e-4);
- exp_p1 = _mm_set1_ps(1.3981999507e-3);
- exp_p2 = _mm_set1_ps(8.3334519073e-3);
- exp_p3 = _mm_set1_ps(4.1665795894e-2);
- exp_p4 = _mm_set1_ps(1.6666665459e-1);
- exp_p5 = _mm_set1_ps(5.0000001201e-1);
-
- for(;number < quarterPoints; number++){
- // First compute the logarithm
- aVal = _mm_load_ps(aPtr);
- bias = _mm_set1_epi32(127);
- leadingOne = _mm_set1_ps(1.0f);
- exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
- logarithm = _mm_cvtepi32_ps(exp);
-
- frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
+ float* cPtr = cVector;
+ const float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+ __m128 tmp, fx, mask, pow2n, z, y;
+ __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+ __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+ __m128i bias, exp, emm0, pi32_0x7f;
+
+ one = _mm_set1_ps(1.0);
+ exp_hi = _mm_set1_ps(88.3762626647949);
+ exp_lo = _mm_set1_ps(-88.3762626647949);
+ ln2 = _mm_set1_ps(0.6931471805);
+ log2EF = _mm_set1_ps(1.44269504088896341);
+ half = _mm_set1_ps(0.5);
+ exp_C1 = _mm_set1_ps(0.693359375);
+ exp_C2 = _mm_set1_ps(-2.12194440e-4);
+ pi32_0x7f = _mm_set1_epi32(0x7f);
+
+ exp_p0 = _mm_set1_ps(1.9875691500e-4);
+ exp_p1 = _mm_set1_ps(1.3981999507e-3);
+ exp_p2 = _mm_set1_ps(8.3334519073e-3);
+ exp_p3 = _mm_set1_ps(4.1665795894e-2);
+ exp_p4 = _mm_set1_ps(1.6666665459e-1);
+ exp_p5 = _mm_set1_ps(5.0000001201e-1);
+
+ for (; number < quarterPoints; number++) {
+ // First compute the logarithm
+ aVal = _mm_load_ps(aPtr);
+ bias = _mm_set1_epi32(127);
+ leadingOne = _mm_set1_ps(1.0f);
+ exp = _mm_sub_epi32(
+ _mm_srli_epi32(
+ _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
+ bias);
+ logarithm = _mm_cvtepi32_ps(exp);
+
+ frac = _mm_or_ps(leadingOne,
+ _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
#if POW_POLY_DEGREE == 6
- mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
+ mantissa = POLY5(frac,
+ 3.1157899f,
+ -3.3241990f,
+ 2.5988452f,
+ -1.2315303f,
+ 3.1821337e-1f,
+ -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
- mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+ mantissa = POLY4(frac,
+ 2.8882704548164776201f,
+ -2.52074962577807006663f,
+ 1.48116647521213171641f,
+ -0.465725644288844778798f,
+ 0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
- mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+ mantissa = POLY3(frac,
+ 2.61761038894603480148f,
+ -1.75647175389045657003f,
+ 0.688243882994381274313f,
+ -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
- mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+ mantissa = POLY2(frac,
+ 2.28330284476918490682f,
+ -1.04913055217340124191f,
+ 0.204446009836232697516f);
#else
#error
#endif
- logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
- logarithm = _mm_mul_ps(logarithm, ln2);
+ logarithm =
+ _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
+ logarithm = _mm_mul_ps(logarithm, ln2);
- // Now calculate b*lna
- bVal = _mm_load_ps(bPtr);
- bVal = _mm_mul_ps(bVal, logarithm);
+ // Now calculate b*lna
+ bVal = _mm_load_ps(bPtr);
+ bVal = _mm_mul_ps(bVal, logarithm);
- // Now compute exp(b*lna)
- bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
+ // Now compute exp(b*lna)
+ bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
- fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
+ fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
- emm0 = _mm_cvttps_epi32(fx);
- tmp = _mm_cvtepi32_ps(emm0);
+ emm0 = _mm_cvttps_epi32(fx);
+ tmp = _mm_cvtepi32_ps(emm0);
- mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
- fx = _mm_sub_ps(tmp, mask);
+ mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
+ fx = _mm_sub_ps(tmp, mask);
- tmp = _mm_mul_ps(fx, exp_C1);
- z = _mm_mul_ps(fx, exp_C2);
- bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
- z = _mm_mul_ps(bVal, bVal);
+ tmp = _mm_mul_ps(fx, exp_C1);
+ z = _mm_mul_ps(fx, exp_C2);
+ bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
+ z = _mm_mul_ps(bVal, bVal);
- y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
- y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
- y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
- y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
- y = _mm_add_ps(y, one);
+ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
+ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
+ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
+ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
+ y = _mm_add_ps(y, one);
- emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
+ emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
pow2n = _mm_castsi128_ps(emm0);
cVal = _mm_mul_ps(y, pow2n);
aPtr += 4;
bPtr += 4;
cPtr += 4;
- }
+ }
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *cPtr++ = powf(*aPtr++, *bPtr++);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = powf(*aPtr++, *bPtr++);
+ }
}
#endif /* LV_HAVE_SSE4_1 for aligned */
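The POLYn helpers above expand at preprocessing time into a Horner-scheme chain of multiply/add intrinsics, so POLY2(x, c0, c1, c2) evaluates (c2 * x + c1) * x + c0 lane-wise. A standalone sketch of that expansion follows; the main() driver is an illustration only and is not part of the patch.

#include <stdio.h>
#include <xmmintrin.h>

#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))

int main(void)
{
    __m128 x = _mm_set_ps(1.75f, 1.5f, 1.25f, 1.0f); /* four mantissa values in [1, 2) */
    __m128 p = POLY2(x,
                     2.28330284476918490682f,
                     -1.04913055217340124191f,
                     0.204446009836232697516f);
    float out[4];
    _mm_storeu_ps(out, p);
    for (int i = 0; i < 4; i++) {
        float xi = 1.0f + 0.25f * (float)i;
        float ref = (0.204446009836232697516f * xi - 1.04913055217340124191f) * xi +
                    2.28330284476918490682f;
        printf("poly(%.2f) = %f (scalar Horner: %f)\n", xi, out[i], ref);
    }
    return 0;
}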
#ifndef INCLUDED_volk_32f_x2_pow_32f_u_H
#define INCLUDED_volk_32f_x2_pow_32f_u_H
-#include <stdio.h>
-#include <stdlib.h>
#include <inttypes.h>
#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
#define POW_POLY_DEGREE 3
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_x2_pow_32f_generic(float* cVector, const float* bVector,
- const float* aVector, unsigned int num_points)
+static inline void volk_32f_x2_pow_32f_generic(float* cVector,
+ const float* bVector,
+ const float* aVector,
+ unsigned int num_points)
{
- float* cPtr = cVector;
- const float* bPtr = bVector;
- const float* aPtr = aVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *cPtr++ = powf(*aPtr++, *bPtr++);
- }
+ float* cPtr = cVector;
+ const float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = powf(*aPtr++, *bPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
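Every variant of this kernel keeps the same calling convention: the output vector comes first, then the exponents, then the bases, then the element count, and the result is cVector[i] = powf(aVector[i], bVector[i]). A minimal usage sketch against the generic kernel above; the wrapper name and the example values are illustrative only.

static void example_pow_usage(void)
{
    float bases[4] = { 2.0f, 3.0f, 4.0f, 9.0f };
    float exponents[4] = { 3.0f, 2.0f, 0.5f, 0.5f };
    float results[4];

    /* Note the argument order: output, exponents, bases, count */
    volk_32f_x2_pow_32f_generic(results, exponents, bases, 4);
    /* results -> { 8.0f, 9.0f, 2.0f, 3.0f } */
}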
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
-#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
-#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
-#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
-
-static inline void
-volk_32f_x2_pow_32f_u_sse4_1(float* cVector, const float* bVector,
- const float* aVector, unsigned int num_points)
+#define POLY3(x, c0, c1, c2, c3) \
+ _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
+#define POLY4(x, c0, c1, c2, c3, c4) \
+ _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) \
+ _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
+
+static inline void volk_32f_x2_pow_32f_u_sse4_1(float* cVector,
+ const float* bVector,
+ const float* aVector,
+ unsigned int num_points)
{
- float* cPtr = cVector;
- const float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
- __m128 tmp, fx, mask, pow2n, z, y;
- __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
- __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
- __m128i bias, exp, emm0, pi32_0x7f;
-
- one = _mm_set1_ps(1.0);
- exp_hi = _mm_set1_ps(88.3762626647949);
- exp_lo = _mm_set1_ps(-88.3762626647949);
- ln2 = _mm_set1_ps(0.6931471805);
- log2EF = _mm_set1_ps(1.44269504088896341);
- half = _mm_set1_ps(0.5);
- exp_C1 = _mm_set1_ps(0.693359375);
- exp_C2 = _mm_set1_ps(-2.12194440e-4);
- pi32_0x7f = _mm_set1_epi32(0x7f);
-
- exp_p0 = _mm_set1_ps(1.9875691500e-4);
- exp_p1 = _mm_set1_ps(1.3981999507e-3);
- exp_p2 = _mm_set1_ps(8.3334519073e-3);
- exp_p3 = _mm_set1_ps(4.1665795894e-2);
- exp_p4 = _mm_set1_ps(1.6666665459e-1);
- exp_p5 = _mm_set1_ps(5.0000001201e-1);
-
- for(;number < quarterPoints; number++){
- // First compute the logarithm
- aVal = _mm_loadu_ps(aPtr);
- bias = _mm_set1_epi32(127);
- leadingOne = _mm_set1_ps(1.0f);
- exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
- logarithm = _mm_cvtepi32_ps(exp);
-
- frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
+ float* cPtr = cVector;
+ const float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+ __m128 tmp, fx, mask, pow2n, z, y;
+ __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+ __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+ __m128i bias, exp, emm0, pi32_0x7f;
+
+ one = _mm_set1_ps(1.0);
+ exp_hi = _mm_set1_ps(88.3762626647949);
+ exp_lo = _mm_set1_ps(-88.3762626647949);
+ ln2 = _mm_set1_ps(0.6931471805);
+ log2EF = _mm_set1_ps(1.44269504088896341);
+ half = _mm_set1_ps(0.5);
+ exp_C1 = _mm_set1_ps(0.693359375);
+ exp_C2 = _mm_set1_ps(-2.12194440e-4);
+ pi32_0x7f = _mm_set1_epi32(0x7f);
+
+ exp_p0 = _mm_set1_ps(1.9875691500e-4);
+ exp_p1 = _mm_set1_ps(1.3981999507e-3);
+ exp_p2 = _mm_set1_ps(8.3334519073e-3);
+ exp_p3 = _mm_set1_ps(4.1665795894e-2);
+ exp_p4 = _mm_set1_ps(1.6666665459e-1);
+ exp_p5 = _mm_set1_ps(5.0000001201e-1);
+
+ for (; number < quarterPoints; number++) {
+ // First compute the logarithm
+ aVal = _mm_loadu_ps(aPtr);
+ bias = _mm_set1_epi32(127);
+ leadingOne = _mm_set1_ps(1.0f);
+ exp = _mm_sub_epi32(
+ _mm_srli_epi32(
+ _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
+ bias);
+ logarithm = _mm_cvtepi32_ps(exp);
+
+ frac = _mm_or_ps(leadingOne,
+ _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
#if POW_POLY_DEGREE == 6
- mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
+ mantissa = POLY5(frac,
+ 3.1157899f,
+ -3.3241990f,
+ 2.5988452f,
+ -1.2315303f,
+ 3.1821337e-1f,
+ -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
- mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+ mantissa = POLY4(frac,
+ 2.8882704548164776201f,
+ -2.52074962577807006663f,
+ 1.48116647521213171641f,
+ -0.465725644288844778798f,
+ 0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
- mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+ mantissa = POLY3(frac,
+ 2.61761038894603480148f,
+ -1.75647175389045657003f,
+ 0.688243882994381274313f,
+ -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
- mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+ mantissa = POLY2(frac,
+ 2.28330284476918490682f,
+ -1.04913055217340124191f,
+ 0.204446009836232697516f);
#else
#error
#endif
- logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
- logarithm = _mm_mul_ps(logarithm, ln2);
+ logarithm =
+ _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
+ logarithm = _mm_mul_ps(logarithm, ln2);
- // Now calculate b*lna
- bVal = _mm_loadu_ps(bPtr);
- bVal = _mm_mul_ps(bVal, logarithm);
+ // Now calculate b*lna
+ bVal = _mm_loadu_ps(bPtr);
+ bVal = _mm_mul_ps(bVal, logarithm);
- // Now compute exp(b*lna)
- bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
+ // Now compute exp(b*lna)
+ bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
- fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
+ fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
- emm0 = _mm_cvttps_epi32(fx);
- tmp = _mm_cvtepi32_ps(emm0);
+ emm0 = _mm_cvttps_epi32(fx);
+ tmp = _mm_cvtepi32_ps(emm0);
- mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
- fx = _mm_sub_ps(tmp, mask);
+ mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
+ fx = _mm_sub_ps(tmp, mask);
- tmp = _mm_mul_ps(fx, exp_C1);
- z = _mm_mul_ps(fx, exp_C2);
- bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
- z = _mm_mul_ps(bVal, bVal);
+ tmp = _mm_mul_ps(fx, exp_C1);
+ z = _mm_mul_ps(fx, exp_C2);
+ bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
+ z = _mm_mul_ps(bVal, bVal);
- y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
- y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
- y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
- y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
- y = _mm_add_ps(y, one);
+ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
+ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
+ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
+ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
+ y = _mm_add_ps(y, one);
- emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
+ emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
- pow2n = _mm_castsi128_ps(emm0);
- cVal = _mm_mul_ps(y, pow2n);
+ pow2n = _mm_castsi128_ps(emm0);
+ cVal = _mm_mul_ps(y, pow2n);
- _mm_storeu_ps(cPtr, cVal);
+ _mm_storeu_ps(cPtr, cVal);
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *cPtr++ = powf(*aPtr++, *bPtr++);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = powf(*aPtr++, *bPtr++);
+ }
}
#endif /* LV_HAVE_SSE4_1 for unaligned */
#include <immintrin.h>
#define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
-#define POLY1_AVX2_FMA(x, c0, c1) _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
-#define POLY2_AVX2_FMA(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
-#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
-#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
-#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
-
-static inline void
-volk_32f_x2_pow_32f_u_avx2_fma(float* cVector, const float* bVector,
- const float* aVector, unsigned int num_points)
+#define POLY1_AVX2_FMA(x, c0, c1) \
+ _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
+#define POLY2_AVX2_FMA(x, c0, c1, c2) \
+ _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
+#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
+ _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
+#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
+ _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
+#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
+ _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
+
+static inline void volk_32f_x2_pow_32f_u_avx2_fma(float* cVector,
+ const float* bVector,
+ const float* aVector,
+ unsigned int num_points)
{
- float* cPtr = cVector;
- const float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
- __m256 tmp, fx, mask, pow2n, z, y;
- __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
- __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
- __m256i bias, exp, emm0, pi32_0x7f;
-
- one = _mm256_set1_ps(1.0);
- exp_hi = _mm256_set1_ps(88.3762626647949);
- exp_lo = _mm256_set1_ps(-88.3762626647949);
- ln2 = _mm256_set1_ps(0.6931471805);
- log2EF = _mm256_set1_ps(1.44269504088896341);
- half = _mm256_set1_ps(0.5);
- exp_C1 = _mm256_set1_ps(0.693359375);
- exp_C2 = _mm256_set1_ps(-2.12194440e-4);
- pi32_0x7f = _mm256_set1_epi32(0x7f);
-
- exp_p0 = _mm256_set1_ps(1.9875691500e-4);
- exp_p1 = _mm256_set1_ps(1.3981999507e-3);
- exp_p2 = _mm256_set1_ps(8.3334519073e-3);
- exp_p3 = _mm256_set1_ps(4.1665795894e-2);
- exp_p4 = _mm256_set1_ps(1.6666665459e-1);
- exp_p5 = _mm256_set1_ps(5.0000001201e-1);
-
- for(;number < eighthPoints; number++){
- // First compute the logarithm
- aVal = _mm256_loadu_ps(aPtr);
- bias = _mm256_set1_epi32(127);
- leadingOne = _mm256_set1_ps(1.0f);
- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
- logarithm = _mm256_cvtepi32_ps(exp);
-
- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
+ float* cPtr = cVector;
+ const float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+ __m256 tmp, fx, mask, pow2n, z, y;
+ __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+ __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+ __m256i bias, exp, emm0, pi32_0x7f;
+
+ one = _mm256_set1_ps(1.0);
+ exp_hi = _mm256_set1_ps(88.3762626647949);
+ exp_lo = _mm256_set1_ps(-88.3762626647949);
+ ln2 = _mm256_set1_ps(0.6931471805);
+ log2EF = _mm256_set1_ps(1.44269504088896341);
+ half = _mm256_set1_ps(0.5);
+ exp_C1 = _mm256_set1_ps(0.693359375);
+ exp_C2 = _mm256_set1_ps(-2.12194440e-4);
+ pi32_0x7f = _mm256_set1_epi32(0x7f);
+
+ exp_p0 = _mm256_set1_ps(1.9875691500e-4);
+ exp_p1 = _mm256_set1_ps(1.3981999507e-3);
+ exp_p2 = _mm256_set1_ps(8.3334519073e-3);
+ exp_p3 = _mm256_set1_ps(4.1665795894e-2);
+ exp_p4 = _mm256_set1_ps(1.6666665459e-1);
+ exp_p5 = _mm256_set1_ps(5.0000001201e-1);
+
+ for (; number < eighthPoints; number++) {
+ // First compute the logarithm
+ aVal = _mm256_loadu_ps(aPtr);
+ bias = _mm256_set1_epi32(127);
+ leadingOne = _mm256_set1_ps(1.0f);
+ exp = _mm256_sub_epi32(
+ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
+ _mm256_set1_epi32(0x7f800000)),
+ 23),
+ bias);
+ logarithm = _mm256_cvtepi32_ps(exp);
+
+ frac = _mm256_or_ps(
+ leadingOne,
+ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
#if POW_POLY_DEGREE == 6
- mantissa = POLY5_AVX2_FMA( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
+ mantissa = POLY5_AVX2_FMA(frac,
+ 3.1157899f,
+ -3.3241990f,
+ 2.5988452f,
+ -1.2315303f,
+ 3.1821337e-1f,
+ -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
- mantissa = POLY4_AVX2_FMA( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+ mantissa = POLY4_AVX2_FMA(frac,
+ 2.8882704548164776201f,
+ -2.52074962577807006663f,
+ 1.48116647521213171641f,
+ -0.465725644288844778798f,
+ 0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
- mantissa = POLY3_AVX2_FMA( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+ mantissa = POLY3_AVX2_FMA(frac,
+ 2.61761038894603480148f,
+ -1.75647175389045657003f,
+ 0.688243882994381274313f,
+ -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
- mantissa = POLY2_AVX2_FMA( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+ mantissa = POLY2_AVX2_FMA(frac,
+ 2.28330284476918490682f,
+ -1.04913055217340124191f,
+ 0.204446009836232697516f);
#else
#error
#endif
- logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
- logarithm = _mm256_mul_ps(logarithm, ln2);
+ logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
+ logarithm = _mm256_mul_ps(logarithm, ln2);
- // Now calculate b*lna
- bVal = _mm256_loadu_ps(bPtr);
- bVal = _mm256_mul_ps(bVal, logarithm);
+ // Now calculate b*lna
+ bVal = _mm256_loadu_ps(bPtr);
+ bVal = _mm256_mul_ps(bVal, logarithm);
- // Now compute exp(b*lna)
- bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
+ // Now compute exp(b*lna)
+ bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
- fx = _mm256_fmadd_ps(bVal, log2EF, half);
+ fx = _mm256_fmadd_ps(bVal, log2EF, half);
- emm0 = _mm256_cvttps_epi32(fx);
- tmp = _mm256_cvtepi32_ps(emm0);
+ emm0 = _mm256_cvttps_epi32(fx);
+ tmp = _mm256_cvtepi32_ps(emm0);
- mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
- fx = _mm256_sub_ps(tmp, mask);
+ mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
+ fx = _mm256_sub_ps(tmp, mask);
- tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
- bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
- z = _mm256_mul_ps(bVal, bVal);
+ tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
+ bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
+ z = _mm256_mul_ps(bVal, bVal);
- y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
- y = _mm256_fmadd_ps(y, bVal, exp_p2);
- y = _mm256_fmadd_ps(y, bVal, exp_p3);
- y = _mm256_fmadd_ps(y, bVal, exp_p4);
- y = _mm256_fmadd_ps(y, bVal, exp_p5);
- y = _mm256_fmadd_ps(y, z, bVal);
- y = _mm256_add_ps(y, one);
+ y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
+ y = _mm256_fmadd_ps(y, bVal, exp_p2);
+ y = _mm256_fmadd_ps(y, bVal, exp_p3);
+ y = _mm256_fmadd_ps(y, bVal, exp_p4);
+ y = _mm256_fmadd_ps(y, bVal, exp_p5);
+ y = _mm256_fmadd_ps(y, z, bVal);
+ y = _mm256_add_ps(y, one);
- emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
+ emm0 =
+ _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
pow2n = _mm256_castsi256_ps(emm0);
cVal = _mm256_mul_ps(y, pow2n);
aPtr += 8;
bPtr += 8;
cPtr += 8;
- }
+ }
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *cPtr++ = pow(*aPtr++, *bPtr++);
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = pow(*aPtr++, *bPtr++);
+ }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
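The exp(b * ln(a)) half of these kernels uses a Cephes-style range reduction: n = floor(x * log2(e) + 0.5) is peeled off, the remainder r = x - n * ln(2) (with ln(2) split into two constants for accuracy) is fed to a degree-5 polynomial, and 2^n is rebuilt by writing n + 127 into the IEEE-754 exponent field. A scalar sketch of that step with the same constants; the helper name exp_approx_scalar is illustrative only and not taken from the patch.

#include <stdint.h>
#include <string.h>

static inline float exp_approx_scalar(float x)
{
    /* Clamp as the kernels do, so the rebuilt exponent below cannot overflow */
    if (x > 88.3762626647949f)
        x = 88.3762626647949f;
    if (x < -88.3762626647949f)
        x = -88.3762626647949f;

    float fx = x * 1.44269504088896341f + 0.5f; /* x * log2(e), biased for rounding */
    int n = (int)fx;
    if ((float)n > fx)
        n -= 1; /* truncate-then-correct == floor(), matching the cmp/sub mask trick */

    float r = x - (float)n * 0.693359375f; /* subtract n * ln(2), split in two parts */
    r = r - (float)n * -2.12194440e-4f;
    float z = r * r;

    float y = 1.9875691500e-4f; /* degree-5 polynomial for exp(r) */
    y = y * r + 1.3981999507e-3f;
    y = y * r + 8.3334519073e-3f;
    y = y * r + 4.1665795894e-2f;
    y = y * r + 1.6666665459e-1f;
    y = y * r + 5.0000001201e-1f;
    y = y * z + r + 1.0f;

    uint32_t bits = (uint32_t)(n + 127) << 23; /* 2^n via the exponent field */
    float pow2n;
    memcpy(&pow2n, &bits, sizeof(pow2n));
    return y * pow2n;
}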
#include <immintrin.h>
#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
-#define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
-#define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
-#define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
-#define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
-#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
-
-static inline void
-volk_32f_x2_pow_32f_u_avx2(float* cVector, const float* bVector,
- const float* aVector, unsigned int num_points)
+#define POLY1_AVX2(x, c0, c1) \
+ _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
+#define POLY2_AVX2(x, c0, c1, c2) \
+ _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
+#define POLY3_AVX2(x, c0, c1, c2, c3) \
+ _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
+#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
+ _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
+#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
+ _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
+
+static inline void volk_32f_x2_pow_32f_u_avx2(float* cVector,
+ const float* bVector,
+ const float* aVector,
+ unsigned int num_points)
{
- float* cPtr = cVector;
- const float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
- __m256 tmp, fx, mask, pow2n, z, y;
- __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
- __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
- __m256i bias, exp, emm0, pi32_0x7f;
-
- one = _mm256_set1_ps(1.0);
- exp_hi = _mm256_set1_ps(88.3762626647949);
- exp_lo = _mm256_set1_ps(-88.3762626647949);
- ln2 = _mm256_set1_ps(0.6931471805);
- log2EF = _mm256_set1_ps(1.44269504088896341);
- half = _mm256_set1_ps(0.5);
- exp_C1 = _mm256_set1_ps(0.693359375);
- exp_C2 = _mm256_set1_ps(-2.12194440e-4);
- pi32_0x7f = _mm256_set1_epi32(0x7f);
-
- exp_p0 = _mm256_set1_ps(1.9875691500e-4);
- exp_p1 = _mm256_set1_ps(1.3981999507e-3);
- exp_p2 = _mm256_set1_ps(8.3334519073e-3);
- exp_p3 = _mm256_set1_ps(4.1665795894e-2);
- exp_p4 = _mm256_set1_ps(1.6666665459e-1);
- exp_p5 = _mm256_set1_ps(5.0000001201e-1);
-
- for(;number < eighthPoints; number++){
- // First compute the logarithm
- aVal = _mm256_loadu_ps(aPtr);
- bias = _mm256_set1_epi32(127);
- leadingOne = _mm256_set1_ps(1.0f);
- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
- logarithm = _mm256_cvtepi32_ps(exp);
-
- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
+ float* cPtr = cVector;
+ const float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+ __m256 tmp, fx, mask, pow2n, z, y;
+ __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+ __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+ __m256i bias, exp, emm0, pi32_0x7f;
+
+ one = _mm256_set1_ps(1.0);
+ exp_hi = _mm256_set1_ps(88.3762626647949);
+ exp_lo = _mm256_set1_ps(-88.3762626647949);
+ ln2 = _mm256_set1_ps(0.6931471805);
+ log2EF = _mm256_set1_ps(1.44269504088896341);
+ half = _mm256_set1_ps(0.5);
+ exp_C1 = _mm256_set1_ps(0.693359375);
+ exp_C2 = _mm256_set1_ps(-2.12194440e-4);
+ pi32_0x7f = _mm256_set1_epi32(0x7f);
+
+ exp_p0 = _mm256_set1_ps(1.9875691500e-4);
+ exp_p1 = _mm256_set1_ps(1.3981999507e-3);
+ exp_p2 = _mm256_set1_ps(8.3334519073e-3);
+ exp_p3 = _mm256_set1_ps(4.1665795894e-2);
+ exp_p4 = _mm256_set1_ps(1.6666665459e-1);
+ exp_p5 = _mm256_set1_ps(5.0000001201e-1);
+
+ for (; number < eighthPoints; number++) {
+ // First compute the logarithm
+ aVal = _mm256_loadu_ps(aPtr);
+ bias = _mm256_set1_epi32(127);
+ leadingOne = _mm256_set1_ps(1.0f);
+ exp = _mm256_sub_epi32(
+ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
+ _mm256_set1_epi32(0x7f800000)),
+ 23),
+ bias);
+ logarithm = _mm256_cvtepi32_ps(exp);
+
+ frac = _mm256_or_ps(
+ leadingOne,
+ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
#if POW_POLY_DEGREE == 6
- mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
+ mantissa = POLY5_AVX2(frac,
+ 3.1157899f,
+ -3.3241990f,
+ 2.5988452f,
+ -1.2315303f,
+ 3.1821337e-1f,
+ -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
- mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+ mantissa = POLY4_AVX2(frac,
+ 2.8882704548164776201f,
+ -2.52074962577807006663f,
+ 1.48116647521213171641f,
+ -0.465725644288844778798f,
+ 0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
- mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+ mantissa = POLY3_AVX2(frac,
+ 2.61761038894603480148f,
+ -1.75647175389045657003f,
+ 0.688243882994381274313f,
+ -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
- mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+ mantissa = POLY2_AVX2(frac,
+ 2.28330284476918490682f,
+ -1.04913055217340124191f,
+ 0.204446009836232697516f);
#else
#error
#endif
- logarithm = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
- logarithm = _mm256_mul_ps(logarithm, ln2);
+ logarithm = _mm256_add_ps(
+ _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
+ logarithm = _mm256_mul_ps(logarithm, ln2);
- // Now calculate b*lna
- bVal = _mm256_loadu_ps(bPtr);
- bVal = _mm256_mul_ps(bVal, logarithm);
+ // Now calculate b*lna
+ bVal = _mm256_loadu_ps(bPtr);
+ bVal = _mm256_mul_ps(bVal, logarithm);
- // Now compute exp(b*lna)
- bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
+ // Now compute exp(b*lna)
+ bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
- fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
+ fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
- emm0 = _mm256_cvttps_epi32(fx);
- tmp = _mm256_cvtepi32_ps(emm0);
+ emm0 = _mm256_cvttps_epi32(fx);
+ tmp = _mm256_cvtepi32_ps(emm0);
- mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
- fx = _mm256_sub_ps(tmp, mask);
+ mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
+ fx = _mm256_sub_ps(tmp, mask);
- tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
- bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
- z = _mm256_mul_ps(bVal, bVal);
+ tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
+ bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
+ z = _mm256_mul_ps(bVal, bVal);
- y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
- y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
- y = _mm256_add_ps(y, one);
+ y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
+ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
+ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
+ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
+ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
+ y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
+ y = _mm256_add_ps(y, one);
- emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
+ emm0 =
+ _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
pow2n = _mm256_castsi256_ps(emm0);
cVal = _mm256_mul_ps(y, pow2n);
aPtr += 8;
bPtr += 8;
cPtr += 8;
- }
+ }
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *cPtr++ = pow(*aPtr++, *bPtr++);
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = pow(*aPtr++, *bPtr++);
+ }
}
#endif /* LV_HAVE_AVX2 for unaligned */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_x2_s32f_interleave_16ic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points)
- * \endcode
+ * void volk_32f_x2_s32f_interleave_16ic(lv_16sc_t* complexVector, const float* iBuffer,
+ * const float* qBuffer, const float scalar, unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li iBuffer: Input vector of samples for the real part.
#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H
#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H
-#include <volk/volk_common.h>
#include <inttypes.h>
#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_32f_x2_s32f_interleave_16ic_a_avx2(lv_16sc_t* complexVector, const float* iBuffer,
- const float* qBuffer, const float scalar, unsigned int num_points)
+static inline void volk_32f_x2_s32f_interleave_16ic_a_avx2(lv_16sc_t* complexVector,
+ const float* iBuffer,
+ const float* qBuffer,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const float* iBufferPtr = iBuffer;
- const float* qBufferPtr = qBuffer;
+ unsigned int number = 0;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
- __m256 vScalar = _mm256_set1_ps(scalar);
+ __m256 vScalar = _mm256_set1_ps(scalar);
- const unsigned int eighthPoints = num_points / 8;
+ const unsigned int eighthPoints = num_points / 8;
- __m256 iValue, qValue, cplxValue1, cplxValue2;
- __m256i intValue1, intValue2;
+ __m256 iValue, qValue, cplxValue1, cplxValue2;
+ __m256i intValue1, intValue2;
- int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
- for(;number < eighthPoints; number++){
- iValue = _mm256_load_ps(iBufferPtr);
- qValue = _mm256_load_ps(qBufferPtr);
+ for (; number < eighthPoints; number++) {
+ iValue = _mm256_load_ps(iBufferPtr);
+ qValue = _mm256_load_ps(qBufferPtr);
- // Interleaves the lower two values in the i and q variables into one buffer
- cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
- cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
+ // Interleaves the lower two values in the i and q variables into one buffer
+ cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
+ cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
- // Interleaves the upper two values in the i and q variables into one buffer
- cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
- cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
+ // Interleaves the upper two values in the i and q variables into one buffer
+ cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
+ cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
- intValue1 = _mm256_cvtps_epi32(cplxValue1);
- intValue2 = _mm256_cvtps_epi32(cplxValue2);
+ intValue1 = _mm256_cvtps_epi32(cplxValue1);
+ intValue2 = _mm256_cvtps_epi32(cplxValue2);
- intValue1 = _mm256_packs_epi32(intValue1, intValue2);
+ intValue1 = _mm256_packs_epi32(intValue1, intValue2);
- _mm256_store_si256((__m256i*)complexVectorPtr, intValue1);
- complexVectorPtr += 16;
+ _mm256_store_si256((__m256i*)complexVectorPtr, intValue1);
+ complexVectorPtr += 16;
- iBufferPtr += 8;
- qBufferPtr += 8;
- }
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
- number = eighthPoints * 8;
- complexVectorPtr = (int16_t*)(&complexVector[number]);
- for(; number < num_points; number++){
- *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
- *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
- }
+ number = eighthPoints * 8;
+ complexVectorPtr = (int16_t*)(&complexVector[number]);
+ for (; number < num_points; number++) {
+ *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
+ *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void
-volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVector, const float* iBuffer,
- const float* qBuffer, const float scalar, unsigned int num_points)
+static inline void volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVector,
+ const float* iBuffer,
+ const float* qBuffer,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const float* iBufferPtr = iBuffer;
- const float* qBufferPtr = qBuffer;
+ unsigned int number = 0;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
- __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 vScalar = _mm_set_ps1(scalar);
- const unsigned int quarterPoints = num_points / 4;
+ const unsigned int quarterPoints = num_points / 4;
- __m128 iValue, qValue, cplxValue1, cplxValue2;
- __m128i intValue1, intValue2;
+ __m128 iValue, qValue, cplxValue1, cplxValue2;
+ __m128i intValue1, intValue2;
- int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
- for(;number < quarterPoints; number++){
- iValue = _mm_load_ps(iBufferPtr);
- qValue = _mm_load_ps(qBufferPtr);
+ for (; number < quarterPoints; number++) {
+ iValue = _mm_load_ps(iBufferPtr);
+ qValue = _mm_load_ps(qBufferPtr);
- // Interleaves the lower two values in the i and q variables into one buffer
- cplxValue1 = _mm_unpacklo_ps(iValue, qValue);
- cplxValue1 = _mm_mul_ps(cplxValue1, vScalar);
+ // Interleaves the lower two values in the i and q variables into one buffer
+ cplxValue1 = _mm_unpacklo_ps(iValue, qValue);
+ cplxValue1 = _mm_mul_ps(cplxValue1, vScalar);
- // Interleaves the upper two values in the i and q variables into one buffer
- cplxValue2 = _mm_unpackhi_ps(iValue, qValue);
- cplxValue2 = _mm_mul_ps(cplxValue2, vScalar);
+ // Interleaves the upper two values in the i and q variables into one buffer
+ cplxValue2 = _mm_unpackhi_ps(iValue, qValue);
+ cplxValue2 = _mm_mul_ps(cplxValue2, vScalar);
- intValue1 = _mm_cvtps_epi32(cplxValue1);
- intValue2 = _mm_cvtps_epi32(cplxValue2);
+ intValue1 = _mm_cvtps_epi32(cplxValue1);
+ intValue2 = _mm_cvtps_epi32(cplxValue2);
- intValue1 = _mm_packs_epi32(intValue1, intValue2);
+ intValue1 = _mm_packs_epi32(intValue1, intValue2);
- _mm_store_si128((__m128i*)complexVectorPtr, intValue1);
- complexVectorPtr += 8;
+ _mm_store_si128((__m128i*)complexVectorPtr, intValue1);
+ complexVectorPtr += 8;
- iBufferPtr += 4;
- qBufferPtr += 4;
- }
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
- number = quarterPoints * 4;
- complexVectorPtr = (int16_t*)(&complexVector[number]);
- for(; number < num_points; number++){
- *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
- *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
- }
+ number = quarterPoints * 4;
+ complexVectorPtr = (int16_t*)(&complexVector[number]);
+ for (; number < num_points; number++) {
+ *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
+ *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
+ }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVector, const float* iBuffer,
- const float* qBuffer, const float scalar, unsigned int num_points)
+static inline void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVector,
+ const float* iBuffer,
+ const float* qBuffer,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const float* iBufferPtr = iBuffer;
- const float* qBufferPtr = qBuffer;
+ unsigned int number = 0;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
- __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 vScalar = _mm_set_ps1(scalar);
- const unsigned int quarterPoints = num_points / 4;
+ const unsigned int quarterPoints = num_points / 4;
- __m128 iValue, qValue, cplxValue;
+ __m128 iValue, qValue, cplxValue;
- int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
- __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
- for(;number < quarterPoints; number++){
- iValue = _mm_load_ps(iBufferPtr);
- qValue = _mm_load_ps(qBufferPtr);
+ for (; number < quarterPoints; number++) {
+ iValue = _mm_load_ps(iBufferPtr);
+ qValue = _mm_load_ps(qBufferPtr);
- // Interleaves the lower two values in the i and q variables into one buffer
- cplxValue = _mm_unpacklo_ps(iValue, qValue);
- cplxValue = _mm_mul_ps(cplxValue, vScalar);
+ // Interleaves the lower two values in the i and q variables into one buffer
+ cplxValue = _mm_unpacklo_ps(iValue, qValue);
+ cplxValue = _mm_mul_ps(cplxValue, vScalar);
- _mm_store_ps(floatBuffer, cplxValue);
+ _mm_store_ps(floatBuffer, cplxValue);
- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
+ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
+ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
+ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
+ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
- // Interleaves the upper two values in the i and q variables into one buffer
- cplxValue = _mm_unpackhi_ps(iValue, qValue);
- cplxValue = _mm_mul_ps(cplxValue, vScalar);
+ // Interleaves the upper two values in the i and q variables into one buffer
+ cplxValue = _mm_unpackhi_ps(iValue, qValue);
+ cplxValue = _mm_mul_ps(cplxValue, vScalar);
- _mm_store_ps(floatBuffer, cplxValue);
+ _mm_store_ps(floatBuffer, cplxValue);
- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
+ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
+ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
+ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
+ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
- iBufferPtr += 4;
- qBufferPtr += 4;
- }
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
- number = quarterPoints * 4;
- complexVectorPtr = (int16_t*)(&complexVector[number]);
- for(; number < num_points; number++){
- *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
- *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
- }
+ number = quarterPoints * 4;
+ complexVectorPtr = (int16_t*)(&complexVector[number]);
+ for (; number < num_points; number++) {
+ *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
+ *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector, const float* iBuffer,
- const float* qBuffer, const float scalar, unsigned int num_points)
+static inline void volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector,
+ const float* iBuffer,
+ const float* qBuffer,
+ const float scalar,
+ unsigned int num_points)
{
- int16_t* complexVectorPtr = (int16_t*)complexVector;
- const float* iBufferPtr = iBuffer;
- const float* qBufferPtr = qBuffer;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
- *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
- }
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
+ *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
+ }
}
#endif /* LV_HAVE_GENERIC */
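Each interleave_16ic variant implements the same mapping: every float I/Q pair is scaled by scalar, rounded with rintf(), and stored as two consecutive int16_t values. A minimal usage sketch against the generic kernel above; the wrapper name and the example values are illustrative only.

static void example_interleave_usage(void)
{
    float inphase[4] = { 0.5f, -0.25f, 1.0f, 0.0f };
    float quadrature[4] = { 0.0f, 0.75f, -1.0f, 0.125f };
    lv_16sc_t iq[4]; /* interleaved output: I0, Q0, I1, Q1, ... */

    /* Scale into the int16 range before packing, e.g. by 32000 */
    volk_32f_x2_s32f_interleave_16ic_generic(iq, inphase, quadrature, 32000.0f, 4);
    /* ((int16_t*)iq)[0] == 16000 and ((int16_t*)iq)[1] == 0 */
}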
#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H
#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H
-#include <volk/volk_common.h>
#include <inttypes.h>
#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_32f_x2_s32f_interleave_16ic_u_avx2(lv_16sc_t* complexVector, const float* iBuffer,
- const float* qBuffer, const float scalar, unsigned int num_points)
+static inline void volk_32f_x2_s32f_interleave_16ic_u_avx2(lv_16sc_t* complexVector,
+ const float* iBuffer,
+ const float* qBuffer,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const float* iBufferPtr = iBuffer;
- const float* qBufferPtr = qBuffer;
+ unsigned int number = 0;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
- __m256 vScalar = _mm256_set1_ps(scalar);
+ __m256 vScalar = _mm256_set1_ps(scalar);
- const unsigned int eighthPoints = num_points / 8;
+ const unsigned int eighthPoints = num_points / 8;
- __m256 iValue, qValue, cplxValue1, cplxValue2;
- __m256i intValue1, intValue2;
+ __m256 iValue, qValue, cplxValue1, cplxValue2;
+ __m256i intValue1, intValue2;
- int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
- for(;number < eighthPoints; number++){
- iValue = _mm256_loadu_ps(iBufferPtr);
- qValue = _mm256_loadu_ps(qBufferPtr);
+ for (; number < eighthPoints; number++) {
+ iValue = _mm256_loadu_ps(iBufferPtr);
+ qValue = _mm256_loadu_ps(qBufferPtr);
- // Interleaves the lower two values in the i and q variables into one buffer
- cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
- cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
+ // Interleaves the lower two values in the i and q variables into one buffer
+ cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
+ cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
- // Interleaves the upper two values in the i and q variables into one buffer
- cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
- cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
+ // Interleaves the upper two values in the i and q variables into one buffer
+ cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
+ cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
- intValue1 = _mm256_cvtps_epi32(cplxValue1);
- intValue2 = _mm256_cvtps_epi32(cplxValue2);
+ intValue1 = _mm256_cvtps_epi32(cplxValue1);
+ intValue2 = _mm256_cvtps_epi32(cplxValue2);
- intValue1 = _mm256_packs_epi32(intValue1, intValue2);
+ intValue1 = _mm256_packs_epi32(intValue1, intValue2);
- _mm256_storeu_si256((__m256i*)complexVectorPtr, intValue1);
- complexVectorPtr += 16;
+ _mm256_storeu_si256((__m256i*)complexVectorPtr, intValue1);
+ complexVectorPtr += 16;
- iBufferPtr += 8;
- qBufferPtr += 8;
- }
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
- number = eighthPoints * 8;
- complexVectorPtr = (int16_t*)(&complexVector[number]);
- for(; number < num_points; number++){
- *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
- *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
- }
+ number = eighthPoints * 8;
+ complexVectorPtr = (int16_t*)(&complexVector[number]);
+ for (; number < num_points; number++) {
+ *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
+ *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
+ }
}
#endif /* LV_HAVE_AVX2 */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_x2_subtract_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
- * \endcode
+ * void volk_32f_x2_subtract_32f(float* cVector, const float* aVector, const float*
+ * bVector, unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li aVector: The initial vector.
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
-static inline void
-volk_32f_x2_subtract_32f_a_avx512f(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_subtract_32f_a_avx512f(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr = bVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
- __m512 aVal, bVal, cVal;
- for(;number < sixteenthPoints; number++){
+ __m512 aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
- aVal = _mm512_load_ps(aPtr);
- bVal = _mm512_load_ps(bPtr);
+ aVal = _mm512_load_ps(aPtr);
+ bVal = _mm512_load_ps(bPtr);
- cVal = _mm512_sub_ps(aVal, bVal);
+ cVal = _mm512_sub_ps(aVal, bVal);
- _mm512_store_ps(cPtr,cVal); // Store the results back into the C container
+ _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 16;
- bPtr += 16;
- cPtr += 16;
- }
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
- number = sixteenthPoints *16;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) - (*bPtr++);
- }
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) - (*bPtr++);
+ }
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32f_x2_subtract_32f_a_avx(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_subtract_32f_a_avx(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr = bVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
- __m256 aVal, bVal, cVal;
- for(;number < eighthPoints; number++){
+ __m256 aVal, bVal, cVal;
+ for (; number < eighthPoints; number++) {
- aVal = _mm256_load_ps(aPtr);
- bVal = _mm256_load_ps(bPtr);
+ aVal = _mm256_load_ps(aPtr);
+ bVal = _mm256_load_ps(bPtr);
- cVal = _mm256_sub_ps(aVal, bVal);
+ cVal = _mm256_sub_ps(aVal, bVal);
- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container
+ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 8;
- bPtr += 8;
- cPtr += 8;
- }
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) - (*bPtr++);
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) - (*bPtr++);
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_32f_x2_subtract_32f_a_sse(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_subtract_32f_a_sse(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr = bVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
- __m128 aVal, bVal, cVal;
- for(;number < quarterPoints; number++){
+ __m128 aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
- aVal = _mm_load_ps(aPtr);
- bVal = _mm_load_ps(bPtr);
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
- cVal = _mm_sub_ps(aVal, bVal);
+ cVal = _mm_sub_ps(aVal, bVal);
- _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) - (*bPtr++);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) - (*bPtr++);
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_x2_subtract_32f_generic(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_subtract_32f_generic(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr = bVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *cPtr++ = (*aPtr++) - (*bPtr++);
- }
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) - (*bPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
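As a point of reference for the dispatcher prototype documented above, a minimal caller for volk_32f_x2_subtract_32f could look like the sketch below. This is a sketch only, assuming the dispatcher is exposed through volk.h under the documented name and that the volk_malloc/volk_get_alignment/volk_free helpers are available for aligned buffers.

#include <volk/volk.h>
#include <stdio.h>

/* Sketch: c[i] = a[i] - b[i] via the volk_32f_x2_subtract_32f dispatcher. */
int main(void)
{
    unsigned int N = 16;
    float* a = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
    float* b = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
    float* c = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());

    for (unsigned int i = 0; i < N; i++) {
        a[i] = (float)i;        // 0, 1, 2, ...
        b[i] = 0.5f * (float)i; // half of a[i]
    }

    volk_32f_x2_subtract_32f(c, a, b, N);

    for (unsigned int i = 0; i < N; i++)
        printf("c[%u] = %f\n", i, c[i]);

    volk_free(a);
    volk_free(b);
    volk_free(c);
    return 0;
}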
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_32f_x2_subtract_32f_neon(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_subtract_32f_neon(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr = bVector;
- unsigned int number = 0;
- unsigned int quarter_points = num_points / 4;
-
- float32x4_t a_vec, b_vec, c_vec;
-
- for(number = 0; number < quarter_points; number++){
- a_vec = vld1q_f32(aPtr);
- b_vec = vld1q_f32(bPtr);
- c_vec = vsubq_f32(a_vec, b_vec);
- vst1q_f32(cPtr, c_vec);
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
-
- for(number = quarter_points * 4; number < num_points; number++){
- *cPtr++ = (*aPtr++) - (*bPtr++);
- }
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+
+ float32x4_t a_vec, b_vec, c_vec;
+
+ for (number = 0; number < quarter_points; number++) {
+ a_vec = vld1q_f32(aPtr);
+ b_vec = vld1q_f32(bPtr);
+ c_vec = vsubq_f32(a_vec, b_vec);
+ vst1q_f32(cPtr, c_vec);
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) - (*bPtr++);
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_ORC
-extern void
-volk_32f_x2_subtract_32f_a_orc_impl(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points);
-
-static inline void
-volk_32f_x2_subtract_32f_u_orc(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+extern void volk_32f_x2_subtract_32f_a_orc_impl(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points);
+
+static inline void volk_32f_x2_subtract_32f_u_orc(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- volk_32f_x2_subtract_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+ volk_32f_x2_subtract_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
-static inline void
-volk_32f_x2_subtract_32f_u_avx512f(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_subtract_32f_u_avx512f(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr = bVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
- __m512 aVal, bVal, cVal;
- for(;number < sixteenthPoints; number++){
+ __m512 aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
- aVal = _mm512_loadu_ps(aPtr);
- bVal = _mm512_loadu_ps(bPtr);
+ aVal = _mm512_loadu_ps(aPtr);
+ bVal = _mm512_loadu_ps(bPtr);
- cVal = _mm512_sub_ps(aVal, bVal);
+ cVal = _mm512_sub_ps(aVal, bVal);
- _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container
+ _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 16;
- bPtr += 16;
- cPtr += 16;
- }
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
- number = sixteenthPoints *16;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) - (*bPtr++);
- }
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) - (*bPtr++);
+ }
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32f_x2_subtract_32f_u_avx(float* cVector, const float* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32f_x2_subtract_32f_u_avx(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr = bVector;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
- __m256 aVal, bVal, cVal;
- for(;number < eighthPoints; number++){
+ __m256 aVal, bVal, cVal;
+ for (; number < eighthPoints; number++) {
- aVal = _mm256_loadu_ps(aPtr);
- bVal = _mm256_loadu_ps(bPtr);
+ aVal = _mm256_loadu_ps(aPtr);
+ bVal = _mm256_loadu_ps(bPtr);
- cVal = _mm256_sub_ps(aVal, bVal);
+ cVal = _mm256_sub_ps(aVal, bVal);
- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
+ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 8;
- bPtr += 8;
- cPtr += 8;
- }
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) - (*bPtr++);
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) - (*bPtr++);
+ }
}
#endif /* LV_HAVE_AVX */
* multiply by the rectangle/bin width.
*
* Expressed as a formula, this function calculates
- * \f$ \sum f(x) = \sum (c_0 + c_1 \cdot x + c_2 \cdot x^2 + c_3 \cdot x^3 + c_4 \cdot x^4)\f$
+ * \f$ \sum f(x) = \sum (c_0 + c_1 \cdot x + c_2 \cdot x^2 + c_3 \cdot x^3 + c_4 \cdot
+ * x^4)\f$
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_x3_sum_of_poly_32f(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points)
- * \endcode
+ * void volk_32f_x3_sum_of_poly_32f(float* target, float* src0, float* center_point_array,
+ * float* cutoff, unsigned int num_points) \endcode
*
* \b Inputs
* \li src0: x values
* \code
* int npoints = 4096;
* float* coefficients = (float*)volk_malloc(sizeof(float) * 5, volk_get_alignment());
- * float* input = (float*)volk_malloc(sizeof(float) * npoints, volk_get_alignment());
- * float* result = (float*)volk_malloc(sizeof(float), volk_get_alignment());
- * float* cutoff = (float*)volk_malloc(sizeof(float), volk_get_alignment());
+ * float* input = (float*)volk_malloc(sizeof(float) * npoints,
+ * volk_get_alignment()); float* result = (float*)volk_malloc(sizeof(float),
+ * volk_get_alignment()); float* cutoff = (float*)volk_malloc(sizeof(float),
+ * volk_get_alignment());
* // load precomputed Taylor series coefficients
* coefficients[0] = 4.48168907033806f; // c1
* coefficients[1] = coefficients[0] * 0.5f; // c2
#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
#define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
-#include<inttypes.h>
-#include<stdio.h>
-#include<volk/volk_complex.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
#ifndef MAX
-#define MAX(X,Y) ((X) > (Y)?(X):(Y))
+#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
#endif
#ifdef LV_HAVE_SSE3
-#include<xmmintrin.h>
-#include<pmmintrin.h>
-
-static inline void
-volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array,
- float* cutoff, unsigned int num_points)
+#include <pmmintrin.h>
+#include <xmmintrin.h>
+
+static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target,
+ float* src0,
+ float* center_point_array,
+ float* cutoff,
+ unsigned int num_points)
{
- float result = 0.0f;
- float fst = 0.0f;
- float sq = 0.0f;
- float thrd = 0.0f;
- float frth = 0.0f;
-
- __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
-
- xmm9 = _mm_setzero_ps();
- xmm1 = _mm_setzero_ps();
-  xmm0 = _mm_load1_ps(&center_point_array[0]);
-  xmm6 = _mm_load1_ps(&center_point_array[1]);
-  xmm7 = _mm_load1_ps(&center_point_array[2]);
-  xmm8 = _mm_load1_ps(&center_point_array[3]);
- xmm10 = _mm_load1_ps(cutoff);
-
- int bound = num_points/8;
- int leftovers = num_points - 8*bound;
- int i = 0;
- for(; i < bound; ++i) {
- // 1st
- xmm2 = _mm_load_ps(src0);
- xmm2 = _mm_max_ps(xmm10, xmm2);
- xmm3 = _mm_mul_ps(xmm2, xmm2);
- xmm4 = _mm_mul_ps(xmm2, xmm3);
- xmm5 = _mm_mul_ps(xmm3, xmm3);
-
- xmm2 = _mm_mul_ps(xmm2, xmm0);
- xmm3 = _mm_mul_ps(xmm3, xmm6);
- xmm4 = _mm_mul_ps(xmm4, xmm7);
- xmm5 = _mm_mul_ps(xmm5, xmm8);
-
- xmm2 = _mm_add_ps(xmm2, xmm3);
- xmm3 = _mm_add_ps(xmm4, xmm5);
-
- src0 += 4;
-
- xmm9 = _mm_add_ps(xmm2, xmm9);
- xmm9 = _mm_add_ps(xmm3, xmm9);
-
- // 2nd
- xmm2 = _mm_load_ps(src0);
- xmm2 = _mm_max_ps(xmm10, xmm2);
- xmm3 = _mm_mul_ps(xmm2, xmm2);
- xmm4 = _mm_mul_ps(xmm2, xmm3);
- xmm5 = _mm_mul_ps(xmm3, xmm3);
-
- xmm2 = _mm_mul_ps(xmm2, xmm0);
- xmm3 = _mm_mul_ps(xmm3, xmm6);
- xmm4 = _mm_mul_ps(xmm4, xmm7);
- xmm5 = _mm_mul_ps(xmm5, xmm8);
-
- xmm2 = _mm_add_ps(xmm2, xmm3);
- xmm3 = _mm_add_ps(xmm4, xmm5);
-
- src0 += 4;
-
- xmm1 = _mm_add_ps(xmm2, xmm1);
- xmm1 = _mm_add_ps(xmm3, xmm1);
- }
- xmm2 = _mm_hadd_ps(xmm9, xmm1);
- xmm3 = _mm_hadd_ps(xmm2, xmm2);
- xmm4 = _mm_hadd_ps(xmm3, xmm3);
- _mm_store_ss(&result, xmm4);
-
- for(i = 0; i < leftovers; ++i) {
- fst = *src0++;
- fst = MAX(fst, *cutoff);
- sq = fst * fst;
- thrd = fst * sq;
- frth = sq * sq;
- result += (center_point_array[0] * fst +
- center_point_array[1] * sq +
- center_point_array[2] * thrd +
- center_point_array[3] * frth);
- }
-
- result += (float)(num_points) * center_point_array[4];
- *target = result;
+ float result = 0.0f;
+ float fst = 0.0f;
+ float sq = 0.0f;
+ float thrd = 0.0f;
+ float frth = 0.0f;
+
+ __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
+
+ xmm9 = _mm_setzero_ps();
+ xmm1 = _mm_setzero_ps();
+    xmm0 = _mm_load1_ps(&center_point_array[0]);
+    xmm6 = _mm_load1_ps(&center_point_array[1]);
+    xmm7 = _mm_load1_ps(&center_point_array[2]);
+    xmm8 = _mm_load1_ps(&center_point_array[3]);
+ xmm10 = _mm_load1_ps(cutoff);
+
+ int bound = num_points / 8;
+ int leftovers = num_points - 8 * bound;
+ int i = 0;
+ for (; i < bound; ++i) {
+ // 1st
+ xmm2 = _mm_load_ps(src0);
+ xmm2 = _mm_max_ps(xmm10, xmm2);
+ xmm3 = _mm_mul_ps(xmm2, xmm2);
+ xmm4 = _mm_mul_ps(xmm2, xmm3);
+ xmm5 = _mm_mul_ps(xmm3, xmm3);
+
+ xmm2 = _mm_mul_ps(xmm2, xmm0);
+ xmm3 = _mm_mul_ps(xmm3, xmm6);
+ xmm4 = _mm_mul_ps(xmm4, xmm7);
+ xmm5 = _mm_mul_ps(xmm5, xmm8);
+
+ xmm2 = _mm_add_ps(xmm2, xmm3);
+ xmm3 = _mm_add_ps(xmm4, xmm5);
+
+ src0 += 4;
+
+ xmm9 = _mm_add_ps(xmm2, xmm9);
+ xmm9 = _mm_add_ps(xmm3, xmm9);
+
+ // 2nd
+ xmm2 = _mm_load_ps(src0);
+ xmm2 = _mm_max_ps(xmm10, xmm2);
+ xmm3 = _mm_mul_ps(xmm2, xmm2);
+ xmm4 = _mm_mul_ps(xmm2, xmm3);
+ xmm5 = _mm_mul_ps(xmm3, xmm3);
+
+ xmm2 = _mm_mul_ps(xmm2, xmm0);
+ xmm3 = _mm_mul_ps(xmm3, xmm6);
+ xmm4 = _mm_mul_ps(xmm4, xmm7);
+ xmm5 = _mm_mul_ps(xmm5, xmm8);
+
+ xmm2 = _mm_add_ps(xmm2, xmm3);
+ xmm3 = _mm_add_ps(xmm4, xmm5);
+
+ src0 += 4;
+
+ xmm1 = _mm_add_ps(xmm2, xmm1);
+ xmm1 = _mm_add_ps(xmm3, xmm1);
+ }
+ xmm2 = _mm_hadd_ps(xmm9, xmm1);
+ xmm3 = _mm_hadd_ps(xmm2, xmm2);
+ xmm4 = _mm_hadd_ps(xmm3, xmm3);
+ _mm_store_ss(&result, xmm4);
+
+ for (i = 0; i < leftovers; ++i) {
+ fst = *src0++;
+ fst = MAX(fst, *cutoff);
+ sq = fst * fst;
+ thrd = fst * sq;
+ frth = sq * sq;
+ result += (center_point_array[0] * fst + center_point_array[1] * sq +
+ center_point_array[2] * thrd + center_point_array[3] * frth);
+ }
+
+ result += (float)(num_points)*center_point_array[4];
+ *target = result;
}
#endif /*LV_HAVE_SSE3*/
#if LV_HAVE_AVX && LV_HAVE_FMA
-#include<immintrin.h>
+#include <immintrin.h>
-static inline void
-volk_32f_x3_sum_of_poly_32f_a_avx2_fma(float* target, float* src0, float* center_point_array,
- float* cutoff, unsigned int num_points)
+static inline void volk_32f_x3_sum_of_poly_32f_a_avx2_fma(float* target,
+ float* src0,
+ float* center_point_array,
+ float* cutoff,
+ unsigned int num_points)
{
- const unsigned int eighth_points = num_points / 8;
- float fst = 0.0;
- float sq = 0.0;
- float thrd = 0.0;
- float frth = 0.0;
-
- __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
- __m256 target_vec;
- __m256 x_to_1, x_to_2, x_to_3, x_to_4;
-
- cpa0 = _mm256_set1_ps(center_point_array[0]);
- cpa1 = _mm256_set1_ps(center_point_array[1]);
- cpa2 = _mm256_set1_ps(center_point_array[2]);
- cpa3 = _mm256_set1_ps(center_point_array[3]);
- cutoff_vec = _mm256_set1_ps(*cutoff);
- target_vec = _mm256_setzero_ps();
-
- unsigned int i;
-
- for(i = 0; i < eighth_points; ++i) {
- x_to_1 = _mm256_load_ps(src0);
- x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
- x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
- x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
- // x^1 * x^3 is slightly faster than x^2 * x^2
- x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
-
- x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
- x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
-
- x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);
- x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);
- // this is slightly faster than result += (x_to_1 + x_to_3)
- target_vec = _mm256_add_ps(x_to_1, target_vec);
- target_vec = _mm256_add_ps(x_to_3, target_vec);
-
- src0 += 8;
- }
-
- // the hadd for vector reduction has very very slight impact @ 50k iters
- __VOLK_ATTR_ALIGNED(32) float temp_results[8];
- target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
- _mm256_store_ps(temp_results, target_vec);
- *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
-
- for(i = eighth_points*8; i < num_points; ++i) {
- fst = *src0++;
- fst = MAX(fst, *cutoff);
- sq = fst * fst;
- thrd = fst * sq;
- frth = sq * sq;
- *target += (center_point_array[0] * fst +
- center_point_array[1] * sq +
- center_point_array[2] * thrd +
- center_point_array[3] * frth);
- }
- *target += (float)(num_points) * center_point_array[4];
+ const unsigned int eighth_points = num_points / 8;
+ float fst = 0.0;
+ float sq = 0.0;
+ float thrd = 0.0;
+ float frth = 0.0;
+
+ __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
+ __m256 target_vec;
+ __m256 x_to_1, x_to_2, x_to_3, x_to_4;
+
+ cpa0 = _mm256_set1_ps(center_point_array[0]);
+ cpa1 = _mm256_set1_ps(center_point_array[1]);
+ cpa2 = _mm256_set1_ps(center_point_array[2]);
+ cpa3 = _mm256_set1_ps(center_point_array[3]);
+ cutoff_vec = _mm256_set1_ps(*cutoff);
+ target_vec = _mm256_setzero_ps();
+
+ unsigned int i;
+
+ for (i = 0; i < eighth_points; ++i) {
+ x_to_1 = _mm256_load_ps(src0);
+ x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
+ x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
+ x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
+ // x^1 * x^3 is slightly faster than x^2 * x^2
+ x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
+
+ x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
+ x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
+
+ x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);
+ x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);
+ // this is slightly faster than result += (x_to_1 + x_to_3)
+ target_vec = _mm256_add_ps(x_to_1, target_vec);
+ target_vec = _mm256_add_ps(x_to_3, target_vec);
+
+ src0 += 8;
+ }
+
+ // the hadd for vector reduction has very very slight impact @ 50k iters
+ __VOLK_ATTR_ALIGNED(32) float temp_results[8];
+ target_vec = _mm256_hadd_ps(
+ target_vec,
+ target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
+ _mm256_store_ps(temp_results, target_vec);
+ *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
+
+ for (i = eighth_points * 8; i < num_points; ++i) {
+ fst = *src0++;
+ fst = MAX(fst, *cutoff);
+ sq = fst * fst;
+ thrd = fst * sq;
+ frth = sq * sq;
+ *target += (center_point_array[0] * fst + center_point_array[1] * sq +
+ center_point_array[2] * thrd + center_point_array[3] * frth);
+ }
+ *target += (float)(num_points)*center_point_array[4];
}
#endif // LV_HAVE_AVX && LV_HAVE_FMA
#ifdef LV_HAVE_AVX
-#include<immintrin.h>
+#include <immintrin.h>
-static inline void
-volk_32f_x3_sum_of_poly_32f_a_avx(float* target, float* src0, float* center_point_array,
- float* cutoff, unsigned int num_points)
+static inline void volk_32f_x3_sum_of_poly_32f_a_avx(float* target,
+ float* src0,
+ float* center_point_array,
+ float* cutoff,
+ unsigned int num_points)
{
- const unsigned int eighth_points = num_points / 8;
- float fst = 0.0;
- float sq = 0.0;
- float thrd = 0.0;
- float frth = 0.0;
-
- __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
- __m256 target_vec;
- __m256 x_to_1, x_to_2, x_to_3, x_to_4;
-
- cpa0 = _mm256_set1_ps(center_point_array[0]);
- cpa1 = _mm256_set1_ps(center_point_array[1]);
- cpa2 = _mm256_set1_ps(center_point_array[2]);
- cpa3 = _mm256_set1_ps(center_point_array[3]);
- cutoff_vec = _mm256_set1_ps(*cutoff);
- target_vec = _mm256_setzero_ps();
-
- unsigned int i;
-
- for(i = 0; i < eighth_points; ++i) {
- x_to_1 = _mm256_load_ps(src0);
- x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
- x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
- x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
- // x^1 * x^3 is slightly faster than x^2 * x^2
- x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
-
- x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
- x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
- x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
- x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
-
- x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
- x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
- // this is slightly faster than result += (x_to_1 + x_to_3)
- target_vec = _mm256_add_ps(x_to_1, target_vec);
- target_vec = _mm256_add_ps(x_to_3, target_vec);
-
- src0 += 8;
- }
-
- // the hadd for vector reduction has very very slight impact @ 50k iters
- __VOLK_ATTR_ALIGNED(32) float temp_results[8];
- target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
- _mm256_store_ps(temp_results, target_vec);
- *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
-
- for(i = eighth_points*8; i < num_points; ++i) {
- fst = *src0++;
- fst = MAX(fst, *cutoff);
- sq = fst * fst;
- thrd = fst * sq;
- frth = sq * sq;
- *target += (center_point_array[0] * fst +
- center_point_array[1] * sq +
- center_point_array[2] * thrd +
- center_point_array[3] * frth);
- }
- *target += (float)(num_points) * center_point_array[4];
+ const unsigned int eighth_points = num_points / 8;
+ float fst = 0.0;
+ float sq = 0.0;
+ float thrd = 0.0;
+ float frth = 0.0;
+
+ __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
+ __m256 target_vec;
+ __m256 x_to_1, x_to_2, x_to_3, x_to_4;
+
+ cpa0 = _mm256_set1_ps(center_point_array[0]);
+ cpa1 = _mm256_set1_ps(center_point_array[1]);
+ cpa2 = _mm256_set1_ps(center_point_array[2]);
+ cpa3 = _mm256_set1_ps(center_point_array[3]);
+ cutoff_vec = _mm256_set1_ps(*cutoff);
+ target_vec = _mm256_setzero_ps();
+
+ unsigned int i;
+
+ for (i = 0; i < eighth_points; ++i) {
+ x_to_1 = _mm256_load_ps(src0);
+ x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
+ x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
+ x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
+ // x^1 * x^3 is slightly faster than x^2 * x^2
+ x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
+
+ x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
+ x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
+ x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
+ x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
+
+ x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
+ x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
+ // this is slightly faster than result += (x_to_1 + x_to_3)
+ target_vec = _mm256_add_ps(x_to_1, target_vec);
+ target_vec = _mm256_add_ps(x_to_3, target_vec);
+
+ src0 += 8;
+ }
+
+ // the hadd for vector reduction has very very slight impact @ 50k iters
+ __VOLK_ATTR_ALIGNED(32) float temp_results[8];
+ target_vec = _mm256_hadd_ps(
+ target_vec,
+ target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
+ _mm256_store_ps(temp_results, target_vec);
+ *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
+
+ for (i = eighth_points * 8; i < num_points; ++i) {
+ fst = *src0++;
+ fst = MAX(fst, *cutoff);
+ sq = fst * fst;
+ thrd = fst * sq;
+ frth = sq * sq;
+ *target += (center_point_array[0] * fst + center_point_array[1] * sq +
+ center_point_array[2] * thrd + center_point_array[3] * frth);
+ }
+ *target += (float)(num_points)*center_point_array[4];
}
#endif // LV_HAVE_AVX
-
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_x3_sum_of_poly_32f_generic(float* target, float* src0, float* center_point_array,
- float* cutoff, unsigned int num_points)
+static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target,
+ float* src0,
+ float* center_point_array,
+ float* cutoff,
+ unsigned int num_points)
{
- const unsigned int eighth_points = num_points / 8;
-
- float result[8] = {0.0f,0.0f,0.0f,0.0f, 0.0f,0.0f,0.0f,0.0f};
- float fst = 0.0f;
- float sq = 0.0f;
- float thrd = 0.0f;
- float frth = 0.0f;
-
- unsigned int i = 0;
- unsigned int k = 0;
- for(i = 0; i < eighth_points; ++i) {
- for(k = 0; k < 8; ++k) {
- fst = *src0++;
- fst = MAX(fst, *cutoff);
- sq = fst * fst;
- thrd = fst * sq;
- frth = fst * thrd;
- result[k] += center_point_array[0] * fst + center_point_array[1] * sq;
- result[k] += center_point_array[2] * thrd + center_point_array[3] * frth;
+ const unsigned int eighth_points = num_points / 8;
+
+ float result[8] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
+ float fst = 0.0f;
+ float sq = 0.0f;
+ float thrd = 0.0f;
+ float frth = 0.0f;
+
+ unsigned int i = 0;
+ unsigned int k = 0;
+ for (i = 0; i < eighth_points; ++i) {
+ for (k = 0; k < 8; ++k) {
+ fst = *src0++;
+ fst = MAX(fst, *cutoff);
+ sq = fst * fst;
+ thrd = fst * sq;
+ frth = fst * thrd;
+ result[k] += center_point_array[0] * fst + center_point_array[1] * sq;
+ result[k] += center_point_array[2] * thrd + center_point_array[3] * frth;
+ }
}
- }
- for(k = 0; k < 8; k+=2)
- result[k] = result[k]+result[k+1];
-
- *target = result[0] + result[2] + result[4] + result[6];
-
- for(i = eighth_points*8; i < num_points; ++i) {
- fst = *src0++;
- fst = MAX(fst, *cutoff);
- sq = fst * fst;
- thrd = fst * sq;
- frth = fst * thrd;
- *target += (center_point_array[0] * fst +
- center_point_array[1] * sq +
- center_point_array[2] * thrd +
- center_point_array[3] * frth);
- }
- *target += (float)(num_points) * center_point_array[4];
+ for (k = 0; k < 8; k += 2)
+ result[k] = result[k] + result[k + 1];
+
+ *target = result[0] + result[2] + result[4] + result[6];
+
+ for (i = eighth_points * 8; i < num_points; ++i) {
+ fst = *src0++;
+ fst = MAX(fst, *cutoff);
+ sq = fst * fst;
+ thrd = fst * sq;
+ frth = fst * thrd;
+ *target += (center_point_array[0] * fst + center_point_array[1] * sq +
+ center_point_array[2] * thrd + center_point_array[3] * frth);
+ }
+ *target += (float)(num_points)*center_point_array[4];
}
#endif /*LV_HAVE_GENERIC*/
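The generic kernel above makes the evaluated polynomial explicit: each input is clamped to the cutoff, the four powers are weighted by center_point_array[0..3], and the constant center_point_array[4] is added once per point. A minimal caller sketch follows, assuming the dispatcher name from the documented prototype and the volk_malloc/volk_free helpers; a scalar loop mirroring the generic kernel is included for comparison.

#include <volk/volk.h>
#include <stdio.h>

/* Sketch: sum_i (c1*x_i + c2*x_i^2 + c3*x_i^3 + c4*x_i^4) + N*c5 via the dispatcher. */
int main(void)
{
    unsigned int N = 64;
    float* x = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
    float* coeffs = (float*)volk_malloc(sizeof(float) * 5, volk_get_alignment());
    float* cutoff = (float*)volk_malloc(sizeof(float), volk_get_alignment());
    float* result = (float*)volk_malloc(sizeof(float), volk_get_alignment());

    coeffs[0] = 1.0f;   /* multiplies x   */
    coeffs[1] = 0.5f;   /* multiplies x^2 */
    coeffs[2] = 0.25f;  /* multiplies x^3 */
    coeffs[3] = 0.125f; /* multiplies x^4 */
    coeffs[4] = 2.0f;   /* constant term, added once per point */
    *cutoff = 0.0f;     /* inputs below this value are clamped up to it */

    for (unsigned int i = 0; i < N; i++)
        x[i] = (float)i / (float)N;

    volk_32f_x3_sum_of_poly_32f(result, x, coeffs, cutoff, N);

    /* scalar reference mirroring the generic kernel */
    float ref = 0.0f;
    for (unsigned int i = 0; i < N; i++) {
        float v = x[i] > *cutoff ? x[i] : *cutoff;
        ref += coeffs[0] * v + coeffs[1] * v * v + coeffs[2] * v * v * v +
               coeffs[3] * v * v * v * v;
    }
    ref += (float)N * coeffs[4];

    printf("volk: %f  reference: %f\n", *result, ref);

    volk_free(x);
    volk_free(coeffs);
    volk_free(cutoff);
    volk_free(result);
    return 0;
}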
#include <arm_neon.h>
static inline void
-volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target, float* __restrict src0,
+volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target,
+ float* __restrict src0,
float* __restrict center_point_array,
- float* __restrict cutoff, unsigned int num_points)
+ float* __restrict cutoff,
+ unsigned int num_points)
{
- unsigned int i;
- float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
-
- float32x2_t x_to_1, x_to_2, x_to_3, x_to_4;
- float32x2_t cutoff_vector;
- float32x2x2_t x_low, x_high;
- float32x4_t x_qvector, c_qvector, cpa_qvector;
- float accumulator;
- float res_accumulators[4];
-
- c_qvector = vld1q_f32( zero );
- // load the cutoff in to a vector
- cutoff_vector = vdup_n_f32( *cutoff );
- // ... center point array
- cpa_qvector = vld1q_f32( center_point_array );
-
- for(i=0; i < num_points; ++i) {
- // load x (src0)
- x_to_1 = vdup_n_f32( *src0++ );
-
- // Get a vector of max(src0, cutoff)
- x_to_1 = vmax_f32(x_to_1, cutoff_vector ); // x^1
- x_to_2 = vmul_f32(x_to_1, x_to_1); // x^2
- x_to_3 = vmul_f32(x_to_2, x_to_1); // x^3
- x_to_4 = vmul_f32(x_to_3, x_to_1); // x^4
- // zip up doubles to interleave
- x_low = vzip_f32(x_to_1, x_to_2); // [x^2 | x^1 || x^2 | x^1]
- x_high = vzip_f32(x_to_3, x_to_4); // [x^4 | x^3 || x^4 | x^3]
- // float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
- x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]);
- // now we finally have [x^4 | x^3 | x^2 | x] !
-
- c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector);
-
- }
- // there should be better vector reduction techniques
- vst1q_f32(res_accumulators, c_qvector );
- accumulator = res_accumulators[0] + res_accumulators[1] +
- res_accumulators[2] + res_accumulators[3];
-
- *target = accumulator + (float)num_points * center_point_array[4];
+ unsigned int i;
+ float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
+
+ float32x2_t x_to_1, x_to_2, x_to_3, x_to_4;
+ float32x2_t cutoff_vector;
+ float32x2x2_t x_low, x_high;
+ float32x4_t x_qvector, c_qvector, cpa_qvector;
+ float accumulator;
+ float res_accumulators[4];
+
+ c_qvector = vld1q_f32(zero);
+ // load the cutoff in to a vector
+ cutoff_vector = vdup_n_f32(*cutoff);
+ // ... center point array
+ cpa_qvector = vld1q_f32(center_point_array);
+
+ for (i = 0; i < num_points; ++i) {
+ // load x (src0)
+ x_to_1 = vdup_n_f32(*src0++);
+
+ // Get a vector of max(src0, cutoff)
+ x_to_1 = vmax_f32(x_to_1, cutoff_vector); // x^1
+ x_to_2 = vmul_f32(x_to_1, x_to_1); // x^2
+ x_to_3 = vmul_f32(x_to_2, x_to_1); // x^3
+ x_to_4 = vmul_f32(x_to_3, x_to_1); // x^4
+ // zip up doubles to interleave
+ x_low = vzip_f32(x_to_1, x_to_2); // [x^2 | x^1 || x^2 | x^1]
+ x_high = vzip_f32(x_to_3, x_to_4); // [x^4 | x^3 || x^4 | x^3]
+ // float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
+ x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]);
+ // now we finally have [x^4 | x^3 | x^2 | x] !
+
+ c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector);
+ }
+ // there should be better vector reduction techniques
+ vst1q_f32(res_accumulators, c_qvector);
+ accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] +
+ res_accumulators[3];
+
+ *target = accumulator + (float)num_points * center_point_array[4];
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_NEON
static inline void
-volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target, float* __restrict src0,
+volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target,
+ float* __restrict src0,
float* __restrict center_point_array,
- float* __restrict cutoff, unsigned int num_points)
+ float* __restrict cutoff,
+ unsigned int num_points)
{
- unsigned int i;
- float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
-
- float accumulator;
-
- float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec;
- accumulator1_vec = vld1q_f32(zero);
- accumulator2_vec = vld1q_f32(zero);
- accumulator3_vec = vld1q_f32(zero);
- accumulator4_vec = vld1q_f32(zero);
- float32x4_t x_to_1, x_to_2, x_to_3, x_to_4;
- float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3;
-
- // load the cutoff in to a vector
- cutoff_vector = vdupq_n_f32( *cutoff );
- // ... center point array
- cpa_0 = vdupq_n_f32(center_point_array[0]);
- cpa_1 = vdupq_n_f32(center_point_array[1]);
- cpa_2 = vdupq_n_f32(center_point_array[2]);
- cpa_3 = vdupq_n_f32(center_point_array[3]);
-
- // nathan is not sure why this is slower *and* wrong compared to neonvertfma
- for(i=0; i < num_points/4; ++i) {
- // load x
- x_to_1 = vld1q_f32( src0 );
-
- // Get a vector of max(src0, cutoff)
- x_to_1 = vmaxq_f32(x_to_1, cutoff_vector ); // x^1
- x_to_2 = vmulq_f32(x_to_1, x_to_1); // x^2
- x_to_3 = vmulq_f32(x_to_2, x_to_1); // x^3
- x_to_4 = vmulq_f32(x_to_3, x_to_1); // x^4
- x_to_1 = vmulq_f32(x_to_1, cpa_0);
- x_to_2 = vmulq_f32(x_to_2, cpa_1);
- x_to_3 = vmulq_f32(x_to_3, cpa_2);
- x_to_4 = vmulq_f32(x_to_4, cpa_3);
- accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1);
- accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2);
- accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3);
- accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4);
-
- src0 += 4;
- }
- accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec);
- accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec);
- accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec);
-
- __VOLK_ATTR_ALIGNED(32) float res_accumulators[4];
- vst1q_f32(res_accumulators, accumulator1_vec );
- accumulator = res_accumulators[0] + res_accumulators[1] +
- res_accumulators[2] + res_accumulators[3];
-
- float fst = 0.0;
- float sq = 0.0;
- float thrd = 0.0;
- float frth = 0.0;
-
- for(i = 4*num_points/4; i < num_points; ++i) {
- fst = src0[i];
- fst = MAX(fst, *cutoff);
-
- sq = fst * fst;
- thrd = fst * sq;
- frth = sq * sq;
- //fith = sq * thrd;
-
- accumulator += (center_point_array[0] * fst +
- center_point_array[1] * sq +
- center_point_array[2] * thrd +
- center_point_array[3] * frth); //+
- }
-
- *target = accumulator + (float)num_points * center_point_array[4];
+ unsigned int i;
+ float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
+
+ float accumulator;
+
+ float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec;
+ accumulator1_vec = vld1q_f32(zero);
+ accumulator2_vec = vld1q_f32(zero);
+ accumulator3_vec = vld1q_f32(zero);
+ accumulator4_vec = vld1q_f32(zero);
+ float32x4_t x_to_1, x_to_2, x_to_3, x_to_4;
+ float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3;
+
+ // load the cutoff in to a vector
+ cutoff_vector = vdupq_n_f32(*cutoff);
+ // ... center point array
+ cpa_0 = vdupq_n_f32(center_point_array[0]);
+ cpa_1 = vdupq_n_f32(center_point_array[1]);
+ cpa_2 = vdupq_n_f32(center_point_array[2]);
+ cpa_3 = vdupq_n_f32(center_point_array[3]);
+
+ // nathan is not sure why this is slower *and* wrong compared to neonvertfma
+ for (i = 0; i < num_points / 4; ++i) {
+ // load x
+ x_to_1 = vld1q_f32(src0);
+
+ // Get a vector of max(src0, cutoff)
+ x_to_1 = vmaxq_f32(x_to_1, cutoff_vector); // x^1
+ x_to_2 = vmulq_f32(x_to_1, x_to_1); // x^2
+ x_to_3 = vmulq_f32(x_to_2, x_to_1); // x^3
+ x_to_4 = vmulq_f32(x_to_3, x_to_1); // x^4
+ x_to_1 = vmulq_f32(x_to_1, cpa_0);
+ x_to_2 = vmulq_f32(x_to_2, cpa_1);
+ x_to_3 = vmulq_f32(x_to_3, cpa_2);
+ x_to_4 = vmulq_f32(x_to_4, cpa_3);
+ accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1);
+ accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2);
+ accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3);
+ accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4);
+
+ src0 += 4;
+ }
+ accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec);
+ accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec);
+ accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec);
+
+ __VOLK_ATTR_ALIGNED(32) float res_accumulators[4];
+ vst1q_f32(res_accumulators, accumulator1_vec);
+ accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] +
+ res_accumulators[3];
+
+ float fst = 0.0;
+ float sq = 0.0;
+ float thrd = 0.0;
+ float frth = 0.0;
+
+ for (i = 4 * num_points / 4; i < num_points; ++i) {
+ fst = src0[i];
+ fst = MAX(fst, *cutoff);
+
+ sq = fst * fst;
+ thrd = fst * sq;
+ frth = sq * sq;
+ // fith = sq * thrd;
+
+ accumulator += (center_point_array[0] * fst + center_point_array[1] * sq +
+ center_point_array[2] * thrd + center_point_array[3] * frth); //+
+ }
+
+ *target = accumulator + (float)num_points * center_point_array[4];
}
#endif /* LV_HAVE_NEON */
#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H
#define INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H
-#include<inttypes.h>
-#include<stdio.h>
-#include<volk/volk_complex.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
#ifndef MAX
-#define MAX(X,Y) ((X) > (Y)?(X):(Y))
+#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
#endif
#if LV_HAVE_AVX && LV_HAVE_FMA
-#include<immintrin.h>
+#include <immintrin.h>
-static inline void
-volk_32f_x3_sum_of_poly_32f_u_avx_fma(float* target, float* src0, float* center_point_array,
- float* cutoff, unsigned int num_points)
+static inline void volk_32f_x3_sum_of_poly_32f_u_avx_fma(float* target,
+ float* src0,
+ float* center_point_array,
+ float* cutoff,
+ unsigned int num_points)
{
- const unsigned int eighth_points = num_points / 8;
- float fst = 0.0;
- float sq = 0.0;
- float thrd = 0.0;
- float frth = 0.0;
-
- __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
- __m256 target_vec;
- __m256 x_to_1, x_to_2, x_to_3, x_to_4;
-
- cpa0 = _mm256_set1_ps(center_point_array[0]);
- cpa1 = _mm256_set1_ps(center_point_array[1]);
- cpa2 = _mm256_set1_ps(center_point_array[2]);
- cpa3 = _mm256_set1_ps(center_point_array[3]);
- cutoff_vec = _mm256_set1_ps(*cutoff);
- target_vec = _mm256_setzero_ps();
-
- unsigned int i;
-
- for(i = 0; i < eighth_points; ++i) {
- x_to_1 = _mm256_loadu_ps(src0);
- x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
- x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
- x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
- // x^1 * x^3 is slightly faster than x^2 * x^2
- x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
-
- x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
- x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
-
- x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);
- x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);
- // this is slightly faster than result += (x_to_1 + x_to_3)
- target_vec = _mm256_add_ps(x_to_1, target_vec);
- target_vec = _mm256_add_ps(x_to_3, target_vec);
-
- src0 += 8;
- }
-
- // the hadd for vector reduction has very very slight impact @ 50k iters
- __VOLK_ATTR_ALIGNED(32) float temp_results[8];
- target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
- _mm256_storeu_ps(temp_results, target_vec);
- *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
-
- for(i = eighth_points*8; i < num_points; ++i) {
- fst = *src0++;
- fst = MAX(fst, *cutoff);
- sq = fst * fst;
- thrd = fst * sq;
- frth = sq * sq;
- *target += (center_point_array[0] * fst +
- center_point_array[1] * sq +
- center_point_array[2] * thrd +
- center_point_array[3] * frth);
- }
-
- *target += (float)(num_points) * center_point_array[4];
+ const unsigned int eighth_points = num_points / 8;
+ float fst = 0.0;
+ float sq = 0.0;
+ float thrd = 0.0;
+ float frth = 0.0;
+
+ __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
+ __m256 target_vec;
+ __m256 x_to_1, x_to_2, x_to_3, x_to_4;
+
+ cpa0 = _mm256_set1_ps(center_point_array[0]);
+ cpa1 = _mm256_set1_ps(center_point_array[1]);
+ cpa2 = _mm256_set1_ps(center_point_array[2]);
+ cpa3 = _mm256_set1_ps(center_point_array[3]);
+ cutoff_vec = _mm256_set1_ps(*cutoff);
+ target_vec = _mm256_setzero_ps();
+
+ unsigned int i;
+
+ for (i = 0; i < eighth_points; ++i) {
+ x_to_1 = _mm256_loadu_ps(src0);
+ x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
+ x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
+ x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
+ // x^1 * x^3 is slightly faster than x^2 * x^2
+ x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
+
+ x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
+ x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
+
+ x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);
+ x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);
+ // this is slightly faster than result += (x_to_1 + x_to_3)
+ target_vec = _mm256_add_ps(x_to_1, target_vec);
+ target_vec = _mm256_add_ps(x_to_3, target_vec);
+
+ src0 += 8;
+ }
+
+ // the hadd for vector reduction has very very slight impact @ 50k iters
+ __VOLK_ATTR_ALIGNED(32) float temp_results[8];
+ target_vec = _mm256_hadd_ps(
+ target_vec,
+ target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
+ _mm256_storeu_ps(temp_results, target_vec);
+ *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
+
+ for (i = eighth_points * 8; i < num_points; ++i) {
+ fst = *src0++;
+ fst = MAX(fst, *cutoff);
+ sq = fst * fst;
+ thrd = fst * sq;
+ frth = sq * sq;
+ *target += (center_point_array[0] * fst + center_point_array[1] * sq +
+ center_point_array[2] * thrd + center_point_array[3] * frth);
+ }
+
+ *target += (float)(num_points)*center_point_array[4];
}
#endif // LV_HAVE_AVX && LV_HAVE_FMA
#ifdef LV_HAVE_AVX
-#include<immintrin.h>
+#include <immintrin.h>
-static inline void
-volk_32f_x3_sum_of_poly_32f_u_avx(float* target, float* src0, float* center_point_array,
- float* cutoff, unsigned int num_points)
+static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target,
+ float* src0,
+ float* center_point_array,
+ float* cutoff,
+ unsigned int num_points)
{
- const unsigned int eighth_points = num_points / 8;
- float fst = 0.0;
- float sq = 0.0;
- float thrd = 0.0;
- float frth = 0.0;
-
- __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
- __m256 target_vec;
- __m256 x_to_1, x_to_2, x_to_3, x_to_4;
-
- cpa0 = _mm256_set1_ps(center_point_array[0]);
- cpa1 = _mm256_set1_ps(center_point_array[1]);
- cpa2 = _mm256_set1_ps(center_point_array[2]);
- cpa3 = _mm256_set1_ps(center_point_array[3]);
- cutoff_vec = _mm256_set1_ps(*cutoff);
- target_vec = _mm256_setzero_ps();
-
- unsigned int i;
-
- for(i = 0; i < eighth_points; ++i) {
- x_to_1 = _mm256_loadu_ps(src0);
- x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
- x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
- x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
- // x^1 * x^3 is slightly faster than x^2 * x^2
- x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
-
- x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
- x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
- x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
- x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
-
- x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
- x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
- // this is slightly faster than result += (x_to_1 + x_to_3)
- target_vec = _mm256_add_ps(x_to_1, target_vec);
- target_vec = _mm256_add_ps(x_to_3, target_vec);
-
- src0 += 8;
- }
-
- // the hadd for vector reduction has very very slight impact @ 50k iters
- __VOLK_ATTR_ALIGNED(32) float temp_results[8];
- target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
- _mm256_storeu_ps(temp_results, target_vec);
- *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
-
- for(i = eighth_points*8; i < num_points; ++i) {
- fst = *src0++;
- fst = MAX(fst, *cutoff);
- sq = fst * fst;
- thrd = fst * sq;
- frth = sq * sq;
-
- *target += (center_point_array[0] * fst +
- center_point_array[1] * sq +
- center_point_array[2] * thrd +
- center_point_array[3] * frth);
- }
-
- *target += (float)(num_points) * center_point_array[4];
+ const unsigned int eighth_points = num_points / 8;
+ float fst = 0.0;
+ float sq = 0.0;
+ float thrd = 0.0;
+ float frth = 0.0;
+
+ __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
+ __m256 target_vec;
+ __m256 x_to_1, x_to_2, x_to_3, x_to_4;
+
+ cpa0 = _mm256_set1_ps(center_point_array[0]);
+ cpa1 = _mm256_set1_ps(center_point_array[1]);
+ cpa2 = _mm256_set1_ps(center_point_array[2]);
+ cpa3 = _mm256_set1_ps(center_point_array[3]);
+ cutoff_vec = _mm256_set1_ps(*cutoff);
+ target_vec = _mm256_setzero_ps();
+
+ unsigned int i;
+
+ for (i = 0; i < eighth_points; ++i) {
+ x_to_1 = _mm256_loadu_ps(src0);
+ x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
+ x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
+ x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
+ // x^1 * x^3 is slightly faster than x^2 * x^2
+ x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
+
+ x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
+ x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
+ x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
+ x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
+
+ x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
+ x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
+ // this is slightly faster than result += (x_to_1 + x_to_3)
+ target_vec = _mm256_add_ps(x_to_1, target_vec);
+ target_vec = _mm256_add_ps(x_to_3, target_vec);
+
+ src0 += 8;
+ }
+
+ // the hadd for vector reduction has very very slight impact @ 50k iters
+ __VOLK_ATTR_ALIGNED(32) float temp_results[8];
+ target_vec = _mm256_hadd_ps(
+ target_vec,
+ target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
+ _mm256_storeu_ps(temp_results, target_vec);
+ *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
+
+ for (i = eighth_points * 8; i < num_points; ++i) {
+ fst = *src0++;
+ fst = MAX(fst, *cutoff);
+ sq = fst * fst;
+ thrd = fst * sq;
+ frth = sq * sq;
+
+ *target += (center_point_array[0] * fst + center_point_array[1] * sq +
+ center_point_array[2] * thrd + center_point_array[3] * frth);
+ }
+
+ *target += (float)(num_points)*center_point_array[4];
}
#endif // LV_HAVE_AVX
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32fc_32f_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points)
- * \endcode
+ * void volk_32fc_32f_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float*
+ * bVector, unsigned int num_points) \endcode
*
* \b Inputs
* \li aVector: First vector of input points.
*
* \b Example
*
- * The follow example adds the increasing and decreasing vectors such that the result of every summation pair is 10
+ * The follow example adds the increasing and decreasing vectors such that the result of
+ * every summation pair is 10
*
* \code
* int N = 10;
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32fc_32f_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32fc_32f_add_32fc_generic(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
- const float* bPtr= bVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *cPtr++ = (*aPtr++) + (*bPtr++);
- }
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const float* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
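Mirroring the idea in the documented example (an increasing complex vector plus a decreasing real vector so that each pair of real parts sums to 10), a minimal caller sketch for this kernel is shown below. It assumes volk.h exposes volk_32fc_32f_add_32fc under the documented dispatcher name and that lv_cmake/lv_creal/lv_cimag from volk_complex.h are available; the real input only shifts the real part of each complex sample.

#include <volk/volk.h>
#include <stdio.h>

/* Sketch: c[i] = a[i] + b[i], complex plus real, via the dispatcher. */
int main(void)
{
    unsigned int N = 8;
    lv_32fc_t* a = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, volk_get_alignment());
    lv_32fc_t* c = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, volk_get_alignment());
    float* b = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());

    for (unsigned int i = 0; i < N; i++) {
        a[i] = lv_cmake((float)i, -(float)i); /* i - i*j */
        b[i] = 10.0f - (float)i;              /* chosen so the real parts sum to 10 */
    }

    volk_32fc_32f_add_32fc(c, a, b, N);

    for (unsigned int i = 0; i < N; i++)
        printf("%f + %fj\n", lv_creal(c[i]), lv_cimag(c[i]));

    volk_free(a);
    volk_free(b);
    volk_free(c);
    return 0;
}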
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32fc_32f_add_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32fc_32f_add_32fc_u_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
- const float* bPtr= bVector;
-
- __m256 aVal1, aVal2, bVal, cVal1, cVal2;
- __m256 cpx_b1, cpx_b2;
- __m256 zero;
- zero = _mm256_setzero_ps();
- __m256 tmp1, tmp2;
- for(;number < eighthPoints; number++){
-
- aVal1 = _mm256_loadu_ps((float *) aPtr);
- aVal2 = _mm256_loadu_ps((float *) (aPtr+4));
- bVal = _mm256_loadu_ps(bPtr);
- cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0
- cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0
-
- tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0+(0x2<<4));
- tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1+(0x3<<4));
-
- cVal1 = _mm256_add_ps(aVal1, tmp1);
- cVal2 = _mm256_add_ps(aVal2, tmp2);
-
- _mm256_storeu_ps((float *) cPtr, cVal1); // Store the results back into the C container
- _mm256_storeu_ps((float *) (cPtr+4), cVal2); // Store the results back into the C container
-
- aPtr += 8;
- bPtr += 8;
- cPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) + (*bPtr++);
- }
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m256 aVal1, aVal2, bVal, cVal1, cVal2;
+ __m256 cpx_b1, cpx_b2;
+ __m256 zero;
+ zero = _mm256_setzero_ps();
+ __m256 tmp1, tmp2;
+ for (; number < eighthPoints; number++) {
+
+ aVal1 = _mm256_loadu_ps((float*)aPtr);
+ aVal2 = _mm256_loadu_ps((float*)(aPtr + 4));
+ bVal = _mm256_loadu_ps(bPtr);
+ cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0
+ cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0
+
+ tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0 + (0x2 << 4));
+ tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1 + (0x3 << 4));
+
+ cVal1 = _mm256_add_ps(aVal1, tmp1);
+ cVal2 = _mm256_add_ps(aVal2, tmp2);
+
+ _mm256_storeu_ps((float*)cPtr,
+ cVal1); // Store the results back into the C container
+ _mm256_storeu_ps((float*)(cPtr + 4),
+ cVal2); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32fc_32f_add_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32fc_32f_add_32fc_a_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
- const float* bPtr= bVector;
-
- __m256 aVal1, aVal2, bVal, cVal1, cVal2;
- __m256 cpx_b1, cpx_b2;
- __m256 zero;
- zero = _mm256_setzero_ps();
- __m256 tmp1, tmp2;
- for(;number < eighthPoints; number++){
-
- aVal1 = _mm256_load_ps((float *) aPtr);
- aVal2 = _mm256_load_ps((float *) (aPtr+4));
- bVal = _mm256_load_ps(bPtr);
- cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0
- cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0
-
- tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0+(0x2<<4));
- tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1+(0x3<<4));
-
- cVal1 = _mm256_add_ps(aVal1, tmp1);
- cVal2 = _mm256_add_ps(aVal2, tmp2);
-
- _mm256_store_ps((float *) cPtr, cVal1); // Store the results back into the C container
- _mm256_store_ps((float *) (cPtr+4), cVal2); // Store the results back into the C container
-
- aPtr += 8;
- bPtr += 8;
- cPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) + (*bPtr++);
- }
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m256 aVal1, aVal2, bVal, cVal1, cVal2;
+ __m256 cpx_b1, cpx_b2;
+ __m256 zero;
+ zero = _mm256_setzero_ps();
+ __m256 tmp1, tmp2;
+ for (; number < eighthPoints; number++) {
+
+ aVal1 = _mm256_load_ps((float*)aPtr);
+ aVal2 = _mm256_load_ps((float*)(aPtr + 4));
+ bVal = _mm256_load_ps(bPtr);
+ cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0
+ cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0
+
+ tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0 + (0x2 << 4));
+ tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1 + (0x3 << 4));
+
+ cVal1 = _mm256_add_ps(aVal1, tmp1);
+ cVal2 = _mm256_add_ps(aVal2, tmp2);
+
+ _mm256_store_ps((float*)cPtr,
+ cVal1); // Store the results back into the C container
+ _mm256_store_ps((float*)(cPtr + 4),
+ cVal2); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_32fc_32f_add_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32fc_32f_add_32fc_neon(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
- const float* bPtr = bVector;
-
- float32x4x4_t aVal0, aVal1;
- float32x4x2_t bVal0, bVal1;
-
- const unsigned int sixteenthPoints = num_points / 16;
- unsigned int number = 0;
- for(; number < sixteenthPoints; number++){
- aVal0 = vld4q_f32((const float*)aPtr);
- aPtr += 8;
- aVal1 = vld4q_f32((const float*)aPtr);
- aPtr += 8;
- __VOLK_PREFETCH(aPtr+16);
-
- bVal0 = vld2q_f32((const float*)bPtr);
- bPtr += 8;
- bVal1 = vld2q_f32((const float*)bPtr);
- bPtr += 8;
- __VOLK_PREFETCH(bPtr+16);
-
- aVal0.val[0] = vaddq_f32(aVal0.val[0], bVal0.val[0]);
- aVal0.val[2] = vaddq_f32(aVal0.val[2], bVal0.val[1]);
-
- aVal1.val[2] = vaddq_f32(aVal1.val[2], bVal1.val[1]);
- aVal1.val[0] = vaddq_f32(aVal1.val[0], bVal1.val[0]);
-
- vst4q_f32((float*)(cPtr), aVal0);
- cPtr += 8;
- vst4q_f32((float*)(cPtr), aVal1);
- cPtr += 8;
- }
-
- for(number = sixteenthPoints * 16; number < num_points; number++){
- *cPtr++ = (*aPtr++) + (*bPtr++);
- }
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ float32x4x4_t aVal0, aVal1;
+ float32x4x2_t bVal0, bVal1;
+
+ const unsigned int sixteenthPoints = num_points / 16;
+ unsigned int number = 0;
+ for (; number < sixteenthPoints; number++) {
+ aVal0 = vld4q_f32((const float*)aPtr);
+ aPtr += 8;
+ aVal1 = vld4q_f32((const float*)aPtr);
+ aPtr += 8;
+ __VOLK_PREFETCH(aPtr + 16);
+
+ bVal0 = vld2q_f32((const float*)bPtr);
+ bPtr += 8;
+ bVal1 = vld2q_f32((const float*)bPtr);
+ bPtr += 8;
+ __VOLK_PREFETCH(bPtr + 16);
+
+ aVal0.val[0] = vaddq_f32(aVal0.val[0], bVal0.val[0]);
+ aVal0.val[2] = vaddq_f32(aVal0.val[2], bVal0.val[1]);
+
+ aVal1.val[2] = vaddq_f32(aVal1.val[2], bVal1.val[1]);
+ aVal1.val[0] = vaddq_f32(aVal1.val[0], bVal1.val[0]);
+
+ vst4q_f32((float*)(cPtr), aVal0);
+ cPtr += 8;
+ vst4q_f32((float*)(cPtr), aVal1);
+ cPtr += 8;
+ }
+
+ for (number = sixteenthPoints * 16; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
}
#endif /* LV_HAVE_NEON */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32fc_32f_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points)
- * \endcode
+ * void volk_32fc_32f_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const float
+ * * taps, unsigned int num_points) \endcode
*
* \b Inputs
* \li input: vector of complex samples
#ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
#define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
-#include <volk/volk_common.h>
#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_GENERIC
-static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points) {
+static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const float* taps,
+ unsigned int num_points)
+{
- float res[2];
- float *realpt = &res[0], *imagpt = &res[1];
- const float* aPtr = (float*)input;
- const float* bPtr= taps;
- unsigned int number = 0;
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const float* aPtr = (float*)input;
+ const float* bPtr = taps;
+ unsigned int number = 0;
- *realpt = 0;
- *imagpt = 0;
+ *realpt = 0;
+ *imagpt = 0;
- for(number = 0; number < num_points; number++){
- *realpt += ((*aPtr++) * (*bPtr));
- *imagpt += ((*aPtr++) * (*bPtr++));
- }
+ for (number = 0; number < num_points; number++) {
+ *realpt += ((*aPtr++) * (*bPtr));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
- *result = *(lv_32fc_t*)(&res[0]);
+ *result = *(lv_32fc_t*)(&res[0]);
}
#endif /*LV_HAVE_GENERIC*/
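The generic kernel above accumulates the real and imaginary parts separately, which amounts to result = sum_i input[i] * taps[i] with real-valued taps. A minimal caller sketch, assuming the documented dispatcher name and the volk_malloc/volk_free helpers:

#include <volk/volk.h>
#include <stdio.h>

/* Sketch: real-tap dot product over complex samples via the dispatcher. */
int main(void)
{
    unsigned int N = 32;
    lv_32fc_t* input =
        (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, volk_get_alignment());
    float* taps = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
    lv_32fc_t* result = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t), volk_get_alignment());

    for (unsigned int i = 0; i < N; i++) {
        input[i] = lv_cmake((float)i, 1.0f);
        taps[i] = 1.0f / (float)N; /* simple moving-average taps */
    }

    volk_32fc_32f_dot_prod_32fc(result, input, taps, N);

    printf("dot product: %f + %fj\n", lv_creal(*result), lv_cimag(*result));

    volk_free(input);
    volk_free(taps);
    volk_free(result);
    return 0;
}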
#include <immintrin.h>
-static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
-
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- float res[2];
- float *realpt = &res[0], *imagpt = &res[1];
- const float* aPtr = (float*)input;
- const float* bPtr = taps;
-
- __m256 a0Val, a1Val, a2Val, a3Val;
- __m256 b0Val, b1Val, b2Val, b3Val;
- __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
-
- __m256 dotProdVal0 = _mm256_setzero_ps();
- __m256 dotProdVal1 = _mm256_setzero_ps();
- __m256 dotProdVal2 = _mm256_setzero_ps();
- __m256 dotProdVal3 = _mm256_setzero_ps();
-
- for(;number < sixteenthPoints; number++){
-
- a0Val = _mm256_load_ps(aPtr);
- a1Val = _mm256_load_ps(aPtr+8);
- a2Val = _mm256_load_ps(aPtr+16);
- a3Val = _mm256_load_ps(aPtr+24);
-
- x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
- x1Val = _mm256_load_ps(bPtr+8);
- x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
- x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
- x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
- x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
-
- // TODO: it may be possible to rearrange swizzling to better pipeline data
- b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
- b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
- b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
- b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
-
- dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
- dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
- dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
- dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
-
- aPtr += 32;
- bPtr += 16;
- }
-
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
-
- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
-
- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
- *realpt = dotProductVector[0];
- *imagpt = dotProductVector[1];
- *realpt += dotProductVector[2];
- *imagpt += dotProductVector[3];
- *realpt += dotProductVector[4];
- *imagpt += dotProductVector[5];
- *realpt += dotProductVector[6];
- *imagpt += dotProductVector[7];
-
- number = sixteenthPoints*16;
- for(;number < num_points; number++){
- *realpt += ((*aPtr++) * (*bPtr));
- *imagpt += ((*aPtr++) * (*bPtr++));
- }
-
- *result = *(lv_32fc_t*)(&res[0]);
+static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const float* aPtr = (float*)input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val, a2Val, a3Val;
+ __m256 b0Val, b1Val, b2Val, b3Val;
+ __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 dotProdVal2 = _mm256_setzero_ps();
+ __m256 dotProdVal3 = _mm256_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ a0Val = _mm256_load_ps(aPtr);
+ a1Val = _mm256_load_ps(aPtr + 8);
+ a2Val = _mm256_load_ps(aPtr + 16);
+ a3Val = _mm256_load_ps(aPtr + 24);
+
+ x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
+ x1Val = _mm256_load_ps(bPtr + 8);
+ x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
+ x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
+ x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
+ x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
+
+ // TODO: it may be possible to rearrange swizzling to better pipeline data
+ b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
+ b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
+ b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
+ b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
+
+ dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
+ dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
+ dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
+ dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
+
+ aPtr += 32;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
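+    // even lanes hold partial real sums, odd lanes partial imaginary sums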
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+ *realpt += dotProductVector[4];
+ *imagpt += dotProductVector[5];
+ *realpt += dotProductVector[6];
+ *imagpt += dotProductVector[7];
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *realpt += ((*aPtr++) * (*bPtr));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
}
#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
#include <immintrin.h>
-static inline void volk_32fc_32f_dot_prod_32fc_a_avx( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
-
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- float res[2];
- float *realpt = &res[0], *imagpt = &res[1];
- const float* aPtr = (float*)input;
- const float* bPtr = taps;
-
- __m256 a0Val, a1Val, a2Val, a3Val;
- __m256 b0Val, b1Val, b2Val, b3Val;
- __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
- __m256 c0Val, c1Val, c2Val, c3Val;
-
- __m256 dotProdVal0 = _mm256_setzero_ps();
- __m256 dotProdVal1 = _mm256_setzero_ps();
- __m256 dotProdVal2 = _mm256_setzero_ps();
- __m256 dotProdVal3 = _mm256_setzero_ps();
-
- for(;number < sixteenthPoints; number++){
-
- a0Val = _mm256_load_ps(aPtr);
- a1Val = _mm256_load_ps(aPtr+8);
- a2Val = _mm256_load_ps(aPtr+16);
- a3Val = _mm256_load_ps(aPtr+24);
-
- x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
- x1Val = _mm256_load_ps(bPtr+8);
- x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
- x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
- x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
- x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
-
- // TODO: it may be possible to rearrange swizzling to better pipeline data
- b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
- b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
- b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
- b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
-
- c0Val = _mm256_mul_ps(a0Val, b0Val);
- c1Val = _mm256_mul_ps(a1Val, b1Val);
- c2Val = _mm256_mul_ps(a2Val, b2Val);
- c3Val = _mm256_mul_ps(a3Val, b3Val);
-
- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
- dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
- dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
-
- aPtr += 32;
- bPtr += 16;
- }
-
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
-
- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
-
- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
- *realpt = dotProductVector[0];
- *imagpt = dotProductVector[1];
- *realpt += dotProductVector[2];
- *imagpt += dotProductVector[3];
- *realpt += dotProductVector[4];
- *imagpt += dotProductVector[5];
- *realpt += dotProductVector[6];
- *imagpt += dotProductVector[7];
-
- number = sixteenthPoints*16;
- for(;number < num_points; number++){
- *realpt += ((*aPtr++) * (*bPtr));
- *imagpt += ((*aPtr++) * (*bPtr++));
- }
-
- *result = *(lv_32fc_t*)(&res[0]);
+static inline void volk_32fc_32f_dot_prod_32fc_a_avx(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const float* aPtr = (float*)input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val, a2Val, a3Val;
+ __m256 b0Val, b1Val, b2Val, b3Val;
+ __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
+ __m256 c0Val, c1Val, c2Val, c3Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 dotProdVal2 = _mm256_setzero_ps();
+ __m256 dotProdVal3 = _mm256_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ a0Val = _mm256_load_ps(aPtr);
+ a1Val = _mm256_load_ps(aPtr + 8);
+ a2Val = _mm256_load_ps(aPtr + 16);
+ a3Val = _mm256_load_ps(aPtr + 24);
+
+ x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
+ x1Val = _mm256_load_ps(bPtr + 8);
+ x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
+ x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
+ x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
+ x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
+
+ // TODO: it may be possible to rearrange swizzling to better pipeline data
+ b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
+ b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
+ b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
+ b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
+
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
+ c2Val = _mm256_mul_ps(a2Val, b2Val);
+ c3Val = _mm256_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 32;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+ *realpt += dotProductVector[4];
+ *imagpt += dotProductVector[5];
+ *realpt += dotProductVector[6];
+ *imagpt += dotProductVector[7];
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *realpt += ((*aPtr++) * (*bPtr));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
}
#endif /*LV_HAVE_AVX*/
-
-
#ifdef LV_HAVE_SSE
-static inline void volk_32fc_32f_dot_prod_32fc_a_sse( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
-
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 8;
-
- float res[2];
- float *realpt = &res[0], *imagpt = &res[1];
- const float* aPtr = (float*)input;
- const float* bPtr = taps;
-
- __m128 a0Val, a1Val, a2Val, a3Val;
- __m128 b0Val, b1Val, b2Val, b3Val;
- __m128 x0Val, x1Val, x2Val, x3Val;
- __m128 c0Val, c1Val, c2Val, c3Val;
-
- __m128 dotProdVal0 = _mm_setzero_ps();
- __m128 dotProdVal1 = _mm_setzero_ps();
- __m128 dotProdVal2 = _mm_setzero_ps();
- __m128 dotProdVal3 = _mm_setzero_ps();
-
- for(;number < sixteenthPoints; number++){
-
- a0Val = _mm_load_ps(aPtr);
- a1Val = _mm_load_ps(aPtr+4);
- a2Val = _mm_load_ps(aPtr+8);
- a3Val = _mm_load_ps(aPtr+12);
-
- x0Val = _mm_load_ps(bPtr);
- x1Val = _mm_load_ps(bPtr);
- x2Val = _mm_load_ps(bPtr+4);
- x3Val = _mm_load_ps(bPtr+4);
- b0Val = _mm_unpacklo_ps(x0Val, x1Val);
- b1Val = _mm_unpackhi_ps(x0Val, x1Val);
- b2Val = _mm_unpacklo_ps(x2Val, x3Val);
- b3Val = _mm_unpackhi_ps(x2Val, x3Val);
-
- c0Val = _mm_mul_ps(a0Val, b0Val);
- c1Val = _mm_mul_ps(a1Val, b1Val);
- c2Val = _mm_mul_ps(a2Val, b2Val);
- c3Val = _mm_mul_ps(a3Val, b3Val);
-
- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
-
- aPtr += 16;
- bPtr += 8;
- }
-
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
-
- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
-
- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
- *realpt = dotProductVector[0];
- *imagpt = dotProductVector[1];
- *realpt += dotProductVector[2];
- *imagpt += dotProductVector[3];
-
- number = sixteenthPoints*8;
- for(;number < num_points; number++){
- *realpt += ((*aPtr++) * (*bPtr));
- *imagpt += ((*aPtr++) * (*bPtr++));
- }
-
- *result = *(lv_32fc_t*)(&res[0]);
+static inline void volk_32fc_32f_dot_prod_32fc_a_sse(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 8;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const float* aPtr = (float*)input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 x0Val, x1Val, x2Val, x3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ a0Val = _mm_load_ps(aPtr);
+ a1Val = _mm_load_ps(aPtr + 4);
+ a2Val = _mm_load_ps(aPtr + 8);
+ a3Val = _mm_load_ps(aPtr + 12);
+
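+        // load each group of four taps twice and interleave a register with itself,
+        // so every tap is duplicated and lines up with both the real and imaginary
+        // part of the matching complex sample loaded above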
+ x0Val = _mm_load_ps(bPtr);
+ x1Val = _mm_load_ps(bPtr);
+ x2Val = _mm_load_ps(bPtr + 4);
+ x3Val = _mm_load_ps(bPtr + 4);
+ b0Val = _mm_unpacklo_ps(x0Val, x1Val);
+ b1Val = _mm_unpackhi_ps(x0Val, x1Val);
+ b2Val = _mm_unpacklo_ps(x2Val, x3Val);
+ b3Val = _mm_unpackhi_ps(x2Val, x3Val);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 8;
+ }
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+ _mm_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+
+ number = sixteenthPoints * 8;
+ for (; number < num_points; number++) {
+ *realpt += ((*aPtr++) * (*bPtr));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
}
#endif /*LV_HAVE_SSE*/
#include <immintrin.h>
-static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
-
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- float res[2];
- float *realpt = &res[0], *imagpt = &res[1];
- const float* aPtr = (float*)input;
- const float* bPtr = taps;
-
- __m256 a0Val, a1Val, a2Val, a3Val;
- __m256 b0Val, b1Val, b2Val, b3Val;
- __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
-
- __m256 dotProdVal0 = _mm256_setzero_ps();
- __m256 dotProdVal1 = _mm256_setzero_ps();
- __m256 dotProdVal2 = _mm256_setzero_ps();
- __m256 dotProdVal3 = _mm256_setzero_ps();
-
- for(;number < sixteenthPoints; number++){
-
- a0Val = _mm256_loadu_ps(aPtr);
- a1Val = _mm256_loadu_ps(aPtr+8);
- a2Val = _mm256_loadu_ps(aPtr+16);
- a3Val = _mm256_loadu_ps(aPtr+24);
-
- x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
- x1Val = _mm256_load_ps(bPtr+8);
- x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
- x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
- x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
- x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
-
- // TODO: it may be possible to rearrange swizzling to better pipeline data
- b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
- b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
- b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
- b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
-
- dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
- dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
- dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
- dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
-
- aPtr += 32;
- bPtr += 16;
- }
-
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
-
- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
-
- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
- *realpt = dotProductVector[0];
- *imagpt = dotProductVector[1];
- *realpt += dotProductVector[2];
- *imagpt += dotProductVector[3];
- *realpt += dotProductVector[4];
- *imagpt += dotProductVector[5];
- *realpt += dotProductVector[6];
- *imagpt += dotProductVector[7];
-
- number = sixteenthPoints*16;
- for(;number < num_points; number++){
- *realpt += ((*aPtr++) * (*bPtr));
- *imagpt += ((*aPtr++) * (*bPtr++));
- }
-
- *result = *(lv_32fc_t*)(&res[0]);
+static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const float* aPtr = (float*)input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val, a2Val, a3Val;
+ __m256 b0Val, b1Val, b2Val, b3Val;
+ __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 dotProdVal2 = _mm256_setzero_ps();
+ __m256 dotProdVal3 = _mm256_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ a0Val = _mm256_loadu_ps(aPtr);
+ a1Val = _mm256_loadu_ps(aPtr + 8);
+ a2Val = _mm256_loadu_ps(aPtr + 16);
+ a3Val = _mm256_loadu_ps(aPtr + 24);
+
+ x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
+ x1Val = _mm256_load_ps(bPtr + 8);
+ x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
+ x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
+ x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
+ x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
+
+ // TODO: it may be possible to rearrange swizzling to better pipeline data
+ b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
+ b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
+ b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
+ b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
+
+ dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
+ dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
+ dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
+ dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
+
+ aPtr += 32;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+ *realpt += dotProductVector[4];
+ *imagpt += dotProductVector[5];
+ *realpt += dotProductVector[6];
+ *imagpt += dotProductVector[7];
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *realpt += ((*aPtr++) * (*bPtr));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
}
#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
#include <immintrin.h>
-static inline void volk_32fc_32f_dot_prod_32fc_u_avx( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
-
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- float res[2];
- float *realpt = &res[0], *imagpt = &res[1];
- const float* aPtr = (float*)input;
- const float* bPtr = taps;
-
- __m256 a0Val, a1Val, a2Val, a3Val;
- __m256 b0Val, b1Val, b2Val, b3Val;
- __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
- __m256 c0Val, c1Val, c2Val, c3Val;
-
- __m256 dotProdVal0 = _mm256_setzero_ps();
- __m256 dotProdVal1 = _mm256_setzero_ps();
- __m256 dotProdVal2 = _mm256_setzero_ps();
- __m256 dotProdVal3 = _mm256_setzero_ps();
-
- for(;number < sixteenthPoints; number++){
-
- a0Val = _mm256_loadu_ps(aPtr);
- a1Val = _mm256_loadu_ps(aPtr+8);
- a2Val = _mm256_loadu_ps(aPtr+16);
- a3Val = _mm256_loadu_ps(aPtr+24);
-
- x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
- x1Val = _mm256_loadu_ps(bPtr+8);
- x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
- x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
- x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
- x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
-
- // TODO: it may be possible to rearrange swizzling to better pipeline data
- b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
- b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
- b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
- b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
-
- c0Val = _mm256_mul_ps(a0Val, b0Val);
- c1Val = _mm256_mul_ps(a1Val, b1Val);
- c2Val = _mm256_mul_ps(a2Val, b2Val);
- c3Val = _mm256_mul_ps(a3Val, b3Val);
-
- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
- dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
- dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
-
- aPtr += 32;
- bPtr += 16;
- }
-
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
-
- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
-
- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
- *realpt = dotProductVector[0];
- *imagpt = dotProductVector[1];
- *realpt += dotProductVector[2];
- *imagpt += dotProductVector[3];
- *realpt += dotProductVector[4];
- *imagpt += dotProductVector[5];
- *realpt += dotProductVector[6];
- *imagpt += dotProductVector[7];
-
- number = sixteenthPoints*16;
- for(;number < num_points; number++){
- *realpt += ((*aPtr++) * (*bPtr));
- *imagpt += ((*aPtr++) * (*bPtr++));
- }
-
- *result = *(lv_32fc_t*)(&res[0]);
+static inline void volk_32fc_32f_dot_prod_32fc_u_avx(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const float* aPtr = (float*)input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val, a2Val, a3Val;
+ __m256 b0Val, b1Val, b2Val, b3Val;
+ __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
+ __m256 c0Val, c1Val, c2Val, c3Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 dotProdVal2 = _mm256_setzero_ps();
+ __m256 dotProdVal3 = _mm256_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ a0Val = _mm256_loadu_ps(aPtr);
+ a1Val = _mm256_loadu_ps(aPtr + 8);
+ a2Val = _mm256_loadu_ps(aPtr + 16);
+ a3Val = _mm256_loadu_ps(aPtr + 24);
+
+ x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
+ x1Val = _mm256_loadu_ps(bPtr + 8);
+ x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
+ x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
+ x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
+ x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
+
+ // TODO: it may be possible to rearrange swizzling to better pipeline data
+ b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
+ b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
+ b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
+ b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
+
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
+ c2Val = _mm256_mul_ps(a2Val, b2Val);
+ c3Val = _mm256_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 32;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+ *realpt += dotProductVector[4];
+ *imagpt += dotProductVector[5];
+ *realpt += dotProductVector[6];
+ *imagpt += dotProductVector[7];
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *realpt += ((*aPtr++) * (*bPtr));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
}
#endif /*LV_HAVE_AVX*/
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void volk_32fc_32f_dot_prod_32fc_neon_unroll ( lv_32fc_t* __restrict result, const lv_32fc_t* __restrict input, const float* __restrict taps, unsigned int num_points) {
-
- unsigned int number;
- const unsigned int quarterPoints = num_points / 8;
-
- float res[2];
- float *realpt = &res[0], *imagpt = &res[1];
- const float* inputPtr = (float*)input;
- const float* tapsPtr = taps;
- float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
- float accVector_real[4];
- float accVector_imag[4];
-
- float32x4x2_t inputVector0, inputVector1;
- float32x4_t tapsVector0, tapsVector1;
- float32x4_t tmp_real0, tmp_imag0;
- float32x4_t tmp_real1, tmp_imag1;
- float32x4_t real_accumulator0, imag_accumulator0;
- float32x4_t real_accumulator1, imag_accumulator1;
-
- // zero out accumulators
- // take a *float, return float32x4_t
- real_accumulator0 = vld1q_f32( zero );
- imag_accumulator0 = vld1q_f32( zero );
- real_accumulator1 = vld1q_f32( zero );
- imag_accumulator1 = vld1q_f32( zero );
-
- for(number=0 ;number < quarterPoints; number++){
- // load doublewords and duplicate in to second lane
- tapsVector0 = vld1q_f32(tapsPtr );
- tapsVector1 = vld1q_f32(tapsPtr+4 );
-
- // load quadword of complex numbers in to 2 lanes. 1st lane is real, 2dn imag
- inputVector0 = vld2q_f32(inputPtr );
- inputVector1 = vld2q_f32(inputPtr+8 );
- // inputVector is now a struct of two vectors, 0th is real, 1st is imag
-
- tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]);
- tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]);
-
- tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]);
- tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]);
-
- real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0);
- imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0);
-
- real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1);
- imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1);
-
- tapsPtr += 8;
- inputPtr += 16;
- }
-
- real_accumulator0 = vaddq_f32( real_accumulator0, real_accumulator1);
- imag_accumulator0 = vaddq_f32( imag_accumulator0, imag_accumulator1);
- // void vst1q_f32( float32_t * ptr, float32x4_t val);
- // store results back to a complex (array of 2 floats)
- vst1q_f32(accVector_real, real_accumulator0);
- vst1q_f32(accVector_imag, imag_accumulator0);
- *realpt = accVector_real[0] + accVector_real[1] +
- accVector_real[2] + accVector_real[3] ;
-
- *imagpt = accVector_imag[0] + accVector_imag[1] +
- accVector_imag[2] + accVector_imag[3] ;
-
- // clean up the remainder
- for(number=quarterPoints*8; number < num_points; number++){
- *realpt += ((*inputPtr++) * (*tapsPtr));
- *imagpt += ((*inputPtr++) * (*tapsPtr++));
- }
-
- *result = *(lv_32fc_t*)(&res[0]);
+static inline void
+volk_32fc_32f_dot_prod_32fc_neon_unroll(lv_32fc_t* __restrict result,
+ const lv_32fc_t* __restrict input,
+ const float* __restrict taps,
+ unsigned int num_points)
+{
+
+ unsigned int number;
+ const unsigned int quarterPoints = num_points / 8;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const float* inputPtr = (float*)input;
+ const float* tapsPtr = taps;
+ float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
+ float accVector_real[4];
+ float accVector_imag[4];
+
+ float32x4x2_t inputVector0, inputVector1;
+ float32x4_t tapsVector0, tapsVector1;
+ float32x4_t tmp_real0, tmp_imag0;
+ float32x4_t tmp_real1, tmp_imag1;
+ float32x4_t real_accumulator0, imag_accumulator0;
+ float32x4_t real_accumulator1, imag_accumulator1;
+
+ // zero out accumulators
+ // take a *float, return float32x4_t
+ real_accumulator0 = vld1q_f32(zero);
+ imag_accumulator0 = vld1q_f32(zero);
+ real_accumulator1 = vld1q_f32(zero);
+ imag_accumulator1 = vld1q_f32(zero);
+
+ for (number = 0; number < quarterPoints; number++) {
+        // load eight real taps into two quad registers
+ tapsVector0 = vld1q_f32(tapsPtr);
+ tapsVector1 = vld1q_f32(tapsPtr + 4);
+
+        // load quadwords of complex numbers into 2 lanes: 1st lane is real, 2nd imag
+ inputVector0 = vld2q_f32(inputPtr);
+ inputVector1 = vld2q_f32(inputPtr + 8);
+ // inputVector is now a struct of two vectors, 0th is real, 1st is imag
+
+ tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]);
+ tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]);
+
+ tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]);
+ tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]);
+
+ real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0);
+ imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0);
+
+ real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1);
+ imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1);
+
+ tapsPtr += 8;
+ inputPtr += 16;
+ }
+
+ real_accumulator0 = vaddq_f32(real_accumulator0, real_accumulator1);
+ imag_accumulator0 = vaddq_f32(imag_accumulator0, imag_accumulator1);
+ // void vst1q_f32( float32_t * ptr, float32x4_t val);
+ // store results back to a complex (array of 2 floats)
+ vst1q_f32(accVector_real, real_accumulator0);
+ vst1q_f32(accVector_imag, imag_accumulator0);
+ *realpt =
+ accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3];
+
+ *imagpt =
+ accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3];
+
+ // clean up the remainder
+ for (number = quarterPoints * 8; number < num_points; number++) {
+ *realpt += ((*inputPtr++) * (*tapsPtr));
+ *imagpt += ((*inputPtr++) * (*tapsPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
}
#endif /*LV_HAVE_NEON*/
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void volk_32fc_32f_dot_prod_32fc_a_neon ( lv_32fc_t* __restrict result, const lv_32fc_t* __restrict input, const float* __restrict taps, unsigned int num_points) {
-
- unsigned int number;
- const unsigned int quarterPoints = num_points / 4;
+static inline void volk_32fc_32f_dot_prod_32fc_a_neon(lv_32fc_t* __restrict result,
+ const lv_32fc_t* __restrict input,
+ const float* __restrict taps,
+ unsigned int num_points)
+{
- float res[2];
- float *realpt = &res[0], *imagpt = &res[1];
- const float* inputPtr = (float*)input;
- const float* tapsPtr = taps;
- float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
- float accVector_real[4];
- float accVector_imag[4];
+ unsigned int number;
+ const unsigned int quarterPoints = num_points / 4;
- float32x4x2_t inputVector;
- float32x4_t tapsVector;
- float32x4_t tmp_real, tmp_imag;
- float32x4_t real_accumulator, imag_accumulator;
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const float* inputPtr = (float*)input;
+ const float* tapsPtr = taps;
+ float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
+ float accVector_real[4];
+ float accVector_imag[4];
+ float32x4x2_t inputVector;
+ float32x4_t tapsVector;
+ float32x4_t tmp_real, tmp_imag;
+ float32x4_t real_accumulator, imag_accumulator;
- // zero out accumulators
- // take a *float, return float32x4_t
- real_accumulator = vld1q_f32( zero );
- imag_accumulator = vld1q_f32( zero );
- for(number=0 ;number < quarterPoints; number++){
- // load taps ( float32x2x2_t = vld1q_f32( float32_t const * ptr) )
- // load doublewords and duplicate in to second lane
- tapsVector = vld1q_f32(tapsPtr );
+ // zero out accumulators
+ // take a *float, return float32x4_t
+ real_accumulator = vld1q_f32(zero);
+ imag_accumulator = vld1q_f32(zero);
- // load quadword of complex numbers in to 2 lanes. 1st lane is real, 2dn imag
- inputVector = vld2q_f32(inputPtr );
+ for (number = 0; number < quarterPoints; number++) {
+        // load taps ( float32x4_t = vld1q_f32( float32_t const * ptr) )
+        // load four real taps into one quad register
+ tapsVector = vld1q_f32(tapsPtr);
- tmp_real = vmulq_f32(tapsVector, inputVector.val[0]);
- tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]);
+        // load quadword of complex numbers into 2 lanes: 1st lane is real, 2nd imag
+ inputVector = vld2q_f32(inputPtr);
- real_accumulator = vaddq_f32(real_accumulator, tmp_real);
- imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag);
+ tmp_real = vmulq_f32(tapsVector, inputVector.val[0]);
+ tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]);
+ real_accumulator = vaddq_f32(real_accumulator, tmp_real);
+ imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag);
- tapsPtr += 4;
- inputPtr += 8;
- }
+ tapsPtr += 4;
+ inputPtr += 8;
+ }
- // store results back to a complex (array of 2 floats)
- vst1q_f32(accVector_real, real_accumulator);
- vst1q_f32(accVector_imag, imag_accumulator);
- *realpt = accVector_real[0] + accVector_real[1] +
- accVector_real[2] + accVector_real[3] ;
+ // store results back to a complex (array of 2 floats)
+ vst1q_f32(accVector_real, real_accumulator);
+ vst1q_f32(accVector_imag, imag_accumulator);
+ *realpt =
+ accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3];
- *imagpt = accVector_imag[0] + accVector_imag[1] +
- accVector_imag[2] + accVector_imag[3] ;
+ *imagpt =
+ accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3];
- // clean up the remainder
- for(number=quarterPoints*4; number < num_points; number++){
- *realpt += ((*inputPtr++) * (*tapsPtr));
- *imagpt += ((*inputPtr++) * (*tapsPtr++));
- }
+ // clean up the remainder
+ for (number = quarterPoints * 4; number < num_points; number++) {
+ *realpt += ((*inputPtr++) * (*tapsPtr));
+ *imagpt += ((*inputPtr++) * (*tapsPtr++));
+ }
- *result = *(lv_32fc_t*)(&res[0]);
+ *result = *(lv_32fc_t*)(&res[0]);
}
#endif /*LV_HAVE_NEON*/
#ifdef LV_HAVE_NEONV7
-extern void volk_32fc_32f_dot_prod_32fc_a_neonasm ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points);
+extern void volk_32fc_32f_dot_prod_32fc_a_neonasm(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const float* taps,
+ unsigned int num_points);
#endif /*LV_HAVE_NEONV7*/
#ifdef LV_HAVE_NEONV7
-extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points);
+extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const float* taps,
+ unsigned int num_points);
#endif /*LV_HAVE_NEONV7*/
#ifdef LV_HAVE_NEONV7
-extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points);
+extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const float* taps,
+ unsigned int num_points);
#endif /*LV_HAVE_NEONV7*/
#ifdef LV_HAVE_SSE
-static inline void volk_32fc_32f_dot_prod_32fc_u_sse( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
-
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 8;
-
- float res[2];
- float *realpt = &res[0], *imagpt = &res[1];
- const float* aPtr = (float*)input;
- const float* bPtr = taps;
-
- __m128 a0Val, a1Val, a2Val, a3Val;
- __m128 b0Val, b1Val, b2Val, b3Val;
- __m128 x0Val, x1Val, x2Val, x3Val;
- __m128 c0Val, c1Val, c2Val, c3Val;
-
- __m128 dotProdVal0 = _mm_setzero_ps();
- __m128 dotProdVal1 = _mm_setzero_ps();
- __m128 dotProdVal2 = _mm_setzero_ps();
- __m128 dotProdVal3 = _mm_setzero_ps();
-
- for(;number < sixteenthPoints; number++){
-
- a0Val = _mm_loadu_ps(aPtr);
- a1Val = _mm_loadu_ps(aPtr+4);
- a2Val = _mm_loadu_ps(aPtr+8);
- a3Val = _mm_loadu_ps(aPtr+12);
-
- x0Val = _mm_loadu_ps(bPtr);
- x1Val = _mm_loadu_ps(bPtr);
- x2Val = _mm_loadu_ps(bPtr+4);
- x3Val = _mm_loadu_ps(bPtr+4);
- b0Val = _mm_unpacklo_ps(x0Val, x1Val);
- b1Val = _mm_unpackhi_ps(x0Val, x1Val);
- b2Val = _mm_unpacklo_ps(x2Val, x3Val);
- b3Val = _mm_unpackhi_ps(x2Val, x3Val);
-
- c0Val = _mm_mul_ps(a0Val, b0Val);
- c1Val = _mm_mul_ps(a1Val, b1Val);
- c2Val = _mm_mul_ps(a2Val, b2Val);
- c3Val = _mm_mul_ps(a3Val, b3Val);
-
- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
-
- aPtr += 16;
- bPtr += 8;
- }
-
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
-
- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
-
- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
- *realpt = dotProductVector[0];
- *imagpt = dotProductVector[1];
- *realpt += dotProductVector[2];
- *imagpt += dotProductVector[3];
-
- number = sixteenthPoints*8;
- for(;number < num_points; number++){
- *realpt += ((*aPtr++) * (*bPtr));
- *imagpt += ((*aPtr++) * (*bPtr++));
- }
-
- *result = *(lv_32fc_t*)(&res[0]);
+static inline void volk_32fc_32f_dot_prod_32fc_u_sse(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 8;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const float* aPtr = (float*)input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 x0Val, x1Val, x2Val, x3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ a0Val = _mm_loadu_ps(aPtr);
+ a1Val = _mm_loadu_ps(aPtr + 4);
+ a2Val = _mm_loadu_ps(aPtr + 8);
+ a3Val = _mm_loadu_ps(aPtr + 12);
+
+ x0Val = _mm_loadu_ps(bPtr);
+ x1Val = _mm_loadu_ps(bPtr);
+ x2Val = _mm_loadu_ps(bPtr + 4);
+ x3Val = _mm_loadu_ps(bPtr + 4);
+ b0Val = _mm_unpacklo_ps(x0Val, x1Val);
+ b1Val = _mm_unpackhi_ps(x0Val, x1Val);
+ b2Val = _mm_unpacklo_ps(x2Val, x3Val);
+ b3Val = _mm_unpackhi_ps(x2Val, x3Val);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 8;
+ }
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+ _mm_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+
+ number = sixteenthPoints * 8;
+ for (; number < num_points; number++) {
+ *realpt += ((*aPtr++) * (*bPtr));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
}
#endif /*LV_HAVE_SSE*/
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32fc_32f_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points);
- * \endcode
+ * void volk_32fc_32f_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
+ * float* bVector, unsigned int num_points); \endcode
*
* \b Inputs
* \li aVector: The input vector of complex floats.
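For context, a hypothetical caller of this dispatcher (illustrative only, not part of the patch) could use it to apply a real-valued window to a block of complex samples; the buffers are assumed to hold at least num_points elements each.

#include <volk/volk.h>

static void apply_real_window(lv_32fc_t* out,
                              const lv_32fc_t* in,
                              const float* window,
                              unsigned int num_points)
{
    // each complex sample is scaled by the matching real window coefficient
    volk_32fc_32f_multiply_32fc(out, in, window, num_points);
}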
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
- const float* bPtr= bVector;
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const float* bPtr = bVector;
- __m256 aVal1, aVal2, bVal, bVal1, bVal2, cVal1, cVal2;
+ __m256 aVal1, aVal2, bVal, bVal1, bVal2, cVal1, cVal2;
- __m256i permute_mask = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
+ __m256i permute_mask = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
- for(;number < eighthPoints; number++){
+ for (; number < eighthPoints; number++) {
- aVal1 = _mm256_load_ps((float *)aPtr);
- aPtr += 4;
+ aVal1 = _mm256_load_ps((float*)aPtr);
+ aPtr += 4;
- aVal2 = _mm256_load_ps((float *)aPtr);
- aPtr += 4;
+ aVal2 = _mm256_load_ps((float*)aPtr);
+ aPtr += 4;
- bVal = _mm256_load_ps(bPtr); // b0|b1|b2|b3|b4|b5|b6|b7
- bPtr += 8;
+ bVal = _mm256_load_ps(bPtr); // b0|b1|b2|b3|b4|b5|b6|b7
+ bPtr += 8;
- bVal1 = _mm256_permute2f128_ps(bVal, bVal, 0x00); // b0|b1|b2|b3|b0|b1|b2|b3
- bVal2 = _mm256_permute2f128_ps(bVal, bVal, 0x11); // b4|b5|b6|b7|b4|b5|b6|b7
+ bVal1 = _mm256_permute2f128_ps(bVal, bVal, 0x00); // b0|b1|b2|b3|b0|b1|b2|b3
+ bVal2 = _mm256_permute2f128_ps(bVal, bVal, 0x11); // b4|b5|b6|b7|b4|b5|b6|b7
- bVal1 = _mm256_permutevar_ps(bVal1, permute_mask); // b0|b0|b1|b1|b2|b2|b3|b3
- bVal2 = _mm256_permutevar_ps(bVal2, permute_mask); // b4|b4|b5|b5|b6|b6|b7|b7
+ bVal1 = _mm256_permutevar_ps(bVal1, permute_mask); // b0|b0|b1|b1|b2|b2|b3|b3
+ bVal2 = _mm256_permutevar_ps(bVal2, permute_mask); // b4|b4|b5|b5|b6|b6|b7|b7
- cVal1 = _mm256_mul_ps(aVal1, bVal1);
- cVal2 = _mm256_mul_ps(aVal2, bVal2);
+ cVal1 = _mm256_mul_ps(aVal1, bVal1);
+ cVal2 = _mm256_mul_ps(aVal2, bVal2);
- _mm256_store_ps((float*)cPtr,cVal1); // Store the results back into the C container
- cPtr += 4;
+ _mm256_store_ps((float*)cPtr,
+ cVal1); // Store the results back into the C container
+ cPtr += 4;
- _mm256_store_ps((float*)cPtr,cVal2); // Store the results back into the C container
- cPtr += 4;
- }
+ _mm256_store_ps((float*)cPtr,
+ cVal2); // Store the results back into the C container
+ cPtr += 4;
+ }
- number = eighthPoints * 8;
- for(;number < num_points; ++number){
- *cPtr++ = (*aPtr++) * (*bPtr++);
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; ++number) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
- const float* bPtr= bVector;
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const float* bPtr = bVector;
- __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal;
- for(;number < quarterPoints; number++){
+ __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal;
+ for (; number < quarterPoints; number++) {
- aVal1 = _mm_load_ps((const float*)aPtr);
- aPtr += 2;
+ aVal1 = _mm_load_ps((const float*)aPtr);
+ aPtr += 2;
- aVal2 = _mm_load_ps((const float*)aPtr);
- aPtr += 2;
+ aVal2 = _mm_load_ps((const float*)aPtr);
+ aPtr += 2;
- bVal = _mm_load_ps(bPtr);
- bPtr += 4;
+ bVal = _mm_load_ps(bPtr);
+ bPtr += 4;
- bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1,1,0,0));
- bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3,3,2,2));
+ bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1, 1, 0, 0));
+ bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3, 3, 2, 2));
- cVal = _mm_mul_ps(aVal1, bVal1);
+ cVal = _mm_mul_ps(aVal1, bVal1);
- _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container
- cPtr += 2;
+ _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container
+ cPtr += 2;
- cVal = _mm_mul_ps(aVal2, bVal2);
+ cVal = _mm_mul_ps(aVal2, bVal2);
- _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container
+ _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container
- cPtr += 2;
- }
+ cPtr += 2;
+ }
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) * (*bPtr);
- bPtr++;
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr);
+ bPtr++;
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
- const float* bPtr= bVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *cPtr++ = (*aPtr++) * (*bPtr++);
- }
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const float* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_32fc_32f_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32fc_32f_multiply_32fc_neon(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
- const float* bPtr= bVector;
- unsigned int number = 0;
- unsigned int quarter_points = num_points / 4;
-
- float32x4x2_t inputVector, outputVector;
- float32x4_t tapsVector;
- for(number = 0; number < quarter_points; number++){
- inputVector = vld2q_f32((float*)aPtr);
- tapsVector = vld1q_f32(bPtr);
-
- outputVector.val[0] = vmulq_f32(inputVector.val[0], tapsVector);
- outputVector.val[1] = vmulq_f32(inputVector.val[1], tapsVector);
-
- vst2q_f32((float*)cPtr, outputVector);
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
-
- for(number = quarter_points * 4; number < num_points; number++){
- *cPtr++ = (*aPtr++) * (*bPtr++);
- }
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const float* bPtr = bVector;
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+
+ float32x4x2_t inputVector, outputVector;
+ float32x4_t tapsVector;
+ for (number = 0; number < quarter_points; number++) {
+ inputVector = vld2q_f32((float*)aPtr);
+ tapsVector = vld1q_f32(bPtr);
+
+ outputVector.val[0] = vmulq_f32(inputVector.val[0], tapsVector);
+ outputVector.val[1] = vmulq_f32(inputVector.val[1], tapsVector);
+
+ vst2q_f32((float*)cPtr, outputVector);
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_ORC
-extern void
-volk_32fc_32f_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const float* bVector, unsigned int num_points);
+extern void volk_32fc_32f_multiply_32fc_a_orc_impl(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const float* bVector,
+ unsigned int num_points);
-static inline void
-volk_32fc_32f_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const float* bVector, unsigned int num_points)
+static inline void volk_32fc_32f_multiply_32fc_u_orc(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const float* bVector,
+ unsigned int num_points)
{
- volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
+ volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_GENERIC */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32fc_conjugate_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
- * \endcode
+ * void volk_32fc_conjugate_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned
+ * int num_points) \endcode
*
* \b Inputs
* \li aVector: The input vector of complex floats.
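The SSE3 and AVX kernels below conjugate by XOR-ing a mask built from -0.0f into every imaginary lane. A scalar sketch of that sign-bit trick, assuming IEEE-754 single-precision floats and shown for illustration only:

#include <stdint.h>
#include <string.h>

static float flip_sign_bit(float x)
{
    const float neg_zero = -0.0f; // only the sign bit is set in -0.0f
    uint32_t bits, sign_mask;
    memcpy(&bits, &x, sizeof(bits));
    memcpy(&sign_mask, &neg_zero, sizeof(sign_mask));
    bits ^= sign_mask; // toggle the sign, leave the magnitude untouched
    memcpy(&x, &bits, sizeof(x));
    return x; // applied to the imaginary part, this is exactly lv_conj()
}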
#ifndef INCLUDED_volk_32fc_conjugate_32fc_u_H
#define INCLUDED_volk_32fc_conjugate_32fc_u_H
+#include <float.h>
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>
-#include <float.h>
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32fc_conjugate_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
+static inline void volk_32fc_conjugate_32fc_u_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- __m256 x;
- lv_32fc_t* c = cVector;
- const lv_32fc_t* a = aVector;
+ __m256 x;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
- __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
+ __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
- for(;number < quarterPoints; number++){
+ for (; number < quarterPoints; number++) {
- x = _mm256_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
+ x = _mm256_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
- x = _mm256_xor_ps(x, conjugator); // conjugate register
+ x = _mm256_xor_ps(x, conjugator); // conjugate register
- _mm256_storeu_ps((float*)c,x); // Store the results back into the C container
+ _mm256_storeu_ps((float*)c, x); // Store the results back into the C container
- a += 4;
- c += 4;
- }
+ a += 4;
+ c += 4;
+ }
- number = quarterPoints * 4;
+ number = quarterPoints * 4;
- for(;number < num_points; number++) {
- *c++ = lv_conj(*a++);
- }
+ for (; number < num_points; number++) {
+ *c++ = lv_conj(*a++);
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
-static inline void
-volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
+static inline void volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int halfPoints = num_points / 2;
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
- __m128 x;
- lv_32fc_t* c = cVector;
- const lv_32fc_t* a = aVector;
+ __m128 x;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
- __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
+ __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
- for(;number < halfPoints; number++){
+ for (; number < halfPoints; number++) {
- x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
+ x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
- x = _mm_xor_ps(x, conjugator); // conjugate register
+ x = _mm_xor_ps(x, conjugator); // conjugate register
- _mm_storeu_ps((float*)c,x); // Store the results back into the C container
+ _mm_storeu_ps((float*)c, x); // Store the results back into the C container
- a += 2;
- c += 2;
- }
+ a += 2;
+ c += 2;
+ }
- if((num_points % 2) != 0) {
- *c = lv_conj(*a);
- }
+ if ((num_points % 2) != 0) {
+ *c = lv_conj(*a);
+ }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
+static inline void volk_32fc_conjugate_32fc_generic(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ unsigned int num_points)
{
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
- unsigned int number = 0;
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ unsigned int number = 0;
- for(number = 0; number < num_points; number++){
- *cPtr++ = lv_conj(*aPtr++);
- }
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = lv_conj(*aPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifndef INCLUDED_volk_32fc_conjugate_32fc_a_H
#define INCLUDED_volk_32fc_conjugate_32fc_a_H
+#include <float.h>
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>
-#include <float.h>
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32fc_conjugate_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
+static inline void volk_32fc_conjugate_32fc_a_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- __m256 x;
- lv_32fc_t* c = cVector;
- const lv_32fc_t* a = aVector;
+ __m256 x;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
- __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
+ __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
- for(;number < quarterPoints; number++){
+ for (; number < quarterPoints; number++) {
- x = _mm256_load_ps((float*)a); // Load the complex data as ar,ai,br,bi
+ x = _mm256_load_ps((float*)a); // Load the complex data as ar,ai,br,bi
- x = _mm256_xor_ps(x, conjugator); // conjugate register
+ x = _mm256_xor_ps(x, conjugator); // conjugate register
- _mm256_store_ps((float*)c,x); // Store the results back into the C container
+ _mm256_store_ps((float*)c, x); // Store the results back into the C container
- a += 4;
- c += 4;
- }
+ a += 4;
+ c += 4;
+ }
- number = quarterPoints * 4;
+ number = quarterPoints * 4;
- for(;number < num_points; number++) {
- *c++ = lv_conj(*a++);
- }
+ for (; number < num_points; number++) {
+ *c++ = lv_conj(*a++);
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
-static inline void
-volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
+static inline void volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int halfPoints = num_points / 2;
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
- __m128 x;
- lv_32fc_t* c = cVector;
- const lv_32fc_t* a = aVector;
+ __m128 x;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
- __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
+ __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
- for(;number < halfPoints; number++){
+ for (; number < halfPoints; number++) {
- x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi
+ x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi
- x = _mm_xor_ps(x, conjugator); // conjugate register
+ x = _mm_xor_ps(x, conjugator); // conjugate register
- _mm_store_ps((float*)c,x); // Store the results back into the C container
+ _mm_store_ps((float*)c, x); // Store the results back into the C container
- a += 2;
- c += 2;
- }
+ a += 2;
+ c += 2;
+ }
- if((num_points % 2) != 0) {
- *c = lv_conj(*a);
- }
+ if ((num_points % 2) != 0) {
+ *c = lv_conj(*a);
+ }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_32fc_conjugate_32fc_a_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
+static inline void volk_32fc_conjugate_32fc_a_neon(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ unsigned int num_points)
{
- unsigned int number;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number;
+ const unsigned int quarterPoints = num_points / 4;
- float32x4x2_t x;
- lv_32fc_t* c = cVector;
- const lv_32fc_t* a = aVector;
+ float32x4x2_t x;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
- for(number=0; number < quarterPoints; number++){
- __VOLK_PREFETCH(a+4);
- x = vld2q_f32((float*)a); // Load the complex data as ar,br,cr,dr; ai,bi,ci,di
+ for (number = 0; number < quarterPoints; number++) {
+ __VOLK_PREFETCH(a + 4);
+ x = vld2q_f32((float*)a); // Load the complex data as ar,br,cr,dr; ai,bi,ci,di
- // xor the imaginary lane
- x.val[1] = vnegq_f32( x.val[1]);
+        // negate the imaginary lane (flip the sign bit, as the SSE/AVX xor does)
+ x.val[1] = vnegq_f32(x.val[1]);
- vst2q_f32((float*)c,x); // Store the results back into the C container
+ vst2q_f32((float*)c, x); // Store the results back into the C container
- a += 4;
- c += 4;
- }
+ a += 4;
+ c += 4;
+ }
- for(number=quarterPoints*4; number < num_points; number++){
- *c++ = lv_conj(*a++);
- }
+ for (number = quarterPoints * 4; number < num_points; number++) {
+ *c++ = lv_conj(*a++);
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
+static inline void volk_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ unsigned int num_points)
{
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
- unsigned int number = 0;
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ unsigned int number = 0;
- for(number = 0; number < num_points; number++){
- *cPtr++ = lv_conj(*aPtr++);
- }
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = lv_conj(*aPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
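/* A minimal usage sketch for the conjugate kernels above, assuming the VOLK helpers
 * volk_malloc()/volk_free(), volk_get_alignment() and the lv_cmake() constructor from
 * volk_complex.h are available:
 * \code
 * unsigned int N = 16;
 * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), volk_get_alignment());
 * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), volk_get_alignment());
 * for (unsigned int n = 0; n < N; n++)
 *     in[n] = lv_cmake((float)n, (float)n); // arbitrary test data
 * volk_32fc_conjugate_32fc_a_generic(out, in, N); // out[n] = conj(in[n])
 * volk_free(in);
 * volk_free(out);
 * \endcode
 */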
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32fc_convert_16ic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points);
- * \endcode
+ * void volk_32fc_convert_16ic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector,
+ * unsigned int num_points); \endcode
*
* \b Inputs
* \li inputVector: The complex 32-bit float input data buffer.
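 *
 * A minimal usage sketch, assuming the dispatcher volk_32fc_convert_16ic and the
 * VOLK helpers volk_malloc()/volk_free() and volk_get_alignment():
 * \code
 * unsigned int N = 128;
 * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), volk_get_alignment());
 * lv_16sc_t* out = (lv_16sc_t*)volk_malloc(N * sizeof(lv_16sc_t), volk_get_alignment());
 * // ... fill in[] with samples already scaled to the int16 range ...
 * volk_32fc_convert_16ic(out, in, N); // out-of-range values are clipped
 * volk_free(in);
 * volk_free(out);
 * \endcode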
#ifndef INCLUDED_volk_32fc_convert_16ic_a_H
#define INCLUDED_volk_32fc_convert_16ic_a_H
+#include "volk/volk_complex.h"
#include <limits.h>
#include <math.h>
-#include "volk/volk_complex.h"
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void volk_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
+static inline void volk_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector,
+ const lv_32fc_t* inputVector,
+ unsigned int num_points)
{
const unsigned int avx_iters = num_points / 8;
const __m256 vmax_val = _mm256_set1_ps(max_val);
unsigned int i;
- for(i = 0; i < avx_iters; i++)
- {
- inputVal1 = _mm256_load_ps((float*)inputVectorPtr);
- inputVectorPtr += 8;
- inputVal2 = _mm256_load_ps((float*)inputVectorPtr);
- inputVectorPtr += 8;
- __VOLK_PREFETCH(inputVectorPtr + 16);
-
- // Clip
- ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
- ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
-
- intInputVal1 = _mm256_cvtps_epi32(ret1);
- intInputVal2 = _mm256_cvtps_epi32(ret2);
-
- intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
- intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
-
- _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
- outputVectorPtr += 16;
- }
-
- for(i = avx_iters * 16; i < num_points * 2; i++)
- {
- aux = *inputVectorPtr++;
- if(aux > max_val)
- aux = max_val;
- else if(aux < min_val)
- aux = min_val;
- *outputVectorPtr++ = (int16_t)rintf(aux);
- }
+ for (i = 0; i < avx_iters; i++) {
+ inputVal1 = _mm256_load_ps((float*)inputVectorPtr);
+ inputVectorPtr += 8;
+ inputVal2 = _mm256_load_ps((float*)inputVectorPtr);
+ inputVectorPtr += 8;
+ __VOLK_PREFETCH(inputVectorPtr + 16);
+
+ // Clip
+ ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
+ ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
+
+ intInputVal1 = _mm256_cvtps_epi32(ret1);
+ intInputVal2 = _mm256_cvtps_epi32(ret2);
+
+ intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
+ intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
+
+ _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 16;
+ }
+
+ for (i = avx_iters * 16; i < num_points * 2; i++) {
+ aux = *inputVectorPtr++;
+ if (aux > max_val)
+ aux = max_val;
+ else if (aux < min_val)
+ aux = min_val;
+ *outputVectorPtr++ = (int16_t)rintf(aux);
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
+static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector,
+ const lv_32fc_t* inputVector,
+ unsigned int num_points)
{
const unsigned int sse_iters = num_points / 4;
const __m128 vmax_val = _mm_set_ps1(max_val);
unsigned int i;
- for(i = 0; i < sse_iters; i++)
- {
- inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
- inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
- __VOLK_PREFETCH(inputVectorPtr + 8);
-
- // Clip
- ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
- ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
-
- intInputVal1 = _mm_cvtps_epi32(ret1);
- intInputVal2 = _mm_cvtps_epi32(ret2);
-
- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
-
- _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
- outputVectorPtr += 8;
- }
-
- for(i = sse_iters * 8; i < num_points * 2; i++)
- {
- aux = *inputVectorPtr++;
- if(aux > max_val)
- aux = max_val;
- else if(aux < min_val)
- aux = min_val;
- *outputVectorPtr++ = (int16_t)rintf(aux);
- }
+ for (i = 0; i < sse_iters; i++) {
+ inputVal1 = _mm_load_ps((float*)inputVectorPtr);
+ inputVectorPtr += 4;
+ inputVal2 = _mm_load_ps((float*)inputVectorPtr);
+ inputVectorPtr += 4;
+ __VOLK_PREFETCH(inputVectorPtr + 8);
+
+ // Clip
+ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
+ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
+
+ intInputVal1 = _mm_cvtps_epi32(ret1);
+ intInputVal2 = _mm_cvtps_epi32(ret2);
+
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 8;
+ }
+
+ for (i = sse_iters * 8; i < num_points * 2; i++) {
+ aux = *inputVectorPtr++;
+ if (aux > max_val)
+ aux = max_val;
+ else if (aux < min_val)
+ aux = min_val;
+ *outputVectorPtr++ = (int16_t)rintf(aux);
+ }
}
#endif /* LV_HAVE_SSE2 */
#if LV_HAVE_NEONV7
#include <arm_neon.h>
-#define VCVTRQ_S32_F32(res,val) \
- __VOLK_ASM ("VCVTR.S32.F32 %[r0], %[v0]\n\t" : [r0]"=w"(res[0]) : [v0]"w"(val[0]) : ); \
- __VOLK_ASM ("VCVTR.S32.F32 %[r1], %[v1]\n\t" : [r1]"=w"(res[1]) : [v1]"w"(val[1]) : ); \
- __VOLK_ASM ("VCVTR.S32.F32 %[r2], %[v2]\n\t" : [r2]"=w"(res[2]) : [v2]"w"(val[2]) : ); \
- __VOLK_ASM ("VCVTR.S32.F32 %[r3], %[v3]\n\t" : [r3]"=w"(res[3]) : [v3]"w"(val[3]) : );
-
-static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
+#define VCVTRQ_S32_F32(res, val) \
+ __VOLK_ASM("VCVTR.S32.F32 %[r0], %[v0]\n\t" \
+ : [r0] "=w"(res[0]) \
+ : [v0] "w"(val[0]) \
+ :); \
+ __VOLK_ASM("VCVTR.S32.F32 %[r1], %[v1]\n\t" \
+ : [r1] "=w"(res[1]) \
+ : [v1] "w"(val[1]) \
+ :); \
+ __VOLK_ASM("VCVTR.S32.F32 %[r2], %[v2]\n\t" \
+ : [r2] "=w"(res[2]) \
+ : [v2] "w"(val[2]) \
+ :); \
+ __VOLK_ASM("VCVTR.S32.F32 %[r3], %[v3]\n\t" : [r3] "=w"(res[3]) : [v3] "w"(val[3]) :);
+
+static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector,
+ const lv_32fc_t* inputVector,
+ unsigned int num_points)
{
const unsigned int neon_iters = num_points / 4;
const float32x4_t max_val = vmovq_n_f32(max_val_f);
float32x4_t ret1, ret2, a, b;
- int32x4_t toint_a={0,0,0,0};
- int32x4_t toint_b={0,0,0,0};
+ int32x4_t toint_a = { 0, 0, 0, 0 };
+ int32x4_t toint_b = { 0, 0, 0, 0 };
int16x4_t intInputVal1, intInputVal2;
int16x8_t res;
- for(i = 0; i < neon_iters; i++)
- {
- a = vld1q_f32((const float32_t*)(inputVectorPtr));
- inputVectorPtr += 4;
- b = vld1q_f32((const float32_t*)(inputVectorPtr));
- inputVectorPtr += 4;
- __VOLK_PREFETCH(inputVectorPtr + 8);
-
- ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
- ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
-
- // vcvtr takes into account the current rounding mode (as does rintf)
- VCVTRQ_S32_F32(toint_a, ret1);
- VCVTRQ_S32_F32(toint_b, ret2);
-
- intInputVal1 = vqmovn_s32(toint_a);
- intInputVal2 = vqmovn_s32(toint_b);
-
- res = vcombine_s16(intInputVal1, intInputVal2);
- vst1q_s16((int16_t*)outputVectorPtr, res);
- outputVectorPtr += 8;
- }
-
- for(i = neon_iters * 8; i < num_points * 2; i++)
- {
- aux = *inputVectorPtr++;
- if(aux > max_val_f)
- aux = max_val_f;
- else if(aux < min_val_f)
- aux = min_val_f;
- *outputVectorPtr++ = (int16_t)rintf(aux);
- }
+ for (i = 0; i < neon_iters; i++) {
+ a = vld1q_f32((const float32_t*)(inputVectorPtr));
+ inputVectorPtr += 4;
+ b = vld1q_f32((const float32_t*)(inputVectorPtr));
+ inputVectorPtr += 4;
+ __VOLK_PREFETCH(inputVectorPtr + 8);
+
+ ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
+ ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
+
+ // vcvtr takes into account the current rounding mode (as does rintf)
+ VCVTRQ_S32_F32(toint_a, ret1);
+ VCVTRQ_S32_F32(toint_b, ret2);
+
+ intInputVal1 = vqmovn_s32(toint_a);
+ intInputVal2 = vqmovn_s32(toint_b);
+
+ res = vcombine_s16(intInputVal1, intInputVal2);
+ vst1q_s16((int16_t*)outputVectorPtr, res);
+ outputVectorPtr += 8;
+ }
+
+ for (i = neon_iters * 8; i < num_points * 2; i++) {
+ aux = *inputVectorPtr++;
+ if (aux > max_val_f)
+ aux = max_val_f;
+ else if (aux < min_val_f)
+ aux = min_val_f;
+ *outputVectorPtr++ = (int16_t)rintf(aux);
+ }
}
#undef VCVTRQ_S32_F32
#if LV_HAVE_NEONV8
#include <arm_neon.h>
-static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
+static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector,
+ const lv_32fc_t* inputVector,
+ unsigned int num_points)
{
const unsigned int neon_iters = num_points / 4;
const float32x4_t max_val = vmovq_n_f32(max_val_f);
float32x4_t ret1, ret2, a, b;
- int32x4_t toint_a={0,0,0,0}, toint_b={0,0,0,0};
+ int32x4_t toint_a = { 0, 0, 0, 0 }, toint_b = { 0, 0, 0, 0 };
int16x4_t intInputVal1, intInputVal2;
int16x8_t res;
- for(i = 0; i < neon_iters; i++)
- {
- a = vld1q_f32((const float32_t*)(inputVectorPtr));
- inputVectorPtr += 4;
- b = vld1q_f32((const float32_t*)(inputVectorPtr));
- inputVectorPtr += 4;
- __VOLK_PREFETCH(inputVectorPtr + 8);
-
- ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
- ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
-
- // vrndiq takes into account the current rounding mode (as does rintf)
- toint_a = vcvtq_s32_f32(vrndiq_f32(ret1));
- toint_b = vcvtq_s32_f32(vrndiq_f32(ret2));
-
- intInputVal1 = vqmovn_s32(toint_a);
- intInputVal2 = vqmovn_s32(toint_b);
-
- res = vcombine_s16(intInputVal1, intInputVal2);
- vst1q_s16((int16_t*)outputVectorPtr, res);
- outputVectorPtr += 8;
- }
-
- for(i = neon_iters * 8; i < num_points * 2; i++)
- {
- aux = *inputVectorPtr++;
- if(aux > max_val_f)
- aux = max_val_f;
- else if(aux < min_val_f)
- aux = min_val_f;
- *outputVectorPtr++ = (int16_t)rintf(aux);
- }
+ for (i = 0; i < neon_iters; i++) {
+ a = vld1q_f32((const float32_t*)(inputVectorPtr));
+ inputVectorPtr += 4;
+ b = vld1q_f32((const float32_t*)(inputVectorPtr));
+ inputVectorPtr += 4;
+ __VOLK_PREFETCH(inputVectorPtr + 8);
+
+ ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
+ ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
+
+ // vrndiq takes into account the current rounding mode (as does rintf)
+ toint_a = vcvtq_s32_f32(vrndiq_f32(ret1));
+ toint_b = vcvtq_s32_f32(vrndiq_f32(ret2));
+
+ intInputVal1 = vqmovn_s32(toint_a);
+ intInputVal2 = vqmovn_s32(toint_b);
+
+ res = vcombine_s16(intInputVal1, intInputVal2);
+ vst1q_s16((int16_t*)outputVectorPtr, res);
+ outputVectorPtr += 8;
+ }
+
+ for (i = neon_iters * 8; i < num_points * 2; i++) {
+ aux = *inputVectorPtr++;
+ if (aux > max_val_f)
+ aux = max_val_f;
+ else if (aux < min_val_f)
+ aux = min_val_f;
+ *outputVectorPtr++ = (int16_t)rintf(aux);
+ }
}
#endif /* LV_HAVE_NEONV8 */
-
#ifdef LV_HAVE_GENERIC
-static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
+static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector,
+ const lv_32fc_t* inputVector,
+ unsigned int num_points)
{
float* inputVectorPtr = (float*)inputVector;
int16_t* outputVectorPtr = (int16_t*)outputVector;
const float max_val = (float)SHRT_MAX;
float aux;
unsigned int i;
- for(i = 0; i < num_points * 2; i++)
- {
- aux = *inputVectorPtr++;
- if(aux > max_val)
- aux = max_val;
- else if(aux < min_val)
- aux = min_val;
- *outputVectorPtr++ = (int16_t)rintf(aux);
- }
+ for (i = 0; i < num_points * 2; i++) {
+ aux = *inputVectorPtr++;
+ if (aux > max_val)
+ aux = max_val;
+ else if (aux < min_val)
+ aux = min_val;
+ *outputVectorPtr++ = (int16_t)rintf(aux);
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifndef INCLUDED_volk_32fc_convert_16ic_u_H
#define INCLUDED_volk_32fc_convert_16ic_u_H
+#include "volk/volk_complex.h"
#include <limits.h>
#include <math.h>
-#include "volk/volk_complex.h"
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
+static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector,
+ const lv_32fc_t* inputVector,
+ unsigned int num_points)
{
const unsigned int avx_iters = num_points / 8;
const __m256 vmax_val = _mm256_set1_ps(max_val);
unsigned int i;
- for(i = 0; i < avx_iters; i++)
- {
- inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr);
- inputVectorPtr += 8;
- inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr);
- inputVectorPtr += 8;
- __VOLK_PREFETCH(inputVectorPtr + 16);
-
- // Clip
- ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
- ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
-
- intInputVal1 = _mm256_cvtps_epi32(ret1);
- intInputVal2 = _mm256_cvtps_epi32(ret2);
-
- intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
- intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
-
- _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
- outputVectorPtr += 16;
- }
-
- for(i = avx_iters * 16; i < num_points * 2; i++)
- {
- aux = *inputVectorPtr++;
- if(aux > max_val)
- aux = max_val;
- else if(aux < min_val)
- aux = min_val;
- *outputVectorPtr++ = (int16_t)rintf(aux);
- }
+ for (i = 0; i < avx_iters; i++) {
+ inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr);
+ inputVectorPtr += 8;
+ inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr);
+ inputVectorPtr += 8;
+ __VOLK_PREFETCH(inputVectorPtr + 16);
+
+ // Clip
+ ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
+ ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
+
+ intInputVal1 = _mm256_cvtps_epi32(ret1);
+ intInputVal2 = _mm256_cvtps_epi32(ret2);
+
+ intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
+ intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
+
+ _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 16;
+ }
+
+ for (i = avx_iters * 16; i < num_points * 2; i++) {
+ aux = *inputVectorPtr++;
+ if (aux > max_val)
+ aux = max_val;
+ else if (aux < min_val)
+ aux = min_val;
+ *outputVectorPtr++ = (int16_t)rintf(aux);
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
+static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector,
+ const lv_32fc_t* inputVector,
+ unsigned int num_points)
{
const unsigned int sse_iters = num_points / 4;
const __m128 vmax_val = _mm_set_ps1(max_val);
unsigned int i;
- for(i = 0; i < sse_iters; i++)
- {
- inputVal1 = _mm_loadu_ps((float*)inputVectorPtr);
- inputVectorPtr += 4;
- inputVal2 = _mm_loadu_ps((float*)inputVectorPtr);
- inputVectorPtr += 4;
- __VOLK_PREFETCH(inputVectorPtr + 8);
-
- // Clip
- ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
- ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
-
- intInputVal1 = _mm_cvtps_epi32(ret1);
- intInputVal2 = _mm_cvtps_epi32(ret2);
-
- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
-
- _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
- outputVectorPtr += 8;
- }
-
- for(i = sse_iters * 8; i < num_points * 2; i++)
- {
- aux = *inputVectorPtr++;
- if(aux > max_val)
- aux = max_val;
- else if(aux < min_val)
- aux = min_val;
- *outputVectorPtr++ = (int16_t)rintf(aux);
- }
+ for (i = 0; i < sse_iters; i++) {
+ inputVal1 = _mm_loadu_ps((float*)inputVectorPtr);
+ inputVectorPtr += 4;
+ inputVal2 = _mm_loadu_ps((float*)inputVectorPtr);
+ inputVectorPtr += 4;
+ __VOLK_PREFETCH(inputVectorPtr + 8);
+
+ // Clip
+ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
+ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
+
+ intInputVal1 = _mm_cvtps_epi32(ret1);
+ intInputVal2 = _mm_cvtps_epi32(ret2);
+
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 8;
+ }
+
+ for (i = sse_iters * 8; i < num_points * 2; i++) {
+ aux = *inputVectorPtr++;
+ if (aux > max_val)
+ aux = max_val;
+ else if (aux < min_val)
+ aux = min_val;
+ *outputVectorPtr++ = (int16_t)rintf(aux);
+ }
}
#endif /* LV_HAVE_SSE2 */
#endif /* INCLUDED_volk_32fc_convert_16ic_u_H */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32fc_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points)
- * \endcode
+ * void volk_32fc_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_32fc_t*
+ * complexVector, unsigned int num_points) \endcode
*
* \b Inputs
* \li complexVector: The complex input vector.
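 *
 * A minimal usage sketch, assuming the dispatcher volk_32fc_deinterleave_32f_x2 and
 * the VOLK helpers volk_malloc()/volk_free() and volk_get_alignment():
 * \code
 * unsigned int N = 64;
 * lv_32fc_t* cpx = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), volk_get_alignment());
 * float* i_part = (float*)volk_malloc(N * sizeof(float), volk_get_alignment());
 * float* q_part = (float*)volk_malloc(N * sizeof(float), volk_get_alignment());
 * // ... fill cpx[] ...
 * volk_32fc_deinterleave_32f_x2(i_part, q_part, cpx, N);
 * volk_free(cpx);
 * volk_free(i_part);
 * volk_free(q_part);
 * \endcode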
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32fc_deinterleave_32f_x2_a_avx(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_deinterleave_32f_x2_a_avx(float* iBuffer,
+ float* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- const float* complexVectorPtr = (float*)complexVector;
- float* iBufferPtr = iBuffer;
- float* qBufferPtr = qBuffer;
-
- unsigned int number = 0;
- // Mask for real and imaginary parts
- const unsigned int eighthPoints = num_points / 8;
- __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue;
- for(;number < eighthPoints; number++){
- cplxValue1 = _mm256_load_ps(complexVectorPtr);
- complexVectorPtr += 8;
-
- cplxValue2 = _mm256_load_ps(complexVectorPtr);
- complexVectorPtr += 8;
-
- complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
- complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
-
- // Arrange in i1i2i3i4 format
- iValue = _mm256_shuffle_ps(complex1, complex2, 0x88);
- // Arrange in q1q2q3q4 format
- qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
-
- _mm256_store_ps(iBufferPtr, iValue);
- _mm256_store_ps(qBufferPtr, qValue);
-
- iBufferPtr += 8;
- qBufferPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- *iBufferPtr++ = *complexVectorPtr++;
- *qBufferPtr++ = *complexVectorPtr++;
- }
+ const float* complexVectorPtr = (float*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+
+ unsigned int number = 0;
+ // Mask for real and imaginary parts
+ const unsigned int eighthPoints = num_points / 8;
+ __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue;
+ for (; number < eighthPoints; number++) {
+ cplxValue1 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue2 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm256_shuffle_ps(complex1, complex2, 0x88);
+ // Arrange in q1q2q3q4 format
+ qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
+
+ _mm256_store_ps(iBufferPtr, iValue);
+ _mm256_store_ps(qBufferPtr, qValue);
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer,
+ float* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- const float* complexVectorPtr = (float*)complexVector;
- float* iBufferPtr = iBuffer;
- float* qBufferPtr = qBuffer;
-
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
- __m128 cplxValue1, cplxValue2, iValue, qValue;
- for(;number < quarterPoints; number++){
- cplxValue1 = _mm_load_ps(complexVectorPtr);
- complexVectorPtr += 4;
-
- cplxValue2 = _mm_load_ps(complexVectorPtr);
- complexVectorPtr += 4;
-
- // Arrange in i1i2i3i4 format
- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
- // Arrange in q1q2q3q4 format
- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-
- _mm_store_ps(iBufferPtr, iValue);
- _mm_store_ps(qBufferPtr, qValue);
-
- iBufferPtr += 4;
- qBufferPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- *iBufferPtr++ = *complexVectorPtr++;
- *qBufferPtr++ = *complexVectorPtr++;
- }
+ const float* complexVectorPtr = (float*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 cplxValue1, cplxValue2, iValue, qValue;
+ for (; number < quarterPoints; number++) {
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+ // Arrange in q1q2q3q4 format
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
+
+ _mm_store_ps(iBufferPtr, iValue);
+ _mm_store_ps(qBufferPtr, qValue);
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_32fc_deinterleave_32f_x2_neon(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_deinterleave_32f_x2_neon(float* iBuffer,
+ float* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- unsigned int quarter_points = num_points / 4;
- const float* complexVectorPtr = (float*)complexVector;
- float* iBufferPtr = iBuffer;
- float* qBufferPtr = qBuffer;
- float32x4x2_t complexInput;
-
- for(number = 0; number < quarter_points; number++){
- complexInput = vld2q_f32(complexVectorPtr);
- vst1q_f32( iBufferPtr, complexInput.val[0] );
- vst1q_f32( qBufferPtr, complexInput.val[1] );
- complexVectorPtr += 8;
- iBufferPtr += 4;
- qBufferPtr += 4;
- }
-
- for(number = quarter_points*4; number < num_points; number++){
- *iBufferPtr++ = *complexVectorPtr++;
- *qBufferPtr++ = *complexVectorPtr++;
- }
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+ const float* complexVectorPtr = (float*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+ float32x4x2_t complexInput;
+
+ for (number = 0; number < quarter_points; number++) {
+ complexInput = vld2q_f32(complexVectorPtr);
+ vst1q_f32(iBufferPtr, complexInput.val[0]);
+ vst1q_f32(qBufferPtr, complexInput.val[1]);
+ complexVectorPtr += 8;
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32fc_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_deinterleave_32f_x2_generic(float* iBuffer,
+ float* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- const float* complexVectorPtr = (float*)complexVector;
- float* iBufferPtr = iBuffer;
- float* qBufferPtr = qBuffer;
- unsigned int number;
- for(number = 0; number < num_points; number++){
- *iBufferPtr++ = *complexVectorPtr++;
- *qBufferPtr++ = *complexVectorPtr++;
- }
+ const float* complexVectorPtr = (float*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+ unsigned int number;
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32fc_deinterleave_32f_x2_u_avx(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_deinterleave_32f_x2_u_avx(float* iBuffer,
+ float* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- const float* complexVectorPtr = (float*)complexVector;
- float* iBufferPtr = iBuffer;
- float* qBufferPtr = qBuffer;
-
- unsigned int number = 0;
- // Mask for real and imaginary parts
- const unsigned int eighthPoints = num_points / 8;
- __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue;
- for(;number < eighthPoints; number++){
- cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
- complexVectorPtr += 8;
-
- cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
- complexVectorPtr += 8;
-
- complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
- complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
-
- // Arrange in i1i2i3i4 format
- iValue = _mm256_shuffle_ps(complex1, complex2, 0x88);
- // Arrange in q1q2q3q4 format
- qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
-
- _mm256_storeu_ps(iBufferPtr, iValue);
- _mm256_storeu_ps(qBufferPtr, qValue);
-
- iBufferPtr += 8;
- qBufferPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- *iBufferPtr++ = *complexVectorPtr++;
- *qBufferPtr++ = *complexVectorPtr++;
- }
+ const float* complexVectorPtr = (float*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+
+ unsigned int number = 0;
+ // Mask for real and imaginary parts
+ const unsigned int eighthPoints = num_points / 8;
+ __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue;
+ for (; number < eighthPoints; number++) {
+ cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm256_shuffle_ps(complex1, complex2, 0x88);
+ // Arrange in q1q2q3q4 format
+ qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
+
+ _mm256_storeu_ps(iBufferPtr, iValue);
+ _mm256_storeu_ps(qBufferPtr, qValue);
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_AVX */
#endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_u_H */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32fc_deinterleave_64f_x2_u_avx(double *iBuffer, double *qBuffer,
- const lv_32fc_t *complexVector,
- unsigned int num_points) {
- unsigned int number = 0;
-
- const float *complexVectorPtr = (float *)complexVector;
- double *iBufferPtr = iBuffer;
- double *qBufferPtr = qBuffer;
-
- const unsigned int quarterPoints = num_points / 4;
- __m256 cplxValue;
- __m128 complexH, complexL, fVal;
- __m256d dVal;
-
- for (; number < quarterPoints; number++) {
-
- cplxValue = _mm256_loadu_ps(complexVectorPtr);
- complexVectorPtr += 8;
-
- complexH = _mm256_extractf128_ps(cplxValue, 1);
- complexL = _mm256_extractf128_ps(cplxValue, 0);
-
- // Arrange in i1i2i1i2 format
- fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2, 0, 2, 0));
- dVal = _mm256_cvtps_pd(fVal);
- _mm256_storeu_pd(iBufferPtr, dVal);
-
- // Arrange in q1q2q1q2 format
- fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3, 1, 3, 1));
- dVal = _mm256_cvtps_pd(fVal);
- _mm256_storeu_pd(qBufferPtr, dVal);
-
- iBufferPtr += 4;
- qBufferPtr += 4;
- }
-
- number = quarterPoints * 4;
- for (; number < num_points; number++) {
- *iBufferPtr++ = *complexVectorPtr++;
- *qBufferPtr++ = *complexVectorPtr++;
- }
+static inline void volk_32fc_deinterleave_64f_x2_u_avx(double* iBuffer,
+ double* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+ double* qBufferPtr = qBuffer;
+
+ const unsigned int quarterPoints = num_points / 4;
+ __m256 cplxValue;
+ __m128 complexH, complexL, fVal;
+ __m256d dVal;
+
+ for (; number < quarterPoints; number++) {
+
+ cplxValue = _mm256_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ complexH = _mm256_extractf128_ps(cplxValue, 1);
+ complexL = _mm256_extractf128_ps(cplxValue, 0);
+
+ // Arrange in i1i2i1i2 format
+ fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2, 0, 2, 0));
+ dVal = _mm256_cvtps_pd(fVal);
+ _mm256_storeu_pd(iBufferPtr, dVal);
+
+ // Arrange in q1q2q1q2 format
+ fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3, 1, 3, 1));
+ dVal = _mm256_cvtps_pd(fVal);
+ _mm256_storeu_pd(qBufferPtr, dVal);
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void
-volk_32fc_deinterleave_64f_x2_u_sse2(double *iBuffer, double *qBuffer,
- const lv_32fc_t *complexVector,
- unsigned int num_points) {
- unsigned int number = 0;
-
- const float *complexVectorPtr = (float *)complexVector;
- double *iBufferPtr = iBuffer;
- double *qBufferPtr = qBuffer;
-
- const unsigned int halfPoints = num_points / 2;
- __m128 cplxValue, fVal;
- __m128d dVal;
-
- for (; number < halfPoints; number++) {
-
- cplxValue = _mm_loadu_ps(complexVectorPtr);
- complexVectorPtr += 4;
-
- // Arrange in i1i2i1i2 format
- fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0));
- dVal = _mm_cvtps_pd(fVal);
- _mm_storeu_pd(iBufferPtr, dVal);
-
- // Arrange in q1q2q1q2 format
- fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3, 1, 3, 1));
- dVal = _mm_cvtps_pd(fVal);
- _mm_storeu_pd(qBufferPtr, dVal);
-
- iBufferPtr += 2;
- qBufferPtr += 2;
- }
-
- number = halfPoints * 2;
- for (; number < num_points; number++) {
- *iBufferPtr++ = *complexVectorPtr++;
- *qBufferPtr++ = *complexVectorPtr++;
- }
+static inline void volk_32fc_deinterleave_64f_x2_u_sse2(double* iBuffer,
+ double* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+ double* qBufferPtr = qBuffer;
+
+ const unsigned int halfPoints = num_points / 2;
+ __m128 cplxValue, fVal;
+ __m128d dVal;
+
+ for (; number < halfPoints; number++) {
+
+ cplxValue = _mm_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ // Arrange in i1i2i1i2 format
+ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0));
+ dVal = _mm_cvtps_pd(fVal);
+ _mm_storeu_pd(iBufferPtr, dVal);
+
+ // Arrange in q1q2q1q2 format
+ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3, 1, 3, 1));
+ dVal = _mm_cvtps_pd(fVal);
+ _mm_storeu_pd(qBufferPtr, dVal);
+
+ iBufferPtr += 2;
+ qBufferPtr += 2;
+ }
+
+ number = halfPoints * 2;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32fc_deinterleave_64f_x2_generic(double *iBuffer, double *qBuffer,
- const lv_32fc_t *complexVector,
- unsigned int num_points) {
- unsigned int number = 0;
- const float *complexVectorPtr = (float *)complexVector;
- double *iBufferPtr = iBuffer;
- double *qBufferPtr = qBuffer;
-
- for (number = 0; number < num_points; number++) {
- *iBufferPtr++ = (double)*complexVectorPtr++;
- *qBufferPtr++ = (double)*complexVectorPtr++;
- }
+static inline void volk_32fc_deinterleave_64f_x2_generic(double* iBuffer,
+ double* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+ double* qBufferPtr = qBuffer;
+
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = (double)*complexVectorPtr++;
+ *qBufferPtr++ = (double)*complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32fc_deinterleave_64f_x2_a_avx(double *iBuffer, double *qBuffer,
- const lv_32fc_t *complexVector,
- unsigned int num_points) {
- unsigned int number = 0;
-
- const float *complexVectorPtr = (float *)complexVector;
- double *iBufferPtr = iBuffer;
- double *qBufferPtr = qBuffer;
-
- const unsigned int quarterPoints = num_points / 4;
- __m256 cplxValue;
- __m128 complexH, complexL, fVal;
- __m256d dVal;
-
- for (; number < quarterPoints; number++) {
-
- cplxValue = _mm256_load_ps(complexVectorPtr);
- complexVectorPtr += 8;
-
- complexH = _mm256_extractf128_ps(cplxValue, 1);
- complexL = _mm256_extractf128_ps(cplxValue, 0);
-
- // Arrange in i1i2i1i2 format
- fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2, 0, 2, 0));
- dVal = _mm256_cvtps_pd(fVal);
- _mm256_store_pd(iBufferPtr, dVal);
-
- // Arrange in q1q2q1q2 format
- fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3, 1, 3, 1));
- dVal = _mm256_cvtps_pd(fVal);
- _mm256_store_pd(qBufferPtr, dVal);
-
- iBufferPtr += 4;
- qBufferPtr += 4;
- }
-
- number = quarterPoints * 4;
- for (; number < num_points; number++) {
- *iBufferPtr++ = *complexVectorPtr++;
- *qBufferPtr++ = *complexVectorPtr++;
- }
+static inline void volk_32fc_deinterleave_64f_x2_a_avx(double* iBuffer,
+ double* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+ double* qBufferPtr = qBuffer;
+
+ const unsigned int quarterPoints = num_points / 4;
+ __m256 cplxValue;
+ __m128 complexH, complexL, fVal;
+ __m256d dVal;
+
+ for (; number < quarterPoints; number++) {
+
+ cplxValue = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ complexH = _mm256_extractf128_ps(cplxValue, 1);
+ complexL = _mm256_extractf128_ps(cplxValue, 0);
+
+ // Arrange in i1i2i1i2 format
+ fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2, 0, 2, 0));
+ dVal = _mm256_cvtps_pd(fVal);
+ _mm256_store_pd(iBufferPtr, dVal);
+
+ // Arrange in q1q2q1q2 format
+ fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3, 1, 3, 1));
+ dVal = _mm256_cvtps_pd(fVal);
+ _mm256_store_pd(qBufferPtr, dVal);
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void
-volk_32fc_deinterleave_64f_x2_a_sse2(double *iBuffer, double *qBuffer,
- const lv_32fc_t *complexVector,
- unsigned int num_points) {
- unsigned int number = 0;
-
- const float *complexVectorPtr = (float *)complexVector;
- double *iBufferPtr = iBuffer;
- double *qBufferPtr = qBuffer;
-
- const unsigned int halfPoints = num_points / 2;
- __m128 cplxValue, fVal;
- __m128d dVal;
-
- for (; number < halfPoints; number++) {
-
- cplxValue = _mm_load_ps(complexVectorPtr);
- complexVectorPtr += 4;
-
- // Arrange in i1i2i1i2 format
- fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0));
- dVal = _mm_cvtps_pd(fVal);
- _mm_store_pd(iBufferPtr, dVal);
-
- // Arrange in q1q2q1q2 format
- fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3, 1, 3, 1));
- dVal = _mm_cvtps_pd(fVal);
- _mm_store_pd(qBufferPtr, dVal);
-
- iBufferPtr += 2;
- qBufferPtr += 2;
- }
-
- number = halfPoints * 2;
- for (; number < num_points; number++) {
- *iBufferPtr++ = *complexVectorPtr++;
- *qBufferPtr++ = *complexVectorPtr++;
- }
+static inline void volk_32fc_deinterleave_64f_x2_a_sse2(double* iBuffer,
+ double* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+ double* qBufferPtr = qBuffer;
+
+ const unsigned int halfPoints = num_points / 2;
+ __m128 cplxValue, fVal;
+ __m128d dVal;
+
+ for (; number < halfPoints; number++) {
+
+ cplxValue = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ // Arrange in i1i2i1i2 format
+ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0));
+ dVal = _mm_cvtps_pd(fVal);
+ _mm_store_pd(iBufferPtr, dVal);
+
+ // Arrange in q1q2q1q2 format
+ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3, 1, 3, 1));
+ dVal = _mm_cvtps_pd(fVal);
+ _mm_store_pd(qBufferPtr, dVal);
+
+ iBufferPtr += 2;
+ qBufferPtr += 2;
+ }
+
+ number = halfPoints * 2;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32fc_deinterleave_64f_x2_a_generic(double *iBuffer, double *qBuffer,
- const lv_32fc_t *complexVector,
- unsigned int num_points) {
- unsigned int number = 0;
- const float *complexVectorPtr = (float *)complexVector;
- double *iBufferPtr = iBuffer;
- double *qBufferPtr = qBuffer;
-
- for (number = 0; number < num_points; number++) {
- *iBufferPtr++ = (double)*complexVectorPtr++;
- *qBufferPtr++ = (double)*complexVectorPtr++;
- }
+static inline void volk_32fc_deinterleave_64f_x2_a_generic(double* iBuffer,
+ double* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+ double* qBufferPtr = qBuffer;
+
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = (double)*complexVectorPtr++;
+ *qBufferPtr++ = (double)*complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>
-static inline void
-volk_32fc_deinterleave_64f_x2_neon(double *iBuffer, double *qBuffer,
- const lv_32fc_t *complexVector,
- unsigned int num_points) {
- unsigned int number = 0;
- unsigned int half_points = num_points / 2;
- const float *complexVectorPtr = (float *)complexVector;
- double *iBufferPtr = iBuffer;
- double *qBufferPtr = qBuffer;
- float32x2x2_t complexInput;
- float64x2_t iVal, qVal;
-
- for (number = 0; number < half_points; number++) {
- complexInput = vld2_f32(complexVectorPtr);
-
- iVal = vcvt_f64_f32(complexInput.val[0]);
- qVal = vcvt_f64_f32(complexInput.val[1]);
-
- vst1q_f64(iBufferPtr, iVal);
- vst1q_f64(qBufferPtr, qVal);
-
- complexVectorPtr += 4;
- iBufferPtr += 2;
- qBufferPtr += 2;
- }
-
- for (number = half_points * 2; number < num_points; number++) {
- *iBufferPtr++ = (double)*complexVectorPtr++;
- *qBufferPtr++ = (double)*complexVectorPtr++;
- }
+static inline void volk_32fc_deinterleave_64f_x2_neon(double* iBuffer,
+ double* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ unsigned int half_points = num_points / 2;
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+ double* qBufferPtr = qBuffer;
+ float32x2x2_t complexInput;
+ float64x2_t iVal, qVal;
+
+ for (number = 0; number < half_points; number++) {
+ complexInput = vld2_f32(complexVectorPtr);
+
+ iVal = vcvt_f64_f32(complexInput.val[0]);
+ qVal = vcvt_f64_f32(complexInput.val[1]);
+
+ vst1q_f64(iBufferPtr, iVal);
+ vst1q_f64(qBufferPtr, qVal);
+
+ complexVectorPtr += 4;
+ iBufferPtr += 2;
+ qBufferPtr += 2;
+ }
+
+ for (number = half_points * 2; number < num_points; number++) {
+ *iBufferPtr++ = (double)*complexVectorPtr++;
+ *qBufferPtr++ = (double)*complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_NEONV8 */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32fc_deinterleave_imag_32f(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points)
- * \endcode
+ * void volk_32fc_deinterleave_imag_32f(float* qBuffer, const lv_32fc_t* complexVector,
+ * unsigned int num_points) \endcode
*
* \b Inputs
* \li complexVector: The complex input vector.
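 *
 * A minimal usage sketch, assuming the dispatcher volk_32fc_deinterleave_imag_32f and
 * the allocation helpers shown in the earlier examples:
 * \code
 * unsigned int N = 64;
 * lv_32fc_t* cpx = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), volk_get_alignment());
 * float* q_part = (float*)volk_malloc(N * sizeof(float), volk_get_alignment());
 * // ... fill cpx[] ...
 * volk_32fc_deinterleave_imag_32f(q_part, cpx, N); // keep only the quadrature part
 * volk_free(cpx);
 * volk_free(q_part);
 * \endcode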
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32fc_deinterleave_imag_32f_a_avx(float* qBuffer, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_deinterleave_imag_32f_a_avx(float* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
- const float* complexVectorPtr = (const float*)complexVector;
- float* qBufferPtr = qBuffer;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+ const float* complexVectorPtr = (const float*)complexVector;
+ float* qBufferPtr = qBuffer;
- __m256 cplxValue1, cplxValue2, complex1, complex2, qValue;
- for(;number < eighthPoints; number++){
+ __m256 cplxValue1, cplxValue2, complex1, complex2, qValue;
+ for (; number < eighthPoints; number++) {
- cplxValue1 = _mm256_load_ps(complexVectorPtr);
- complexVectorPtr += 8;
+ cplxValue1 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
- cplxValue2 = _mm256_load_ps(complexVectorPtr);
- complexVectorPtr += 8;
+ cplxValue2 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
- complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
- complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
- // Arrange in q1q2q3q4 format
- qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
+ // Arrange in q1q2q3q4 format
+ qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
- _mm256_store_ps(qBufferPtr, qValue);
+ _mm256_store_ps(qBufferPtr, qValue);
- qBufferPtr += 8;
- }
+ qBufferPtr += 8;
+ }
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- complexVectorPtr++;
- *qBufferPtr++ = *complexVectorPtr++;
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- const float* complexVectorPtr = (const float*)complexVector;
- float* qBufferPtr = qBuffer;
+ const float* complexVectorPtr = (const float*)complexVector;
+ float* qBufferPtr = qBuffer;
- __m128 cplxValue1, cplxValue2, iValue;
- for(;number < quarterPoints; number++){
+ __m128 cplxValue1, cplxValue2, iValue;
+ for (; number < quarterPoints; number++) {
- cplxValue1 = _mm_load_ps(complexVectorPtr);
- complexVectorPtr += 4;
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
- cplxValue2 = _mm_load_ps(complexVectorPtr);
- complexVectorPtr += 4;
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
- // Arrange in q1q2q3q4 format
- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+ // Arrange in q1q2q3q4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
- _mm_store_ps(qBufferPtr, iValue);
+ _mm_store_ps(qBufferPtr, iValue);
- qBufferPtr += 4;
- }
+ qBufferPtr += 4;
+ }
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- complexVectorPtr++;
- *qBufferPtr++ = *complexVectorPtr++;
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_32fc_deinterleave_imag_32f_neon(float* qBuffer, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_deinterleave_imag_32f_neon(float* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- unsigned int quarter_points = num_points / 4;
- const float* complexVectorPtr = (float*)complexVector;
- float* qBufferPtr = qBuffer;
- float32x4x2_t complexInput;
-
- for(number = 0; number < quarter_points; number++){
- complexInput = vld2q_f32(complexVectorPtr);
- vst1q_f32( qBufferPtr, complexInput.val[1] );
- complexVectorPtr += 8;
- qBufferPtr += 4;
- }
-
- for(number = quarter_points*4; number < num_points; number++){
- complexVectorPtr++;
- *qBufferPtr++ = *complexVectorPtr++;
- }
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+ const float* complexVectorPtr = (float*)complexVector;
+ float* qBufferPtr = qBuffer;
+ float32x4x2_t complexInput;
+
+ for (number = 0; number < quarter_points; number++) {
+ complexInput = vld2q_f32(complexVectorPtr);
+ vst1q_f32(qBufferPtr, complexInput.val[1]);
+ complexVectorPtr += 8;
+ qBufferPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32fc_deinterleave_imag_32f_generic(float* qBuffer, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_deinterleave_imag_32f_generic(float* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const float* complexVectorPtr = (float*)complexVector;
- float* qBufferPtr = qBuffer;
- for(number = 0; number < num_points; number++){
- complexVectorPtr++;
- *qBufferPtr++ = *complexVectorPtr++;
- }
+ unsigned int number = 0;
+ const float* complexVectorPtr = (float*)complexVector;
+ float* qBufferPtr = qBuffer;
+ for (number = 0; number < num_points; number++) {
+ complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32fc_deinterleave_imag_32f_u_avx(float* qBuffer, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_deinterleave_imag_32f_u_avx(float* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
- const float* complexVectorPtr = (const float*)complexVector;
- float* qBufferPtr = qBuffer;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+ const float* complexVectorPtr = (const float*)complexVector;
+ float* qBufferPtr = qBuffer;
- __m256 cplxValue1, cplxValue2, complex1, complex2, qValue;
- for(;number < eighthPoints; number++){
+ __m256 cplxValue1, cplxValue2, complex1, complex2, qValue;
+ for (; number < eighthPoints; number++) {
- cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
- complexVectorPtr += 8;
+ cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 8;
- cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
- complexVectorPtr += 8;
+ cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 8;
- complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
- complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
- // Arrange in q1q2q3q4 format
- qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
+ // Arrange in q1q2q3q4 format
+ qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
- _mm256_storeu_ps(qBufferPtr, qValue);
+ _mm256_storeu_ps(qBufferPtr, qValue);
- qBufferPtr += 8;
- }
+ qBufferPtr += 8;
+ }
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- complexVectorPtr++;
- *qBufferPtr++ = *complexVectorPtr++;
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_AVX */
#endif /* INCLUDED_volk_32fc_deinterleave_imag_32f_u_H */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32fc_deinterleave_real_32f(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points)
- * \endcode
+ * void volk_32fc_deinterleave_real_32f(float* iBuffer, const lv_32fc_t* complexVector,
+ * unsigned int num_points) \endcode
*
* \b Inputs
* \li complexVector: The complex input vector.
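 *
 * A minimal call sketch (the counterpart of the imag kernel above), assuming the
 * dispatcher volk_32fc_deinterleave_real_32f generated for these kernels:
 * \code
 * // i_part and cpx allocated as in the previous examples
 * volk_32fc_deinterleave_real_32f(i_part, cpx, N); // keep only the in-phase part
 * \endcode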
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_32fc_deinterleave_real_32f_a_avx2(float* iBuffer, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_deinterleave_real_32f_a_avx2(float* iBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- const float* complexVectorPtr = (const float*)complexVector;
- float* iBufferPtr = iBuffer;
+ const float* complexVectorPtr = (const float*)complexVector;
+ float* iBufferPtr = iBuffer;
- __m256 cplxValue1, cplxValue2;
- __m256 iValue;
- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
- for(;number < eighthPoints; number++){
+ __m256 cplxValue1, cplxValue2;
+ __m256 iValue;
+ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+ for (; number < eighthPoints; number++) {
- cplxValue1 = _mm256_load_ps(complexVectorPtr);
- complexVectorPtr += 8;
+ cplxValue1 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
- cplxValue2 = _mm256_load_ps(complexVectorPtr);
- complexVectorPtr += 8;
+ cplxValue2 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
- // Arrange in i1i2i3i4 format
- iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
- iValue = _mm256_permutevar8x32_ps(iValue,idx);
+ // Arrange in i1i2i3i4 format
+ iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+ iValue = _mm256_permutevar8x32_ps(iValue, idx);
- _mm256_store_ps(iBufferPtr, iValue);
+ _mm256_store_ps(iBufferPtr, iValue);
- iBufferPtr += 8;
- }
+ iBufferPtr += 8;
+ }
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- *iBufferPtr++ = *complexVectorPtr++;
- complexVectorPtr++;
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_32fc_deinterleave_real_32f_a_sse(float* iBuffer, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_deinterleave_real_32f_a_sse(float* iBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- const float* complexVectorPtr = (const float*)complexVector;
- float* iBufferPtr = iBuffer;
+ const float* complexVectorPtr = (const float*)complexVector;
+ float* iBufferPtr = iBuffer;
- __m128 cplxValue1, cplxValue2, iValue;
- for(;number < quarterPoints; number++){
+ __m128 cplxValue1, cplxValue2, iValue;
+ for (; number < quarterPoints; number++) {
- cplxValue1 = _mm_load_ps(complexVectorPtr);
- complexVectorPtr += 4;
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
- cplxValue2 = _mm_load_ps(complexVectorPtr);
- complexVectorPtr += 4;
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
- // Arrange in i1i2i3i4 format
- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
- _mm_store_ps(iBufferPtr, iValue);
+ _mm_store_ps(iBufferPtr, iValue);
- iBufferPtr += 4;
- }
+ iBufferPtr += 4;
+ }
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- *iBufferPtr++ = *complexVectorPtr++;
- complexVectorPtr++;
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32fc_deinterleave_real_32f_generic(float* iBuffer, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_deinterleave_real_32f_generic(float* iBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const float* complexVectorPtr = (float*)complexVector;
- float* iBufferPtr = iBuffer;
- for(number = 0; number < num_points; number++){
- *iBufferPtr++ = *complexVectorPtr++;
- complexVectorPtr++;
- }
+ unsigned int number = 0;
+ const float* complexVectorPtr = (float*)complexVector;
+ float* iBufferPtr = iBuffer;
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_32fc_deinterleave_real_32f_neon(float* iBuffer, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_deinterleave_real_32f_neon(float* iBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- unsigned int quarter_points = num_points / 4;
- const float* complexVectorPtr = (float*)complexVector;
- float* iBufferPtr = iBuffer;
- float32x4x2_t complexInput;
-
- for(number = 0; number < quarter_points; number++){
- complexInput = vld2q_f32(complexVectorPtr);
- vst1q_f32( iBufferPtr, complexInput.val[0] );
- complexVectorPtr += 8;
- iBufferPtr += 4;
- }
-
- for(number = quarter_points*4; number < num_points; number++){
- *iBufferPtr++ = *complexVectorPtr++;
- complexVectorPtr++;
- }
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+ const float* complexVectorPtr = (float*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float32x4x2_t complexInput;
+
+ for (number = 0; number < quarter_points; number++) {
+ complexInput = vld2q_f32(complexVectorPtr);
+ vst1q_f32(iBufferPtr, complexInput.val[0]);
+ complexVectorPtr += 8;
+ iBufferPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_32fc_deinterleave_real_32f_u_avx2(float* iBuffer, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_deinterleave_real_32f_u_avx2(float* iBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- const float* complexVectorPtr = (const float*)complexVector;
- float* iBufferPtr = iBuffer;
+ const float* complexVectorPtr = (const float*)complexVector;
+ float* iBufferPtr = iBuffer;
- __m256 cplxValue1, cplxValue2;
- __m256 iValue;
- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
- for(;number < eighthPoints; number++){
+ __m256 cplxValue1, cplxValue2;
+ __m256 iValue;
+ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+ for (; number < eighthPoints; number++) {
- cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
- complexVectorPtr += 8;
+ cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 8;
- cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
- complexVectorPtr += 8;
+ cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 8;
- // Arrange in i1i2i3i4 format
- iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
- iValue = _mm256_permutevar8x32_ps(iValue,idx);
+ // Arrange in i1i2i3i4 format
+ iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+ iValue = _mm256_permutevar8x32_ps(iValue, idx);
- _mm256_storeu_ps(iBufferPtr, iValue);
+ _mm256_storeu_ps(iBufferPtr, iValue);
- iBufferPtr += 8;
- }
+ iBufferPtr += 8;
+ }
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- *iBufferPtr++ = *complexVectorPtr++;
- complexVectorPtr++;
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void volk_32fc_deinterleave_real_64f_a_avx2(
- double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points) {
- unsigned int number = 0;
-
- const float *complexVectorPtr = (float *)complexVector;
- double *iBufferPtr = iBuffer;
-
- const unsigned int quarterPoints = num_points / 4;
- __m256 cplxValue;
- __m128 fVal;
- __m256d dVal;
- __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
- for (; number < quarterPoints; number++) {
-
- cplxValue = _mm256_load_ps(complexVectorPtr);
- complexVectorPtr += 8;
-
- // Arrange in i1i2i1i2 format
- cplxValue = _mm256_permutevar8x32_ps(cplxValue, idx);
- fVal = _mm256_extractf128_ps(cplxValue, 0);
- dVal = _mm256_cvtps_pd(fVal);
- _mm256_store_pd(iBufferPtr, dVal);
-
- iBufferPtr += 4;
- }
-
- number = quarterPoints * 4;
- for (; number < num_points; number++) {
- *iBufferPtr++ = (double)*complexVectorPtr++;
- complexVectorPtr++;
- }
+static inline void volk_32fc_deinterleave_real_64f_a_avx2(double* iBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+
+ const unsigned int quarterPoints = num_points / 4;
+ __m256 cplxValue;
+ __m128 fVal;
+ __m256d dVal;
+ __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
+ for (; number < quarterPoints; number++) {
+
+ cplxValue = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ // Arrange in i1i2i1i2 format
+ cplxValue = _mm256_permutevar8x32_ps(cplxValue, idx);
+ fVal = _mm256_extractf128_ps(cplxValue, 0);
+ dVal = _mm256_cvtps_pd(fVal);
+ _mm256_store_pd(iBufferPtr, dVal);
+
+ iBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (double)*complexVectorPtr++;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void volk_32fc_deinterleave_real_64f_a_sse2(
- double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points) {
- unsigned int number = 0;
+static inline void volk_32fc_deinterleave_real_64f_a_sse2(double* iBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
- const float *complexVectorPtr = (float *)complexVector;
- double *iBufferPtr = iBuffer;
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
- const unsigned int halfPoints = num_points / 2;
- __m128 cplxValue, fVal;
- __m128d dVal;
- for (; number < halfPoints; number++) {
+ const unsigned int halfPoints = num_points / 2;
+ __m128 cplxValue, fVal;
+ __m128d dVal;
+ for (; number < halfPoints; number++) {
- cplxValue = _mm_load_ps(complexVectorPtr);
- complexVectorPtr += 4;
+ cplxValue = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
- // Arrange in i1i2i1i2 format
- fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0));
- dVal = _mm_cvtps_pd(fVal);
- _mm_store_pd(iBufferPtr, dVal);
+ // Arrange in i1i2i1i2 format
+ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0));
+ dVal = _mm_cvtps_pd(fVal);
+ _mm_store_pd(iBufferPtr, dVal);
- iBufferPtr += 2;
- }
+ iBufferPtr += 2;
+ }
- number = halfPoints * 2;
- for (; number < num_points; number++) {
- *iBufferPtr++ = (double)*complexVectorPtr++;
- complexVectorPtr++;
- }
+ number = halfPoints * 2;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (double)*complexVectorPtr++;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void volk_32fc_deinterleave_real_64f_generic(
- double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points) {
- unsigned int number = 0;
- const float *complexVectorPtr = (float *)complexVector;
- double *iBufferPtr = iBuffer;
- for (number = 0; number < num_points; number++) {
- *iBufferPtr++ = (double)*complexVectorPtr++;
- complexVectorPtr++;
- }
+static inline void volk_32fc_deinterleave_real_64f_generic(double* iBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = (double)*complexVectorPtr++;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>
-static inline void volk_32fc_deinterleave_real_64f_neon(
- double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points) {
- unsigned int number = 0;
- unsigned int quarter_points = num_points / 4;
- const float *complexVectorPtr = (float *)complexVector;
- double *iBufferPtr = iBuffer;
- float32x2x4_t complexInput;
- float64x2_t iVal1;
- float64x2_t iVal2;
- float64x2x2_t iVal;
-
- for (number = 0; number < quarter_points; number++) {
- // Load data into register
- complexInput = vld4_f32(complexVectorPtr);
-
- // Perform single to double precision conversion
- iVal1 = vcvt_f64_f32(complexInput.val[0]);
- iVal2 = vcvt_f64_f32(complexInput.val[2]);
- iVal.val[0] = iVal1;
- iVal.val[1] = iVal2;
-
- // Store results into memory buffer
- vst2q_f64(iBufferPtr, iVal);
-
- // Update pointers
- iBufferPtr += 4;
- complexVectorPtr += 8;
- }
-
- for (number = quarter_points * 4; number < num_points; number++) {
- *iBufferPtr++ = (double)*complexVectorPtr++;
- complexVectorPtr++;
- }
+static inline void volk_32fc_deinterleave_real_64f_neon(double* iBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+ float32x2x4_t complexInput;
+ float64x2_t iVal1;
+ float64x2_t iVal2;
+ float64x2x2_t iVal;
+
+ for (number = 0; number < quarter_points; number++) {
+ // Load data into register
+ complexInput = vld4_f32(complexVectorPtr);
+
+ // Perform single to double precision conversion
+ iVal1 = vcvt_f64_f32(complexInput.val[0]);
+ iVal2 = vcvt_f64_f32(complexInput.val[2]);
+ iVal.val[0] = iVal1;
+ iVal.val[1] = iVal2;
+
+ // Store results into memory buffer
+ vst2q_f64(iBufferPtr, iVal);
+
+ // Update pointers
+ iBufferPtr += 4;
+ complexVectorPtr += 8;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ *iBufferPtr++ = (double)*complexVectorPtr++;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void volk_32fc_deinterleave_real_64f_u_avx2(
- double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points) {
- unsigned int number = 0;
-
- const float *complexVectorPtr = (float *)complexVector;
- double *iBufferPtr = iBuffer;
-
- const unsigned int quarterPoints = num_points / 4;
- __m256 cplxValue;
- __m128 fVal;
- __m256d dVal;
- __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
- for (; number < quarterPoints; number++) {
-
- cplxValue = _mm256_loadu_ps(complexVectorPtr);
- complexVectorPtr += 8;
-
- // Arrange in i1i2i1i2 format
- cplxValue = _mm256_permutevar8x32_ps(cplxValue, idx);
- fVal = _mm256_extractf128_ps(cplxValue, 0);
- dVal = _mm256_cvtps_pd(fVal);
- _mm256_storeu_pd(iBufferPtr, dVal);
-
- iBufferPtr += 4;
- }
-
- number = quarterPoints * 4;
- for (; number < num_points; number++) {
- *iBufferPtr++ = (double)*complexVectorPtr++;
- complexVectorPtr++;
- }
+static inline void volk_32fc_deinterleave_real_64f_u_avx2(double* iBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+
+ const unsigned int quarterPoints = num_points / 4;
+ __m256 cplxValue;
+ __m128 fVal;
+ __m256d dVal;
+ __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
+ for (; number < quarterPoints; number++) {
+
+ cplxValue = _mm256_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ // Arrange in i1i2i1i2 format
+ cplxValue = _mm256_permutevar8x32_ps(cplxValue, idx);
+ fVal = _mm256_extractf128_ps(cplxValue, 0);
+ dVal = _mm256_cvtps_pd(fVal);
+ _mm256_storeu_pd(iBufferPtr, dVal);
+
+ iBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (double)*complexVectorPtr++;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifndef INCLUDED_volk_32fc_index_max_16u_a_H
#define INCLUDED_volk_32fc_index_max_16u_a_H
-#include <volk/volk_common.h>
#include <inttypes.h>
-#include <stdio.h>
#include <limits.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
#include <volk/volk_complex.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void
-volk_32fc_index_max_16u_a_avx2(uint16_t* target, lv_32fc_t* src0,
- uint32_t num_points)
+volk_32fc_index_max_16u_a_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
{
- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
- // Branchless version, if we think it'll make a difference
- //num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));
-
- const uint32_t num_bytes = num_points*8;
-
- union bit256 holderf;
- union bit256 holderi;
- float sq_dist = 0.0;
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+ // Branchless version, if we think it'll make a difference
+ // num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));
- union bit256 xmm5, xmm4;
- __m256 xmm1, xmm2, xmm3;
- __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+ const uint32_t num_bytes = num_points * 8;
- xmm5.int_vec = xmmfive = _mm256_setzero_si256();
- xmm4.int_vec = xmmfour = _mm256_setzero_si256();
- holderf.int_vec = holder0 = _mm256_setzero_si256();
- holderi.int_vec = holder1 = _mm256_setzero_si256();
+ union bit256 holderf;
+ union bit256 holderi;
+ float sq_dist = 0.0;
- int bound = num_bytes >> 6;
- int i = 0;
+ union bit256 xmm5, xmm4;
+ __m256 xmm1, xmm2, xmm3;
+ __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
- xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
- xmm9 = _mm256_setzero_si256(); //=xmm8
- xmm10 = _mm256_set1_epi32(8);
- xmm3 = _mm256_setzero_ps();
+ xmm5.int_vec = xmmfive = _mm256_setzero_si256();
+ xmm4.int_vec = xmmfour = _mm256_setzero_si256();
+ holderf.int_vec = holder0 = _mm256_setzero_si256();
+ holderi.int_vec = holder1 = _mm256_setzero_si256();
- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
- for(; i < bound; ++i) {
- xmm1 = _mm256_load_ps((float*)src0);
- xmm2 = _mm256_load_ps((float*)&src0[4]);
+ int bound = num_bytes >> 6;
+ int i = 0;
- src0 += 8;
+ xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+ xmm9 = _mm256_setzero_si256(); //=xmm8
+ xmm10 = _mm256_set1_epi32(8);
+ xmm3 = _mm256_setzero_ps();
- xmm1 = _mm256_mul_ps(xmm1, xmm1);
- xmm2 = _mm256_mul_ps(xmm2, xmm2);
+ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+ for (; i < bound; ++i) {
+ xmm1 = _mm256_load_ps((float*)src0);
+ xmm2 = _mm256_load_ps((float*)&src0[4]);
- xmm1 = _mm256_hadd_ps(xmm1, xmm2);
- xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
+ src0 += 8;
- xmm3 = _mm256_max_ps(xmm1, xmm3);
+ xmm1 = _mm256_mul_ps(xmm1, xmm1);
+ xmm2 = _mm256_mul_ps(xmm2, xmm2);
- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+ xmm1 = _mm256_hadd_ps(xmm1, xmm2);
+ xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+ xmm3 = _mm256_max_ps(xmm1, xmm3);
- xmm9 = _mm256_add_epi32(xmm11, xmm12);
+ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
- xmm8 = _mm256_add_epi32(xmm8, xmm10);
- }
- xmm10 = _mm256_set1_epi32(4);
- if (num_bytes >> 5 & 1) {
- xmm1 = _mm256_load_ps((float*)src0);
+ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
- src0 += 4;
+ xmm9 = _mm256_add_epi32(xmm11, xmm12);
- xmm1 = _mm256_mul_ps(xmm1, xmm1);
+ xmm8 = _mm256_add_epi32(xmm8, xmm10);
+ }
+ xmm10 = _mm256_set1_epi32(4);
+ if (num_bytes >> 5 & 1) {
+ xmm1 = _mm256_load_ps((float*)src0);
- xmm1 = _mm256_hadd_ps(xmm1, xmm1);
- xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
+ src0 += 4;
- xmm3 = _mm256_max_ps(xmm1, xmm3);
+ xmm1 = _mm256_mul_ps(xmm1, xmm1);
- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+ xmm1 = _mm256_hadd_ps(xmm1, xmm1);
+ xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+ xmm3 = _mm256_max_ps(xmm1, xmm3);
- xmm9 = _mm256_add_epi32(xmm11, xmm12);
+ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
- xmm8 = _mm256_add_epi32(xmm8, xmm10);
- }
+ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
- idx = _mm256_set_epi32(1,0,1,0,1,0,1,0);
- xmm10 = _mm256_set1_epi32(2);
- if (num_bytes >> 4 & 1) {
- xmm2 = _mm256_load_ps((float*)src0);
+ xmm9 = _mm256_add_epi32(xmm11, xmm12);
- xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
- xmm8 = bit256_p(&xmm1)->int_vec;
+ xmm8 = _mm256_add_epi32(xmm8, xmm10);
+ }
- xmm2 = _mm256_mul_ps(xmm2, xmm2);
+ idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0);
+ xmm10 = _mm256_set1_epi32(2);
+ if (num_bytes >> 4 & 1) {
+ xmm2 = _mm256_load_ps((float*)src0);
- src0 += 2;
+ xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
+ xmm8 = bit256_p(&xmm1)->int_vec;
- xmm1 = _mm256_hadd_ps(xmm2, xmm2);
+ xmm2 = _mm256_mul_ps(xmm2, xmm2);
- xmm3 = _mm256_max_ps(xmm1, xmm3);
+ src0 += 2;
- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
-
- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+ xmm1 = _mm256_hadd_ps(xmm2, xmm2);
- xmm9 = _mm256_add_epi32(xmm11, xmm12);
+ xmm3 = _mm256_max_ps(xmm1, xmm3);
- xmm8 = _mm256_add_epi32(xmm8, xmm10);
- }
+ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
- /*
- idx = _mm256_setzero_si256();
- for(i = 0; i < leftovers2; ++i) {
- //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
+ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
- sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
+ xmm9 = _mm256_add_epi32(xmm11, xmm12);
- //xmm = _mm_load1_ps(&sq_dist);//insert?
- xmm2 = _mm256_set1_ps(sq_dist);
- //xmm2 = _mm256_insertf128_ps(xmm2, xmm, 0);
+ xmm8 = _mm256_add_epi32(xmm8, xmm10);
+ }
- xmm1 = xmm3;
+ /*
+ idx = _mm256_setzero_si256();
+ for(i = 0; i < leftovers2; ++i) {
+ //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1],
+ ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
- xmm3 = _mm256_max_ps(xmm3, xmm2);//only lowest 32bit value
- xmm3 = _mm256_permutevar8x32_ps(xmm3, idx);
+ sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) *
+ lv_cimag(src0[0]);
- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+ //xmm = _mm_load1_ps(&sq_dist);//insert?
+ xmm2 = _mm256_set1_ps(sq_dist);
+ //xmm2 = _mm256_insertf128_ps(xmm2, xmm, 0);
- xmm8 = _mm256_permutevar8x32_epi32(xmm8, idx);
+ xmm1 = xmm3;
- xmm11 = _mm256_and_si256(xmm8, xmm4.int_vec);
- xmm12 = _mm256_and_si256(xmm9, xmm5.int_vec);
+ xmm3 = _mm256_max_ps(xmm3, xmm2);//only lowest 32bit value
+ xmm3 = _mm256_permutevar8x32_ps(xmm3, idx);
- xmm9 = _mm256_add_epi32(xmm11, xmm12);
-}*/
+ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
- _mm256_store_ps((float*)&(holderf.f), xmm3);
- _mm256_store_si256(&(holderi.int_vec), xmm9);
+ xmm8 = _mm256_permutevar8x32_epi32(xmm8, idx);
- target[0] = holderi.i[0];
- sq_dist = holderf.f[0];
- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
- target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
- sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
- target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
- sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
- target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
- sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
- target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
- sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
+ xmm11 = _mm256_and_si256(xmm8, xmm4.int_vec);
+ xmm12 = _mm256_and_si256(xmm9, xmm5.int_vec);
+ xmm9 = _mm256_add_epi32(xmm11, xmm12);
+ }*/
+
+ _mm256_store_ps((float*)&(holderf.f), xmm3);
+ _mm256_store_si256(&(holderi.int_vec), xmm9);
+
+ target[0] = holderi.i[0];
+ sq_dist = holderf.f[0];
+ target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
+ sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
+ target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
+ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
+ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
+ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
+ target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
+ sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
+ target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
+ sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
+ target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
+ sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
+ target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
+ sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
}
#endif /*LV_HAVE_AVX2*/
#ifdef LV_HAVE_SSE3
-#include <xmmintrin.h>
#include <pmmintrin.h>
+#include <xmmintrin.h>
static inline void
-volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0,
- uint32_t num_points)
+volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
{
- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
- // Branchless version, if we think it'll make a difference
- //num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+ // Branchless version, if we think it'll make a difference
+ // num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));
- const uint32_t num_bytes = num_points*8;
+ const uint32_t num_bytes = num_points * 8;
- union bit128 holderf;
- union bit128 holderi;
- float sq_dist = 0.0;
+ union bit128 holderf;
+ union bit128 holderi;
+ float sq_dist = 0.0;
- union bit128 xmm5, xmm4;
- __m128 xmm1, xmm2, xmm3;
- __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+ union bit128 xmm5, xmm4;
+ __m128 xmm1, xmm2, xmm3;
+ __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
- xmm5.int_vec = xmmfive = _mm_setzero_si128();
- xmm4.int_vec = xmmfour = _mm_setzero_si128();
- holderf.int_vec = holder0 = _mm_setzero_si128();
- holderi.int_vec = holder1 = _mm_setzero_si128();
+ xmm5.int_vec = xmmfive = _mm_setzero_si128();
+ xmm4.int_vec = xmmfour = _mm_setzero_si128();
+ holderf.int_vec = holder0 = _mm_setzero_si128();
+ holderi.int_vec = holder1 = _mm_setzero_si128();
- int bound = num_bytes >> 5;
- int i = 0;
+ int bound = num_bytes >> 5;
+ int i = 0;
- xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order!
- xmm9 = _mm_setzero_si128();
- xmm10 = _mm_set_epi32(4, 4, 4, 4);
- xmm3 = _mm_setzero_ps();
- //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]);
+ xmm8 = _mm_set_epi32(3, 2, 1, 0); // remember the crazy reverse order!
+ xmm9 = _mm_setzero_si128();
+ xmm10 = _mm_set_epi32(4, 4, 4, 4);
+ xmm3 = _mm_setzero_ps();
+ // printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1],
+ // ((float*)&xmm10)[2], ((float*)&xmm10)[3]);
- for(; i < bound; ++i) {
- xmm1 = _mm_load_ps((float*)src0);
- xmm2 = _mm_load_ps((float*)&src0[2]);
+ for (; i < bound; ++i) {
+ xmm1 = _mm_load_ps((float*)src0);
+ xmm2 = _mm_load_ps((float*)&src0[2]);
- src0 += 4;
+ src0 += 4;
- xmm1 = _mm_mul_ps(xmm1, xmm1);
- xmm2 = _mm_mul_ps(xmm2, xmm2);
+ xmm1 = _mm_mul_ps(xmm1, xmm1);
+ xmm2 = _mm_mul_ps(xmm2, xmm2);
- xmm1 = _mm_hadd_ps(xmm1, xmm2);
+ xmm1 = _mm_hadd_ps(xmm1, xmm2);
- xmm3 = _mm_max_ps(xmm1, xmm3);
+ xmm3 = _mm_max_ps(xmm1, xmm3);
- xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
- xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
- xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
- xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
- xmm9 = _mm_add_epi32(xmm11, xmm12);
+ xmm9 = _mm_add_epi32(xmm11, xmm12);
- xmm8 = _mm_add_epi32(xmm8, xmm10);
+ xmm8 = _mm_add_epi32(xmm8, xmm10);
- //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
- //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]);
- }
+ // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1],
+ // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n",
+ // ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2],
+ // ((uint32_t*)&xmm10)[3]);
+ }
- if (num_bytes >> 4 & 1) {
- xmm2 = _mm_load_ps((float*)src0);
+ if (num_bytes >> 4 & 1) {
+ xmm2 = _mm_load_ps((float*)src0);
- xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
- xmm8 = bit128_p(&xmm1)->int_vec;
+ xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
+ xmm8 = bit128_p(&xmm1)->int_vec;
- xmm2 = _mm_mul_ps(xmm2, xmm2);
+ xmm2 = _mm_mul_ps(xmm2, xmm2);
- src0 += 2;
+ src0 += 2;
- xmm1 = _mm_hadd_ps(xmm2, xmm2);
+ xmm1 = _mm_hadd_ps(xmm2, xmm2);
- xmm3 = _mm_max_ps(xmm1, xmm3);
+ xmm3 = _mm_max_ps(xmm1, xmm3);
- xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]);
+ xmm10 = _mm_set_epi32(2, 2, 2, 2); // load1_ps((float*)&init[2]);
- xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
- xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
- xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
- xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
- xmm9 = _mm_add_epi32(xmm11, xmm12);
+ xmm9 = _mm_add_epi32(xmm11, xmm12);
- xmm8 = _mm_add_epi32(xmm8, xmm10);
- //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
- }
+ xmm8 = _mm_add_epi32(xmm8, xmm10);
+ // printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1],
+ // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
+ }
- if (num_bytes >> 3 & 1) {
- //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
+ if (num_bytes >> 3 & 1) {
+ // printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1],
+ // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
- sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
+ sq_dist =
+ lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
- xmm2 = _mm_load1_ps(&sq_dist);
+ xmm2 = _mm_load1_ps(&sq_dist);
- xmm1 = xmm3;
+ xmm1 = xmm3;
- xmm3 = _mm_max_ss(xmm3, xmm2);
+ xmm3 = _mm_max_ss(xmm3, xmm2);
- xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
- xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
- xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
+ xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
- xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
- xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
+ xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
+ xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
- xmm9 = _mm_add_epi32(xmm11, xmm12);
- }
+ xmm9 = _mm_add_epi32(xmm11, xmm12);
+ }
- //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
- //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
+ // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1],
+ // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n",
+ // ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2],
+ // ((uint32_t*)&xmm9)[3]);
- _mm_store_ps((float*)&(holderf.f), xmm3);
- _mm_store_si128(&(holderi.int_vec), xmm9);
+ _mm_store_ps((float*)&(holderf.f), xmm3);
+ _mm_store_si128(&(holderi.int_vec), xmm9);
- target[0] = holderi.i[0];
- sq_dist = holderf.f[0];
- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
+ target[0] = holderi.i[0];
+ sq_dist = holderf.f[0];
+ target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
+ sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
+ target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
+ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
+ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
+ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
- /*
- float placeholder = 0.0;
- uint32_t temp0, temp1;
- uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
- uint32_t l0 = g0 ^ 1;
+ /*
+ float placeholder = 0.0;
+ uint32_t temp0, temp1;
+ uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
+ uint32_t l0 = g0 ^ 1;
- uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
- uint32_t l1 = g1 ^ 1;
+ uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
+ uint32_t l1 = g1 ^ 1;
- temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
- temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
- sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1];
- placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
+ temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
+ temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
+ sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1];
+ placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
- g0 = (sq_dist > placeholder);
- l0 = g0 ^ 1;
- target[0] = g0 * temp0 + l0 * temp1;
- */
+ g0 = (sq_dist > placeholder);
+ l0 = g0 ^ 1;
+ target[0] = g0 * temp0 + l0 * temp1;
+ */
}
#endif /*LV_HAVE_SSE3*/
#ifdef LV_HAVE_GENERIC
static inline void
- volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0,
- uint32_t num_points)
+volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
{
- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
- const uint32_t num_bytes = num_points*8;
+ const uint32_t num_bytes = num_points * 8;
- float sq_dist = 0.0;
- float max = 0.0;
- uint16_t index = 0;
+ float sq_dist = 0.0;
+ float max = 0.0;
+ uint16_t index = 0;
- uint32_t i = 0;
+ uint32_t i = 0;
- for(; i < num_bytes >> 3; ++i) {
- sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
+ for (; i < num_bytes >> 3; ++i) {
+ sq_dist =
+ lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
- index = sq_dist > max ? i : index;
- max = sq_dist > max ? sq_dist : max;
- }
- target[0] = index;
+ index = sq_dist > max ? i : index;
+ max = sq_dist > max ? sq_dist : max;
+ }
+ target[0] = index;
}
#endif /*LV_HAVE_GENERIC*/
#ifndef INCLUDED_volk_32fc_index_max_16u_u_H
#define INCLUDED_volk_32fc_index_max_16u_u_H
-#include <volk/volk_common.h>
#include <inttypes.h>
-#include <stdio.h>
#include <limits.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
#include <volk/volk_complex.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void
-volk_32fc_index_max_16u_u_avx2(uint16_t* target, lv_32fc_t* src0,
- uint32_t num_points)
+volk_32fc_index_max_16u_u_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
{
- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
- // Branchless version, if we think it'll make a difference
- //num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+ // Branchless version, if we think it'll make a difference
+ // num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));
- const uint32_t num_bytes = num_points*8;
+ const uint32_t num_bytes = num_points * 8;
- union bit256 holderf;
- union bit256 holderi;
- float sq_dist = 0.0;
+ union bit256 holderf;
+ union bit256 holderi;
+ float sq_dist = 0.0;
- union bit256 xmm5, xmm4;
- __m256 xmm1, xmm2, xmm3;
- __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+ union bit256 xmm5, xmm4;
+ __m256 xmm1, xmm2, xmm3;
+ __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
- xmm5.int_vec = xmmfive = _mm256_setzero_si256();
- xmm4.int_vec = xmmfour = _mm256_setzero_si256();
- holderf.int_vec = holder0 = _mm256_setzero_si256();
- holderi.int_vec = holder1 = _mm256_setzero_si256();
+ xmm5.int_vec = xmmfive = _mm256_setzero_si256();
+ xmm4.int_vec = xmmfour = _mm256_setzero_si256();
+ holderf.int_vec = holder0 = _mm256_setzero_si256();
+ holderi.int_vec = holder1 = _mm256_setzero_si256();
- int bound = num_bytes >> 6;
- int i = 0;
+ int bound = num_bytes >> 6;
+ int i = 0;
- xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
- xmm9 = _mm256_setzero_si256(); //=xmm8
- xmm10 = _mm256_set1_epi32(8);
- xmm3 = _mm256_setzero_ps();
+ xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+ xmm9 = _mm256_setzero_si256(); //=xmm8
+ xmm10 = _mm256_set1_epi32(8);
+ xmm3 = _mm256_setzero_ps();
- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
- for(; i < bound; ++i) {
- xmm1 = _mm256_loadu_ps((float*)src0);
- xmm2 = _mm256_loadu_ps((float*)&src0[4]);
+ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+ for (; i < bound; ++i) {
+ xmm1 = _mm256_loadu_ps((float*)src0);
+ xmm2 = _mm256_loadu_ps((float*)&src0[4]);
- src0 += 8;
+ src0 += 8;
- xmm1 = _mm256_mul_ps(xmm1, xmm1);
- xmm2 = _mm256_mul_ps(xmm2, xmm2);
+ xmm1 = _mm256_mul_ps(xmm1, xmm1);
+ xmm2 = _mm256_mul_ps(xmm2, xmm2);
- xmm1 = _mm256_hadd_ps(xmm1, xmm2);
- xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
+ xmm1 = _mm256_hadd_ps(xmm1, xmm2);
+ xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
- xmm3 = _mm256_max_ps(xmm1, xmm3);
+ xmm3 = _mm256_max_ps(xmm1, xmm3);
- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
- xmm9 = _mm256_add_epi32(xmm11, xmm12);
+ xmm9 = _mm256_add_epi32(xmm11, xmm12);
- xmm8 = _mm256_add_epi32(xmm8, xmm10);
- }
- xmm10 = _mm256_set1_epi32(4);
- if (num_bytes >> 5 & 1) {
- xmm1 = _mm256_loadu_ps((float*)src0);
+ xmm8 = _mm256_add_epi32(xmm8, xmm10);
+ }
+ xmm10 = _mm256_set1_epi32(4);
+ if (num_bytes >> 5 & 1) {
+ xmm1 = _mm256_loadu_ps((float*)src0);
- src0 += 4;
+ src0 += 4;
- xmm1 = _mm256_mul_ps(xmm1, xmm1);
+ xmm1 = _mm256_mul_ps(xmm1, xmm1);
- xmm1 = _mm256_hadd_ps(xmm1, xmm1);
- xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
+ xmm1 = _mm256_hadd_ps(xmm1, xmm1);
+ xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
- xmm3 = _mm256_max_ps(xmm1, xmm3);
+ xmm3 = _mm256_max_ps(xmm1, xmm3);
- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
- xmm9 = _mm256_add_epi32(xmm11, xmm12);
+ xmm9 = _mm256_add_epi32(xmm11, xmm12);
- xmm8 = _mm256_add_epi32(xmm8, xmm10);
- }
+ xmm8 = _mm256_add_epi32(xmm8, xmm10);
+ }
- idx = _mm256_set_epi32(1,0,1,0,1,0,1,0);
- xmm10 = _mm256_set1_epi32(2);
- if (num_bytes >> 4 & 1) {
- xmm2 = _mm256_loadu_ps((float*)src0);
+ idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0);
+ xmm10 = _mm256_set1_epi32(2);
+ if (num_bytes >> 4 & 1) {
+ xmm2 = _mm256_loadu_ps((float*)src0);
- xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
- xmm8 = bit256_p(&xmm1)->int_vec;
+ xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
+ xmm8 = bit256_p(&xmm1)->int_vec;
- xmm2 = _mm256_mul_ps(xmm2, xmm2);
+ xmm2 = _mm256_mul_ps(xmm2, xmm2);
- src0 += 2;
+ src0 += 2;
- xmm1 = _mm256_hadd_ps(xmm2, xmm2);
+ xmm1 = _mm256_hadd_ps(xmm2, xmm2);
- xmm3 = _mm256_max_ps(xmm1, xmm3);
+ xmm3 = _mm256_max_ps(xmm1, xmm3);
- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
- xmm9 = _mm256_add_epi32(xmm11, xmm12);
+ xmm9 = _mm256_add_epi32(xmm11, xmm12);
+
+ xmm8 = _mm256_add_epi32(xmm8, xmm10);
+ }
- xmm8 = _mm256_add_epi32(xmm8, xmm10);
- }
-
- _mm256_storeu_ps((float*)&(holderf.f), xmm3);
- _mm256_storeu_si256(&(holderi.int_vec), xmm9);
-
- target[0] = holderi.i[0];
- sq_dist = holderf.f[0];
- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
- target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
- sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
- target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
- sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
- target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
- sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
- target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
- sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
+ _mm256_storeu_ps((float*)&(holderf.f), xmm3);
+ _mm256_storeu_si256(&(holderi.int_vec), xmm9);
+
+ target[0] = holderi.i[0];
+ sq_dist = holderf.f[0];
+ target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
+ sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
+ target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
+ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
+ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
+ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
+ target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
+ sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
+ target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
+ sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
+ target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
+ sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
+ target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
+ sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
}
#endif /*LV_HAVE_AVX2*/
#ifndef INCLUDED_volk_32fc_index_max_32u_a_H
#define INCLUDED_volk_32fc_index_max_32u_a_H
+#include <inttypes.h>
+#include <stdio.h>
#include <volk/volk_common.h>
-#include<inttypes.h>
-#include<stdio.h>
-#include<volk/volk_complex.h>
+#include <volk/volk_complex.h>
#ifdef LV_HAVE_AVX2
-#include<immintrin.h>
+#include <immintrin.h>
static inline void
-volk_32fc_index_max_32u_a_avx2(uint32_t* target, lv_32fc_t* src0,
- uint32_t num_points)
+volk_32fc_index_max_32u_a_avx2(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
{
- const uint32_t num_bytes = num_points*8;
+ const uint32_t num_bytes = num_points * 8;
- union bit256 holderf;
- union bit256 holderi;
- float sq_dist = 0.0;
+ union bit256 holderf;
+ union bit256 holderi;
+ float sq_dist = 0.0;
- union bit256 xmm5, xmm4;
- __m256 xmm1, xmm2, xmm3;
- __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+ union bit256 xmm5, xmm4;
+ __m256 xmm1, xmm2, xmm3;
+ __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
- xmm5.int_vec = xmmfive = _mm256_setzero_si256();
- xmm4.int_vec = xmmfour = _mm256_setzero_si256();
- holderf.int_vec = holder0 = _mm256_setzero_si256();
- holderi.int_vec = holder1 = _mm256_setzero_si256();
+ xmm5.int_vec = xmmfive = _mm256_setzero_si256();
+ xmm4.int_vec = xmmfour = _mm256_setzero_si256();
+ holderf.int_vec = holder0 = _mm256_setzero_si256();
+ holderi.int_vec = holder1 = _mm256_setzero_si256();
- int bound = num_bytes >> 6;
- int i = 0;
+ int bound = num_bytes >> 6;
+ int i = 0;
- xmm8 = _mm256_set_epi32(7,6,5,4,3, 2, 1, 0);
- xmm9 = _mm256_setzero_si256();
- xmm10 = _mm256_set1_epi32(8);
- xmm3 = _mm256_setzero_ps();
- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
+ xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+ xmm9 = _mm256_setzero_si256();
+ xmm10 = _mm256_set1_epi32(8);
+ xmm3 = _mm256_setzero_ps();
+ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
- for(; i < bound; ++i) {
- xmm1 = _mm256_load_ps((float*)src0);
- xmm2 = _mm256_load_ps((float*)&src0[4]);
+ for (; i < bound; ++i) {
+ xmm1 = _mm256_load_ps((float*)src0);
+ xmm2 = _mm256_load_ps((float*)&src0[4]);
- src0 += 8;
+ src0 += 8;
- xmm1 = _mm256_mul_ps(xmm1, xmm1);
- xmm2 = _mm256_mul_ps(xmm2, xmm2);
+ xmm1 = _mm256_mul_ps(xmm1, xmm1);
+ xmm2 = _mm256_mul_ps(xmm2, xmm2);
- xmm1 = _mm256_hadd_ps(xmm1, xmm2);
- xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
+ xmm1 = _mm256_hadd_ps(xmm1, xmm2);
+ xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
- xmm3 = _mm256_max_ps(xmm1, xmm3);
+ xmm3 = _mm256_max_ps(xmm1, xmm3);
- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
- xmm9 = _mm256_add_epi32(xmm11, xmm12);
+ xmm9 = _mm256_add_epi32(xmm11, xmm12);
- xmm8 = _mm256_add_epi32(xmm8, xmm10);
- }
-
- xmm10 = _mm256_set1_epi32(4);
- if (num_bytes >> 5 & 1) {
- xmm1 = _mm256_load_ps((float*)src0);
-
- xmm1 = _mm256_mul_ps(xmm1, xmm1);
+ xmm8 = _mm256_add_epi32(xmm8, xmm10);
+ }
- src0 += 4;
+ xmm10 = _mm256_set1_epi32(4);
+ if (num_bytes >> 5 & 1) {
+ xmm1 = _mm256_load_ps((float*)src0);
- xmm1 = _mm256_hadd_ps(xmm1, xmm1);
+ xmm1 = _mm256_mul_ps(xmm1, xmm1);
- xmm3 = _mm256_max_ps(xmm1, xmm3);
+ src0 += 4;
- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+ xmm1 = _mm256_hadd_ps(xmm1, xmm1);
- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+ xmm3 = _mm256_max_ps(xmm1, xmm3);
- xmm9 = _mm256_add_epi32(xmm11, xmm12);
+ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
- xmm8 = _mm256_add_epi32(xmm8, xmm10);
- }
+ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
- idx = _mm256_set_epi32(1,0,1,0,1,0,1,0);
- xmm10 = _mm256_set1_epi32(2);
- if (num_bytes >> 4 & 1) {
- xmm2 = _mm256_load_ps((float*)src0);
+ xmm9 = _mm256_add_epi32(xmm11, xmm12);
- xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
- xmm8 = bit256_p(&xmm1)->int_vec;
+ xmm8 = _mm256_add_epi32(xmm8, xmm10);
+ }
- xmm2 = _mm256_mul_ps(xmm2, xmm2);
+ idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0);
+ xmm10 = _mm256_set1_epi32(2);
+ if (num_bytes >> 4 & 1) {
+ xmm2 = _mm256_load_ps((float*)src0);
- src0 += 2;
+ xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
+ xmm8 = bit256_p(&xmm1)->int_vec;
- xmm1 = _mm256_hadd_ps(xmm2, xmm2);
+ xmm2 = _mm256_mul_ps(xmm2, xmm2);
- xmm3 = _mm256_max_ps(xmm1, xmm3);
+ src0 += 2;
- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+ xmm1 = _mm256_hadd_ps(xmm2, xmm2);
- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+ xmm3 = _mm256_max_ps(xmm1, xmm3);
- xmm9 = _mm256_add_epi32(xmm11, xmm12);
+ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
- xmm8 = _mm256_add_epi32(xmm8, xmm10);
- }
+ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
- _mm256_store_ps((float*)&(holderf.f), xmm3);
- _mm256_store_si256(&(holderi.int_vec), xmm9);
+ xmm9 = _mm256_add_epi32(xmm11, xmm12);
- target[0] = holderi.i[0];
- sq_dist = holderf.f[0];
- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
- target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
- sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
- target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
- sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
- target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
- sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
- target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
- sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
+ xmm8 = _mm256_add_epi32(xmm8, xmm10);
+ }
+ _mm256_store_ps((float*)&(holderf.f), xmm3);
+ _mm256_store_si256(&(holderi.int_vec), xmm9);
+
+ target[0] = holderi.i[0];
+ sq_dist = holderf.f[0];
+ target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
+ sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
+ target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
+ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
+ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
+ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
+ target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
+ sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
+ target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
+ sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
+ target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
+ sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
+ target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
+ sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
}
#endif /*LV_HAVE_AVX2*/
#ifdef LV_HAVE_SSE3
-#include<xmmintrin.h>
-#include<pmmintrin.h>
+#include <pmmintrin.h>
+#include <xmmintrin.h>
static inline void
-volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0,
- uint32_t num_points)
+volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
{
- const uint32_t num_bytes = num_points*8;
-
- union bit128 holderf;
- union bit128 holderi;
- float sq_dist = 0.0;
-
- union bit128 xmm5, xmm4;
- __m128 xmm1, xmm2, xmm3;
- __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+ const uint32_t num_bytes = num_points * 8;
- xmm5.int_vec = xmmfive = _mm_setzero_si128();
- xmm4.int_vec = xmmfour = _mm_setzero_si128();
- holderf.int_vec = holder0 = _mm_setzero_si128();
- holderi.int_vec = holder1 = _mm_setzero_si128();
+ union bit128 holderf;
+ union bit128 holderi;
+ float sq_dist = 0.0;
- int bound = num_bytes >> 5;
- int i = 0;
+ union bit128 xmm5, xmm4;
+ __m128 xmm1, xmm2, xmm3;
+ __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
- xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order!
- xmm9 = _mm_setzero_si128();
- xmm10 = _mm_set_epi32(4, 4, 4, 4);
- xmm3 = _mm_setzero_ps();
+ xmm5.int_vec = xmmfive = _mm_setzero_si128();
+ xmm4.int_vec = xmmfour = _mm_setzero_si128();
+ holderf.int_vec = holder0 = _mm_setzero_si128();
+ holderi.int_vec = holder1 = _mm_setzero_si128();
- //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]);
+ int bound = num_bytes >> 5;
+ int i = 0;
- for(; i < bound; ++i) {
- xmm1 = _mm_load_ps((float*)src0);
- xmm2 = _mm_load_ps((float*)&src0[2]);
+ xmm8 = _mm_set_epi32(3, 2, 1, 0); // remember the crazy reverse order!
+ xmm9 = _mm_setzero_si128();
+ xmm10 = _mm_set_epi32(4, 4, 4, 4);
+ xmm3 = _mm_setzero_ps();
- src0 += 4;
+ // printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1],
+ // ((float*)&xmm10)[2], ((float*)&xmm10)[3]);
- xmm1 = _mm_mul_ps(xmm1, xmm1);
- xmm2 = _mm_mul_ps(xmm2, xmm2);
+ for (; i < bound; ++i) {
+ xmm1 = _mm_load_ps((float*)src0);
+ xmm2 = _mm_load_ps((float*)&src0[2]);
- xmm1 = _mm_hadd_ps(xmm1, xmm2);
+ src0 += 4;
- xmm3 = _mm_max_ps(xmm1, xmm3);
+ xmm1 = _mm_mul_ps(xmm1, xmm1);
+ xmm2 = _mm_mul_ps(xmm2, xmm2);
- xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
- xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+ xmm1 = _mm_hadd_ps(xmm1, xmm2);
- xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
- xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+ xmm3 = _mm_max_ps(xmm1, xmm3);
- xmm9 = _mm_add_epi32(xmm11, xmm12);
+ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
- xmm8 = _mm_add_epi32(xmm8, xmm10);
+ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
- //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
- //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]);
- }
+ xmm9 = _mm_add_epi32(xmm11, xmm12);
+ xmm8 = _mm_add_epi32(xmm8, xmm10);
- if (num_bytes >> 4 & 1) {
- xmm2 = _mm_load_ps((float*)src0);
-
- xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
- xmm8 = bit128_p(&xmm1)->int_vec;
-
- xmm2 = _mm_mul_ps(xmm2, xmm2);
-
- src0 += 2;
-
- xmm1 = _mm_hadd_ps(xmm2, xmm2);
+ // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1],
+ // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n",
+ // ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2],
+ // ((uint32_t*)&xmm10)[3]);
+ }
- xmm3 = _mm_max_ps(xmm1, xmm3);
- xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]);
+ if (num_bytes >> 4 & 1) {
+ xmm2 = _mm_load_ps((float*)src0);
- xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
- xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+ xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
+ xmm8 = bit128_p(&xmm1)->int_vec;
- xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
- xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+ xmm2 = _mm_mul_ps(xmm2, xmm2);
- xmm9 = _mm_add_epi32(xmm11, xmm12);
+ src0 += 2;
- xmm8 = _mm_add_epi32(xmm8, xmm10);
- //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
- }
+ xmm1 = _mm_hadd_ps(xmm2, xmm2);
- if (num_bytes >> 3 & 1) {
- //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
+ xmm3 = _mm_max_ps(xmm1, xmm3);
- sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
+ xmm10 = _mm_set_epi32(2, 2, 2, 2); // load1_ps((float*)&init[2]);
- xmm2 = _mm_load1_ps(&sq_dist);
+ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
- xmm1 = xmm3;
+ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
- xmm3 = _mm_max_ss(xmm3, xmm2);
+ xmm9 = _mm_add_epi32(xmm11, xmm12);
- xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
- xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+ xmm8 = _mm_add_epi32(xmm8, xmm10);
+ // printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1],
+ // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
+ }
- xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
+ if (num_bytes >> 3 & 1) {
+ // printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1],
+ // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
- xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
- xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
+ sq_dist =
+ lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
- xmm9 = _mm_add_epi32(xmm11, xmm12);
- }
+ xmm2 = _mm_load1_ps(&sq_dist);
- //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
- //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
+ xmm1 = xmm3;
- _mm_store_ps((float*)&(holderf.f), xmm3);
- _mm_store_si128(&(holderi.int_vec), xmm9);
+ xmm3 = _mm_max_ss(xmm3, xmm2);
- target[0] = holderi.i[0];
- sq_dist = holderf.f[0];
- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
+ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
- /*
- float placeholder = 0.0;
- uint32_t temp0, temp1;
- uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
- uint32_t l0 = g0 ^ 1;
+ xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
- uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
- uint32_t l1 = g1 ^ 1;
+ xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
+ xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
- temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
- temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
- sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1];
- placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
+ xmm9 = _mm_add_epi32(xmm11, xmm12);
+ }
- g0 = (sq_dist > placeholder);
- l0 = g0 ^ 1;
- target[0] = g0 * temp0 + l0 * temp1;
- */
+ // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1],
+ // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n",
+ // ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2],
+ // ((uint32_t*)&xmm9)[3]);
+
+ _mm_store_ps((float*)&(holderf.f), xmm3);
+ _mm_store_si128(&(holderi.int_vec), xmm9);
+
+ target[0] = holderi.i[0];
+ sq_dist = holderf.f[0];
+ target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
+ sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
+ target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
+ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
+ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
+ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
+
+ /*
+ float placeholder = 0.0;
+ uint32_t temp0, temp1;
+ uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
+ uint32_t l0 = g0 ^ 1;
+
+ uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
+ uint32_t l1 = g1 ^ 1;
+
+ temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
+ temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
+ sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1];
+ placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
+
+ g0 = (sq_dist > placeholder);
+ l0 = g0 ^ 1;
+ target[0] = g0 * temp0 + l0 * temp1;
+ */
}
#endif /*LV_HAVE_SSE3*/
#ifdef LV_HAVE_GENERIC
static inline void
- volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0,
- uint32_t num_points)
+volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
{
- const uint32_t num_bytes = num_points*8;
+ const uint32_t num_bytes = num_points * 8;
- float sq_dist = 0.0;
- float max = 0.0;
- uint32_t index = 0;
+ float sq_dist = 0.0;
+ float max = 0.0;
+ uint32_t index = 0;
- uint32_t i = 0;
+ uint32_t i = 0;
- for(; i < num_bytes >> 3; ++i) {
- sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
+ for (; i < num_bytes >> 3; ++i) {
+ sq_dist =
+ lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
- index = sq_dist > max ? i : index;
- max = sq_dist > max ? sq_dist : max;
- }
- target[0] = index;
+ index = sq_dist > max ? i : index;
+ max = sq_dist > max ? sq_dist : max;
+ }
+ target[0] = index;
}
#endif /*LV_HAVE_GENERIC*/
#ifndef INCLUDED_volk_32fc_index_max_32u_u_H
#define INCLUDED_volk_32fc_index_max_32u_u_H
+#include <inttypes.h>
+#include <stdio.h>
#include <volk/volk_common.h>
-#include<inttypes.h>
-#include<stdio.h>
-#include<volk/volk_complex.h>
+#include <volk/volk_complex.h>
#ifdef LV_HAVE_AVX2
-#include<immintrin.h>
+#include <immintrin.h>
static inline void
-volk_32fc_index_max_32u_u_avx2(uint32_t* target, lv_32fc_t* src0,
- uint32_t num_points)
+volk_32fc_index_max_32u_u_avx2(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
{
- const uint32_t num_bytes = num_points*8;
-
- union bit256 holderf;
- union bit256 holderi;
- float sq_dist = 0.0;
+ const uint32_t num_bytes = num_points * 8;
- union bit256 xmm5, xmm4;
- __m256 xmm1, xmm2, xmm3;
- __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+ union bit256 holderf;
+ union bit256 holderi;
+ float sq_dist = 0.0;
- xmm5.int_vec = xmmfive = _mm256_setzero_si256();
- xmm4.int_vec = xmmfour = _mm256_setzero_si256();
- holderf.int_vec = holder0 = _mm256_setzero_si256();
- holderi.int_vec = holder1 = _mm256_setzero_si256();
+ union bit256 xmm5, xmm4;
+ __m256 xmm1, xmm2, xmm3;
+ __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
- int bound = num_bytes >> 6;
- int i = 0;
+ xmm5.int_vec = xmmfive = _mm256_setzero_si256();
+ xmm4.int_vec = xmmfour = _mm256_setzero_si256();
+ holderf.int_vec = holder0 = _mm256_setzero_si256();
+ holderi.int_vec = holder1 = _mm256_setzero_si256();
- xmm8 = _mm256_set_epi32(7,6,5,4,3, 2, 1, 0);
- xmm9 = _mm256_setzero_si256();
- xmm10 = _mm256_set1_epi32(8);
- xmm3 = _mm256_setzero_ps();
- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
+ int bound = num_bytes >> 6;
+ int i = 0;
- for(; i < bound; ++i) {
- xmm1 = _mm256_loadu_ps((float*)src0);
- xmm2 = _mm256_loadu_ps((float*)&src0[4]);
+ xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+ xmm9 = _mm256_setzero_si256();
+ xmm10 = _mm256_set1_epi32(8);
+ xmm3 = _mm256_setzero_ps();
+ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
- src0 += 8;
+ for (; i < bound; ++i) {
+ xmm1 = _mm256_loadu_ps((float*)src0);
+ xmm2 = _mm256_loadu_ps((float*)&src0[4]);
- xmm1 = _mm256_mul_ps(xmm1, xmm1);
- xmm2 = _mm256_mul_ps(xmm2, xmm2);
+ src0 += 8;
- xmm1 = _mm256_hadd_ps(xmm1, xmm2);
- xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
+ xmm1 = _mm256_mul_ps(xmm1, xmm1);
+ xmm2 = _mm256_mul_ps(xmm2, xmm2);
- xmm3 = _mm256_max_ps(xmm1, xmm3);
+ xmm1 = _mm256_hadd_ps(xmm1, xmm2);
+ xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+ xmm3 = _mm256_max_ps(xmm1, xmm3);
- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
- xmm9 = _mm256_add_epi32(xmm11, xmm12);
+ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
- xmm8 = _mm256_add_epi32(xmm8, xmm10);
- }
+ xmm9 = _mm256_add_epi32(xmm11, xmm12);
- xmm10 = _mm256_set1_epi32(4);
- if (num_bytes >> 5 & 1) {
- xmm1 = _mm256_loadu_ps((float*)src0);
-
- xmm1 = _mm256_mul_ps(xmm1, xmm1);
+ xmm8 = _mm256_add_epi32(xmm8, xmm10);
+ }
- src0 += 4;
+ xmm10 = _mm256_set1_epi32(4);
+ if (num_bytes >> 5 & 1) {
+ xmm1 = _mm256_loadu_ps((float*)src0);
- xmm1 = _mm256_hadd_ps(xmm1, xmm1);
+ xmm1 = _mm256_mul_ps(xmm1, xmm1);
- xmm3 = _mm256_max_ps(xmm1, xmm3);
+ src0 += 4;
- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+ xmm1 = _mm256_hadd_ps(xmm1, xmm1);
- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+ xmm3 = _mm256_max_ps(xmm1, xmm3);
- xmm9 = _mm256_add_epi32(xmm11, xmm12);
+ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
- xmm8 = _mm256_add_epi32(xmm8, xmm10);
- }
+ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
- idx = _mm256_set_epi32(1,0,1,0,1,0,1,0);
- xmm10 = _mm256_set1_epi32(2);
- if (num_bytes >> 4 & 1) {
- xmm2 = _mm256_loadu_ps((float*)src0);
+ xmm9 = _mm256_add_epi32(xmm11, xmm12);
- xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
- xmm8 = bit256_p(&xmm1)->int_vec;
+ xmm8 = _mm256_add_epi32(xmm8, xmm10);
+ }
- xmm2 = _mm256_mul_ps(xmm2, xmm2);
+ idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0);
+ xmm10 = _mm256_set1_epi32(2);
+ if (num_bytes >> 4 & 1) {
+ xmm2 = _mm256_loadu_ps((float*)src0);
- src0 += 2;
+ xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
+ xmm8 = bit256_p(&xmm1)->int_vec;
- xmm1 = _mm256_hadd_ps(xmm2, xmm2);
+ xmm2 = _mm256_mul_ps(xmm2, xmm2);
- xmm3 = _mm256_max_ps(xmm1, xmm3);
+ src0 += 2;
- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+ xmm1 = _mm256_hadd_ps(xmm2, xmm2);
- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+ xmm3 = _mm256_max_ps(xmm1, xmm3);
- xmm9 = _mm256_add_epi32(xmm11, xmm12);
+ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
- xmm8 = _mm256_add_epi32(xmm8, xmm10);
- }
+ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
- _mm256_storeu_ps((float*)&(holderf.f), xmm3);
- _mm256_storeu_si256(&(holderi.int_vec), xmm9);
+ xmm9 = _mm256_add_epi32(xmm11, xmm12);
- target[0] = holderi.i[0];
- sq_dist = holderf.f[0];
- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
- target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
- sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
- target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
- sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
- target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
- sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
- target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
- sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
+ xmm8 = _mm256_add_epi32(xmm8, xmm10);
+ }
+ _mm256_storeu_ps((float*)&(holderf.f), xmm3);
+ _mm256_storeu_si256(&(holderi.int_vec), xmm9);
+
+ target[0] = holderi.i[0];
+ sq_dist = holderf.f[0];
+ target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
+ sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
+ target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
+ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
+ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
+ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
+ target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
+ sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
+ target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
+ sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
+ target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
+ sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
+ target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
+ sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
}
#endif /*LV_HAVE_AVX2*/
#include <arm_neon.h>
#include <volk/volk_neon_intrinsics.h>
-static inline void volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
+static inline void
+volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
{
unsigned int number = 0;
const uint32_t quarter_points = num_points / 4;
const lv_32fc_t* src0Ptr = src0;
-
- uint32_t indices[4] = {0, 1, 2, 3};
+
+ uint32_t indices[4] = { 0, 1, 2, 3 };
const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
uint32x4_t vec_indices = vld1q_u32(indices);
uint32x4_t vec_max_indices = vec_indices;
-
- if(num_points)
- {
+
+ if (num_points) {
float max = *src0Ptr;
uint32_t index = 0;
-
+
float32x4_t vec_max = vdupq_n_f32(*src0Ptr);
-
- for(;number < quarter_points; number++)
- {
+
+ for (; number < quarter_points; number++) {
// Load complex and compute magnitude squared
- const float32x4_t vec_mag2 = _vmagnitudesquaredq_f32(vld2q_f32((float*)src0Ptr));
- __VOLK_PREFETCH(src0Ptr+=4);
+ const float32x4_t vec_mag2 =
+ _vmagnitudesquaredq_f32(vld2q_f32((float*)src0Ptr));
+ __VOLK_PREFETCH(src0Ptr += 4);
// a > b?
const uint32x4_t gt_mask = vcgtq_f32(vec_mag2, vec_max);
vec_max = vbslq_f32(gt_mask, vec_mag2, vec_max);
float tmp_max[4];
vst1q_u32(tmp_max_indices, vec_max_indices);
vst1q_f32(tmp_max, vec_max);
-
+
for (int i = 0; i < 4; i++) {
if (tmp_max[i] > max) {
max = tmp_max[i];
index = tmp_max_indices[i];
}
}
-
+
// Deal with the rest
- for(number = quarter_points * 4;number < num_points; number++)
- {
+ for (number = quarter_points * 4; number < num_points; number++) {
const float re = lv_creal(*src0Ptr);
const float im = lv_cimag(*src0Ptr);
- if ((re*re+im*im) > max) {
+ if ((re * re + im * im) > max) {
max = *src0Ptr;
index = number;
}
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32fc_magnitude_32f(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points)
- * \endcode
+ * void volk_32fc_magnitude_32f(float* magnitudeVector, const lv_32fc_t* complexVector,
+ *                              unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li complexVector: The complex input vector.
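 *
 * \b Example
 * A minimal usage sketch, not taken from the original header; the volk_malloc-based
 * buffer handling and the sample count are illustrative assumptions:
 * \code
 *   unsigned int num_points = 1024;
 *   size_t alignment = volk_get_alignment();
 *   lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * num_points, alignment);
 *   float* mag = (float*)volk_malloc(sizeof(float) * num_points, alignment);
 *   // ... fill `in` with interleaved complex samples ...
 *   volk_32fc_magnitude_32f(mag, in, num_points); // mag[i] = sqrtf(re*re + im*im)
 *   volk_free(in);
 *   volk_free(mag);
 * \endcode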
#define INCLUDED_volk_32fc_magnitude_32f_u_H
#include <inttypes.h>
-#include <stdio.h>
#include <math.h>
+#include <stdio.h>
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>
-static inline void
-volk_32fc_magnitude_32f_u_avx(float* magnitudeVector, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_magnitude_32f_u_avx(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- const float* complexVectorPtr = (float*) complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
-
- __m256 cplxValue1, cplxValue2, result;
-
- for(; number < eighthPoints; number++){
- cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
- cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8);
- result = _mm256_magnitude_ps(cplxValue1, cplxValue2);
- _mm256_storeu_ps(magnitudeVectorPtr, result);
-
- complexVectorPtr += 16;
- magnitudeVectorPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- float val1Real = *complexVectorPtr++;
- float val1Imag = *complexVectorPtr++;
- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
- }
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m256 cplxValue1, cplxValue2, result;
+
+ for (; number < eighthPoints; number++) {
+ cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+ cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8);
+ result = _mm256_magnitude_ps(cplxValue1, cplxValue2);
+ _mm256_storeu_ps(magnitudeVectorPtr, result);
+
+ complexVectorPtr += 16;
+ magnitudeVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
}
#endif /* LV_HAVE_AVX */
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>
-static inline void
-volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- const float* complexVectorPtr = (float*) complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
- __m128 cplxValue1, cplxValue2, result;
- for(; number < quarterPoints; number++){
- cplxValue1 = _mm_loadu_ps(complexVectorPtr);
- complexVectorPtr += 4;
+ __m128 cplxValue1, cplxValue2, result;
+ for (; number < quarterPoints; number++) {
+ cplxValue1 = _mm_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 4;
- cplxValue2 = _mm_loadu_ps(complexVectorPtr);
- complexVectorPtr += 4;
+ cplxValue2 = _mm_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 4;
- result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2);
+ result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2);
- _mm_storeu_ps(magnitudeVectorPtr, result);
- magnitudeVectorPtr += 4;
- }
+ _mm_storeu_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- float val1Real = *complexVectorPtr++;
- float val1Imag = *complexVectorPtr++;
- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
#include <volk/volk_sse_intrinsics.h>
+#include <xmmintrin.h>
-static inline void
-volk_32fc_magnitude_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_magnitude_32f_u_sse(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- const float* complexVectorPtr = (float*) complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
- __m128 cplxValue1, cplxValue2, result;
+ __m128 cplxValue1, cplxValue2, result;
- for(; number < quarterPoints; number++){
- cplxValue1 = _mm_loadu_ps(complexVectorPtr);
- complexVectorPtr += 4;
+ for (; number < quarterPoints; number++) {
+ cplxValue1 = _mm_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 4;
- cplxValue2 = _mm_loadu_ps(complexVectorPtr);
- complexVectorPtr += 4;
+ cplxValue2 = _mm_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 4;
- result = _mm_magnitude_ps(cplxValue1, cplxValue2);
- _mm_storeu_ps(magnitudeVectorPtr, result);
- magnitudeVectorPtr += 4;
- }
+ result = _mm_magnitude_ps(cplxValue1, cplxValue2);
+ _mm_storeu_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- float val1Real = *complexVectorPtr++;
- float val1Imag = *complexVectorPtr++;
- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32fc_magnitude_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points)
+static inline void volk_32fc_magnitude_32f_generic(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- const float* complexVectorPtr = (float*)complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
- unsigned int number = 0;
- for(number = 0; number < num_points; number++){
- const float real = *complexVectorPtr++;
- const float imag = *complexVectorPtr++;
- *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
- }
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+ unsigned int number = 0;
+ for (number = 0; number < num_points; number++) {
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
+ }
}
#endif /* LV_HAVE_GENERIC */
-
#endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */
#ifndef INCLUDED_volk_32fc_magnitude_32f_a_H
#define INCLUDED_volk_32fc_magnitude_32f_a_H
#include <inttypes.h>
-#include <stdio.h>
#include <math.h>
+#include <stdio.h>
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>
-static inline void
-volk_32fc_magnitude_32f_a_avx(float* magnitudeVector, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_magnitude_32f_a_avx(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- const float* complexVectorPtr = (float*) complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
-
- __m256 cplxValue1, cplxValue2, result;
- for(; number < eighthPoints; number++){
- cplxValue1 = _mm256_load_ps(complexVectorPtr);
- complexVectorPtr += 8;
-
- cplxValue2 = _mm256_load_ps(complexVectorPtr);
- complexVectorPtr += 8;
-
- result = _mm256_magnitude_ps(cplxValue1, cplxValue2);
- _mm256_store_ps(magnitudeVectorPtr, result);
- magnitudeVectorPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- float val1Real = *complexVectorPtr++;
- float val1Imag = *complexVectorPtr++;
- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
- }
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m256 cplxValue1, cplxValue2, result;
+ for (; number < eighthPoints; number++) {
+ cplxValue1 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue2 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ result = _mm256_magnitude_ps(cplxValue1, cplxValue2);
+ _mm256_store_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
}
#endif /* LV_HAVE_AVX */
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>
-static inline void
-volk_32fc_magnitude_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_magnitude_32f_a_sse3(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- const float* complexVectorPtr = (float*) complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
-
- __m128 cplxValue1, cplxValue2, result;
- for(; number < quarterPoints; number++){
- cplxValue1 = _mm_load_ps(complexVectorPtr);
- complexVectorPtr += 4;
-
- cplxValue2 = _mm_load_ps(complexVectorPtr);
- complexVectorPtr += 4;
-
- result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2);
- _mm_store_ps(magnitudeVectorPtr, result);
- magnitudeVectorPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- float val1Real = *complexVectorPtr++;
- float val1Imag = *complexVectorPtr++;
- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
- }
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 cplxValue1, cplxValue2, result;
+ for (; number < quarterPoints; number++) {
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2);
+ _mm_store_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
#include <volk/volk_sse_intrinsics.h>
+#include <xmmintrin.h>
-static inline void
-volk_32fc_magnitude_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_magnitude_32f_a_sse(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- const float* complexVectorPtr = (float*) complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
-
- __m128 cplxValue1, cplxValue2, result;
- for(; number < quarterPoints; number++){
- cplxValue1 = _mm_load_ps(complexVectorPtr);
- complexVectorPtr += 4;
-
- cplxValue2 = _mm_load_ps(complexVectorPtr);
- complexVectorPtr += 4;
-
- result = _mm_magnitude_ps(cplxValue1, cplxValue2);
- _mm_store_ps(magnitudeVectorPtr, result);
- magnitudeVectorPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- float val1Real = *complexVectorPtr++;
- float val1Imag = *complexVectorPtr++;
- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
- }
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 cplxValue1, cplxValue2, result;
+ for (; number < quarterPoints; number++) {
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ result = _mm_magnitude_ps(cplxValue1, cplxValue2);
+ _mm_store_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32fc_magnitude_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_magnitude_32f_a_generic(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- const float* complexVectorPtr = (float*)complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
- unsigned int number = 0;
- for(number = 0; number < num_points; number++){
- const float real = *complexVectorPtr++;
- const float imag = *complexVectorPtr++;
- *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
- }
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+ unsigned int number = 0;
+ for (number = 0; number < num_points; number++) {
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_32fc_magnitude_32f_neon(float* magnitudeVector, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_magnitude_32f_neon(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number;
- unsigned int quarter_points = num_points / 4;
- const float* complexVectorPtr = (float*)complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
-
- float32x4x2_t complex_vec;
- float32x4_t magnitude_vec;
- for(number = 0; number < quarter_points; number++){
- complex_vec = vld2q_f32(complexVectorPtr);
- complex_vec.val[0] = vmulq_f32(complex_vec.val[0], complex_vec.val[0]);
- magnitude_vec = vmlaq_f32(complex_vec.val[0], complex_vec.val[1], complex_vec.val[1]);
- magnitude_vec = vrsqrteq_f32(magnitude_vec);
- magnitude_vec = vrecpeq_f32( magnitude_vec ); // no plain ol' sqrt
- vst1q_f32(magnitudeVectorPtr, magnitude_vec);
-
- complexVectorPtr += 8;
- magnitudeVectorPtr += 4;
- }
-
- for(number = quarter_points*4; number < num_points; number++){
- const float real = *complexVectorPtr++;
- const float imag = *complexVectorPtr++;
- *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
- }
+ unsigned int number;
+ unsigned int quarter_points = num_points / 4;
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ float32x4x2_t complex_vec;
+ float32x4_t magnitude_vec;
+ for (number = 0; number < quarter_points; number++) {
+ complex_vec = vld2q_f32(complexVectorPtr);
+ complex_vec.val[0] = vmulq_f32(complex_vec.val[0], complex_vec.val[0]);
+ magnitude_vec =
+ vmlaq_f32(complex_vec.val[0], complex_vec.val[1], complex_vec.val[1]);
+ magnitude_vec = vrsqrteq_f32(magnitude_vec);
+ magnitude_vec = vrecpeq_f32(magnitude_vec); // no plain ol' sqrt
+ vst1q_f32(magnitudeVectorPtr, magnitude_vec);
+
+ complexVectorPtr += 8;
+ magnitudeVectorPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_NEON
/*!
- \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+ \brief Calculates the magnitude of the complexVector and stores the results in the
+ magnitudeVector
This is an approximation from "Streamlining Digital Signal Processing" by
Richard Lyons. Apparently max error is about 1% and mean error is about 0.6%.
\param complexVector The vector containing the complex input values
\param magnitudeVector The vector containing the real output values
- \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+ \param num_points The number of complex values in complexVector to be calculated and
+ stored into magnitudeVector
*/
-static inline void
-volk_32fc_magnitude_32f_neon_fancy_sweet(float* magnitudeVector, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_magnitude_32f_neon_fancy_sweet(
+ float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points)
{
- unsigned int number;
- unsigned int quarter_points = num_points / 4;
- const float* complexVectorPtr = (float*)complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
-
- const float threshold = 0.4142135;
-
- float32x4_t a_vec, b_vec, a_high, a_low, b_high, b_low;
- a_high = vdupq_n_f32( 0.84 );
- b_high = vdupq_n_f32( 0.561);
- a_low = vdupq_n_f32( 0.99 );
- b_low = vdupq_n_f32( 0.197);
-
- uint32x4_t comp0, comp1;
-
- float32x4x2_t complex_vec;
- float32x4_t min_vec, max_vec, magnitude_vec;
- float32x4_t real_abs, imag_abs;
- for(number = 0; number < quarter_points; number++){
- complex_vec = vld2q_f32(complexVectorPtr);
-
- real_abs = vabsq_f32(complex_vec.val[0]);
- imag_abs = vabsq_f32(complex_vec.val[1]);
-
- min_vec = vminq_f32(real_abs, imag_abs);
- max_vec = vmaxq_f32(real_abs, imag_abs);
-
- // effective branch to choose coefficient pair.
- comp0 = vcgtq_f32(min_vec, vmulq_n_f32(max_vec, threshold));
- comp1 = vcleq_f32(min_vec, vmulq_n_f32(max_vec, threshold));
-
- // and 0s or 1s with coefficients from previous effective branch
- a_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)a_high),
- vandq_s32((int32x4_t)comp1, (int32x4_t)a_low));
- b_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)b_high),
- vandq_s32((int32x4_t)comp1, (int32x4_t)b_low));
-
- // coefficients chosen, do the weighted sum
- min_vec = vmulq_f32(min_vec, b_vec);
- max_vec = vmulq_f32(max_vec, a_vec);
-
- magnitude_vec = vaddq_f32(min_vec, max_vec);
- vst1q_f32(magnitudeVectorPtr, magnitude_vec);
-
- complexVectorPtr += 8;
- magnitudeVectorPtr += 4;
- }
-
- for(number = quarter_points*4; number < num_points; number++){
- const float real = *complexVectorPtr++;
- const float imag = *complexVectorPtr++;
- *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
- }
+ unsigned int number;
+ unsigned int quarter_points = num_points / 4;
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ const float threshold = 0.4142135;
+
+ float32x4_t a_vec, b_vec, a_high, a_low, b_high, b_low;
+ a_high = vdupq_n_f32(0.84);
+ b_high = vdupq_n_f32(0.561);
+ a_low = vdupq_n_f32(0.99);
+ b_low = vdupq_n_f32(0.197);
+
+ uint32x4_t comp0, comp1;
+
+ float32x4x2_t complex_vec;
+ float32x4_t min_vec, max_vec, magnitude_vec;
+ float32x4_t real_abs, imag_abs;
+ for (number = 0; number < quarter_points; number++) {
+ complex_vec = vld2q_f32(complexVectorPtr);
+
+ real_abs = vabsq_f32(complex_vec.val[0]);
+ imag_abs = vabsq_f32(complex_vec.val[1]);
+
+ min_vec = vminq_f32(real_abs, imag_abs);
+ max_vec = vmaxq_f32(real_abs, imag_abs);
+
+ // effective branch to choose coefficient pair.
+ comp0 = vcgtq_f32(min_vec, vmulq_n_f32(max_vec, threshold));
+ comp1 = vcleq_f32(min_vec, vmulq_n_f32(max_vec, threshold));
+
+ // and 0s or 1s with coefficients from previous effective branch
+ a_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)a_high),
+ vandq_s32((int32x4_t)comp1, (int32x4_t)a_low));
+ b_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)b_high),
+ vandq_s32((int32x4_t)comp1, (int32x4_t)b_low));
+
+ // coefficients chosen, do the weighted sum
+ min_vec = vmulq_f32(min_vec, b_vec);
+ max_vec = vmulq_f32(max_vec, a_vec);
+
+ magnitude_vec = vaddq_f32(min_vec, max_vec);
+ vst1q_f32(magnitudeVectorPtr, magnitude_vec);
+
+ complexVectorPtr += 8;
+ magnitudeVectorPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
+ }
}
#endif /* LV_HAVE_NEON */
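/* The coefficient selection above is easier to follow in scalar form. The helper
 * below is an illustrative sketch of the same alpha*max + beta*min approximation,
 * reusing the constants from the NEON kernel (0.84/0.561 above the 0.4142135
 * threshold, 0.99/0.197 below it); the function name is not part of the VOLK API. */
static inline float volk_magnitude_approx_scalar_sketch(float real, float imag)
{
    const float re = fabsf(real);
    const float im = fabsf(imag);
    const float min_v = (re < im) ? re : im;
    const float max_v = (re < im) ? im : re;
    /* min/max above tan(pi/8) selects the "high" coefficient pair. */
    if (min_v > 0.4142135f * max_v)
        return 0.84f * max_v + 0.561f * min_v;
    return 0.99f * max_v + 0.197f * min_v;
}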
#ifdef LV_HAVE_ORC
-extern void
-volk_32fc_magnitude_32f_a_orc_impl(float* magnitudeVector, const lv_32fc_t* complexVector,
- unsigned int num_points);
+extern void volk_32fc_magnitude_32f_a_orc_impl(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points);
-static inline void
-volk_32fc_magnitude_32f_u_orc(float* magnitudeVector, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_magnitude_32f_u_orc(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- volk_32fc_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, num_points);
+ volk_32fc_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, num_points);
}
#endif /* LV_HAVE_ORC */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32fc_magnitude_squared_32f(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points)
- * \endcode
+ * void volk_32fc_magnitude_squared_32f(float* magnitudeVector,
+ *                                      const lv_32fc_t* complexVector,
+ *                                      unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li complexVector: The complex input vector.
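 *
 * \b Example
 * An illustrative sketch; the buffers and sample count follow the magnitude example
 * above and are assumptions, not taken from the original header:
 * \code
 *   // out[i] = re*re + im*im, i.e. |in[i]|^2 without the square root
 *   volk_32fc_magnitude_squared_32f(out, in, num_points);
 * \endcode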
#define INCLUDED_volk_32fc_magnitude_squared_32f_u_H
#include <inttypes.h>
-#include <stdio.h>
#include <math.h>
+#include <stdio.h>
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>
-static inline void
-volk_32fc_magnitude_squared_32f_u_avx(float* magnitudeVector, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_magnitude_squared_32f_u_avx(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- const float* complexVectorPtr = (float*) complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
-
- __m256 cplxValue1, cplxValue2, result;
-
- for(; number < eighthPoints; number++){
- cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
- cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8);
- result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2);
- _mm256_storeu_ps(magnitudeVectorPtr, result);
-
- complexVectorPtr += 16;
- magnitudeVectorPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- float val1Real = *complexVectorPtr++;
- float val1Imag = *complexVectorPtr++;
- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
- }
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m256 cplxValue1, cplxValue2, result;
+
+ for (; number < eighthPoints; number++) {
+ cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+ cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8);
+ result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2);
+ _mm256_storeu_ps(magnitudeVectorPtr, result);
+
+ complexVectorPtr += 16;
+ magnitudeVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+ }
}
#endif /* LV_HAVE_AVX */
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>
-static inline void
-volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- const float* complexVectorPtr = (float*) complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
-
- __m128 cplxValue1, cplxValue2, result;
- for(; number < quarterPoints; number++){
- cplxValue1 = _mm_loadu_ps(complexVectorPtr);
- complexVectorPtr += 4;
-
- cplxValue2 = _mm_loadu_ps(complexVectorPtr);
- complexVectorPtr += 4;
-
- result = _mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2);
- _mm_storeu_ps(magnitudeVectorPtr, result);
- magnitudeVectorPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- float val1Real = *complexVectorPtr++;
- float val1Imag = *complexVectorPtr++;
- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
- }
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 cplxValue1, cplxValue2, result;
+ for (; number < quarterPoints; number++) {
+ cplxValue1 = _mm_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ result = _mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2);
+ _mm_storeu_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+ }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
#include <volk/volk_sse_intrinsics.h>
+#include <xmmintrin.h>
-static inline void
-volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- const float* complexVectorPtr = (float*) complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
- __m128 cplxValue1, cplxValue2, result;
+ __m128 cplxValue1, cplxValue2, result;
- for(; number < quarterPoints; number++){
- cplxValue1 = _mm_loadu_ps(complexVectorPtr);
- complexVectorPtr += 4;
+ for (; number < quarterPoints; number++) {
+ cplxValue1 = _mm_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 4;
- cplxValue2 = _mm_loadu_ps(complexVectorPtr);
- complexVectorPtr += 4;
+ cplxValue2 = _mm_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 4;
- result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2);
- _mm_storeu_ps(magnitudeVectorPtr, result);
- magnitudeVectorPtr += 4;
- }
+ result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2);
+ _mm_storeu_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- float val1Real = *complexVectorPtr++;
- float val1Imag = *complexVectorPtr++;
- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32fc_magnitude_squared_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_magnitude_squared_32f_generic(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- const float* complexVectorPtr = (float*)complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
- unsigned int number = 0;
- for(number = 0; number < num_points; number++){
- const float real = *complexVectorPtr++;
- const float imag = *complexVectorPtr++;
- *magnitudeVectorPtr++ = (real*real) + (imag*imag);
- }
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+ unsigned int number = 0;
+ for (number = 0; number < num_points; number++) {
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (real * real) + (imag * imag);
+ }
}
#endif /* LV_HAVE_GENERIC */
-
#endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */
#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_a_H
#define INCLUDED_volk_32fc_magnitude_squared_32f_a_H
#include <inttypes.h>
-#include <stdio.h>
#include <math.h>
+#include <stdio.h>
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>
-static inline void
-volk_32fc_magnitude_squared_32f_a_avx(float* magnitudeVector, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_magnitude_squared_32f_a_avx(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- const float* complexVectorPtr = (float*) complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
-
- __m256 cplxValue1, cplxValue2, result;
- for(; number < eighthPoints; number++){
- cplxValue1 = _mm256_load_ps(complexVectorPtr);
- complexVectorPtr += 8;
-
- cplxValue2 = _mm256_load_ps(complexVectorPtr);
- complexVectorPtr += 8;
-
- result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2);
- _mm256_store_ps(magnitudeVectorPtr, result);
- magnitudeVectorPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- float val1Real = *complexVectorPtr++;
- float val1Imag = *complexVectorPtr++;
- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
- }
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m256 cplxValue1, cplxValue2, result;
+ for (; number < eighthPoints; number++) {
+ cplxValue1 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue2 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2);
+ _mm256_store_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+ }
}
#endif /* LV_HAVE_AVX */
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>
-static inline void
-volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- const float* complexVectorPtr = (float*) complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
-
- __m128 cplxValue1, cplxValue2, result;
- for(; number < quarterPoints; number++){
- cplxValue1 = _mm_load_ps(complexVectorPtr);
- complexVectorPtr += 4;
-
- cplxValue2 = _mm_load_ps(complexVectorPtr);
- complexVectorPtr += 4;
-
- result = _mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2);
- _mm_store_ps(magnitudeVectorPtr, result);
- magnitudeVectorPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- float val1Real = *complexVectorPtr++;
- float val1Imag = *complexVectorPtr++;
- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
- }
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 cplxValue1, cplxValue2, result;
+ for (; number < quarterPoints; number++) {
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ result = _mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2);
+ _mm_store_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+ }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
#include <volk/volk_sse_intrinsics.h>
+#include <xmmintrin.h>
-static inline void
-volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- const float* complexVectorPtr = (float*)complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
-
- __m128 cplxValue1, cplxValue2, result;
- for(;number < quarterPoints; number++){
- cplxValue1 = _mm_load_ps(complexVectorPtr);
- complexVectorPtr += 4;
-
- cplxValue2 = _mm_load_ps(complexVectorPtr);
- complexVectorPtr += 4;
-
- result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2);
- _mm_store_ps(magnitudeVectorPtr, result);
- magnitudeVectorPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- float val1Real = *complexVectorPtr++;
- float val1Imag = *complexVectorPtr++;
- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
- }
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 cplxValue1, cplxValue2, result;
+ for (; number < quarterPoints; number++) {
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2);
+ _mm_store_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_32fc_magnitude_squared_32f_neon(float* magnitudeVector, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_magnitude_squared_32f_neon(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- const float* complexVectorPtr = (float*)complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
-
- float32x4x2_t cmplx_val;
- float32x4_t result;
- for(;number < quarterPoints; number++){
- cmplx_val = vld2q_f32(complexVectorPtr);
- complexVectorPtr += 8;
-
- cmplx_val.val[0] = vmulq_f32(cmplx_val.val[0], cmplx_val.val[0]); // Square the values
- cmplx_val.val[1] = vmulq_f32(cmplx_val.val[1], cmplx_val.val[1]); // Square the values
-
- result = vaddq_f32(cmplx_val.val[0], cmplx_val.val[1]); // Add the I2 and Q2 values
-
- vst1q_f32(magnitudeVectorPtr, result);
- magnitudeVectorPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- float val1Real = *complexVectorPtr++;
- float val1Imag = *complexVectorPtr++;
- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
- }
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ float32x4x2_t cmplx_val;
+ float32x4_t result;
+ for (; number < quarterPoints; number++) {
+ cmplx_val = vld2q_f32(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cmplx_val.val[0] =
+ vmulq_f32(cmplx_val.val[0], cmplx_val.val[0]); // Square the values
+ cmplx_val.val[1] =
+ vmulq_f32(cmplx_val.val[1], cmplx_val.val[1]); // Square the values
+
+ result =
+ vaddq_f32(cmplx_val.val[0], cmplx_val.val[1]); // Add the I2 and Q2 values
+
+ vst1q_f32(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector,
- unsigned int num_points)
+static inline void volk_32fc_magnitude_squared_32f_a_generic(
+ float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points)
{
- const float* complexVectorPtr = (float*)complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
- unsigned int number = 0;
- for(number = 0; number < num_points; number++){
- const float real = *complexVectorPtr++;
- const float imag = *complexVectorPtr++;
- *magnitudeVectorPtr++ = (real*real) + (imag*imag);
- }
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+ unsigned int number = 0;
+ for (number = 0; number < num_points; number++) {
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (real * real) + (imag * imag);
+ }
}
#endif /* LV_HAVE_GENERIC */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32fc_s32f_atan2_32f(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points)
- * \endcode
+ * void volk_32fc_s32f_atan2_32f(float* outputVector, const lv_32fc_t* complexVector,
+ *                               const float normalizeFactor, unsigned int num_points)
+ * \endcode
*
* \b Inputs
- * \li inputVector: The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin).
- * \li normalizeFactor: The atan results are divided by this normalization factor.
- * \li num_points: The number of complex values in \p inputVector.
+ * \li inputVector: The byte-aligned input vector containing interleaved IQ data
+ *     (I = cos, Q = sin).
+ * \li normalizeFactor: The atan results are divided by this normalization factor.
+ * \li num_points: The number of complex values in \p inputVector.
*
* \b Outputs
* \li outputVector: The vector where the results will be stored.
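 *
 * \b Example
 * An illustrative sketch; the buffers and the choice of normalizeFactor are
 * assumptions. Since each result is divided by normalizeFactor, passing M_PI maps
 * the phase from (-pi, pi] into (-1, 1]:
 * \code
 *   // phase[i] = atan2f(imag(in[i]), real(in[i])) / M_PI
 *   volk_32fc_s32f_atan2_32f(phase, in, (float)M_PI, num_points);
 * \endcode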
#define INCLUDED_volk_32fc_s32f_atan2_32f_a_H
#include <inttypes.h>
-#include <stdio.h>
#include <math.h>
+#include <stdio.h>
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
#include <simdmath.h>
#endif /* LV_HAVE_LIB_SIMDMATH */
-static inline void volk_32fc_s32f_atan2_32f_a_sse4_1(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){
- const float* complexVectorPtr = (float*)complexVector;
- float* outPtr = outputVector;
+static inline void volk_32fc_s32f_atan2_32f_a_sse4_1(float* outputVector,
+ const lv_32fc_t* complexVector,
+ const float normalizeFactor,
+ unsigned int num_points)
+{
+ const float* complexVectorPtr = (float*)complexVector;
+ float* outPtr = outputVector;
- unsigned int number = 0;
- const float invNormalizeFactor = 1.0 / normalizeFactor;
+ unsigned int number = 0;
+ const float invNormalizeFactor = 1.0 / normalizeFactor;
#ifdef LV_HAVE_LIB_SIMDMATH
- const unsigned int quarterPoints = num_points / 4;
- __m128 testVector = _mm_set_ps1(2*M_PI);
- __m128 correctVector = _mm_set_ps1(M_PI);
- __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor);
- __m128 phase;
- __m128 complex1, complex2, iValue, qValue;
- __m128 keepMask;
-
- for (; number < quarterPoints; number++) {
- // Load IQ data:
- complex1 = _mm_load_ps(complexVectorPtr);
- complexVectorPtr += 4;
- complex2 = _mm_load_ps(complexVectorPtr);
- complexVectorPtr += 4;
- // Deinterleave IQ data:
- iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0));
- qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1));
- // Arctan to get phase:
- phase = atan2f4(qValue, iValue);
- // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi.
- // Compare to 2pi:
- keepMask = _mm_cmpneq_ps(phase,testVector);
- phase = _mm_blendv_ps(correctVector, phase, keepMask);
- // done with above correction.
- phase = _mm_mul_ps(phase, vNormalizeFactor);
- _mm_store_ps((float*)outPtr, phase);
- outPtr += 4;
- }
- number = quarterPoints * 4;
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 testVector = _mm_set_ps1(2 * M_PI);
+ __m128 correctVector = _mm_set_ps1(M_PI);
+ __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor);
+ __m128 phase;
+ __m128 complex1, complex2, iValue, qValue;
+ __m128 keepMask;
+
+ for (; number < quarterPoints; number++) {
+ // Load IQ data:
+ complex1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+ complex2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+ // Deinterleave IQ data:
+ iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2, 0, 2, 0));
+ qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3, 1, 3, 1));
+ // Arctan to get phase:
+ phase = atan2f4(qValue, iValue);
+ // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi.
+ // Compare to 2pi:
+ keepMask = _mm_cmpneq_ps(phase, testVector);
+ phase = _mm_blendv_ps(correctVector, phase, keepMask);
+ // done with above correction.
+ phase = _mm_mul_ps(phase, vNormalizeFactor);
+ _mm_store_ps((float*)outPtr, phase);
+ outPtr += 4;
+ }
+ number = quarterPoints * 4;
#endif /* LV_HAVE_SIMDMATH_H */
- for (; number < num_points; number++) {
- const float real = *complexVectorPtr++;
- const float imag = *complexVectorPtr++;
- *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
- }
+ for (; number < num_points; number++) {
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
+ }
}
#endif /* LV_HAVE_SSE4_1 */
#include <simdmath.h>
#endif /* LV_HAVE_LIB_SIMDMATH */
-static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){
- const float* complexVectorPtr = (float*)complexVector;
- float* outPtr = outputVector;
+static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector,
+ const lv_32fc_t* complexVector,
+ const float normalizeFactor,
+ unsigned int num_points)
+{
+ const float* complexVectorPtr = (float*)complexVector;
+ float* outPtr = outputVector;
- unsigned int number = 0;
- const float invNormalizeFactor = 1.0 / normalizeFactor;
+ unsigned int number = 0;
+ const float invNormalizeFactor = 1.0 / normalizeFactor;
#ifdef LV_HAVE_LIB_SIMDMATH
- const unsigned int quarterPoints = num_points / 4;
- __m128 testVector = _mm_set_ps1(2*M_PI);
- __m128 correctVector = _mm_set_ps1(M_PI);
- __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor);
- __m128 phase;
- __m128 complex1, complex2, iValue, qValue;
- __m128 mask;
- __m128 keepMask;
-
- for (; number < quarterPoints; number++) {
- // Load IQ data:
- complex1 = _mm_load_ps(complexVectorPtr);
- complexVectorPtr += 4;
- complex2 = _mm_load_ps(complexVectorPtr);
- complexVectorPtr += 4;
- // Deinterleave IQ data:
- iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0));
- qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1));
- // Arctan to get phase:
- phase = atan2f4(qValue, iValue);
- // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi.
- // Compare to 2pi:
- keepMask = _mm_cmpneq_ps(phase,testVector);
- phase = _mm_and_ps(phase, keepMask);
- mask = _mm_andnot_ps(keepMask, correctVector);
- phase = _mm_or_ps(phase, mask);
- // done with above correction.
- phase = _mm_mul_ps(phase, vNormalizeFactor);
- _mm_store_ps((float*)outPtr, phase);
- outPtr += 4;
- }
- number = quarterPoints * 4;
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 testVector = _mm_set_ps1(2 * M_PI);
+ __m128 correctVector = _mm_set_ps1(M_PI);
+ __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor);
+ __m128 phase;
+ __m128 complex1, complex2, iValue, qValue;
+ __m128 mask;
+ __m128 keepMask;
+
+ for (; number < quarterPoints; number++) {
+ // Load IQ data:
+ complex1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+ complex2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+ // Deinterleave IQ data:
+ iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2, 0, 2, 0));
+ qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3, 1, 3, 1));
+ // Arctan to get phase:
+ phase = atan2f4(qValue, iValue);
+ // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi.
+ // Compare to 2pi:
+ keepMask = _mm_cmpneq_ps(phase, testVector);
+ phase = _mm_and_ps(phase, keepMask);
+ mask = _mm_andnot_ps(keepMask, correctVector);
+ phase = _mm_or_ps(phase, mask);
+ // done with above correction.
+ phase = _mm_mul_ps(phase, vNormalizeFactor);
+ _mm_store_ps((float*)outPtr, phase);
+ outPtr += 4;
+ }
+ number = quarterPoints * 4;
#endif /* LV_HAVE_SIMDMATH_H */
- for (; number < num_points; number++) {
- const float real = *complexVectorPtr++;
- const float imag = *complexVectorPtr++;
- *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
- }
+ for (; number < num_points; number++) {
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){
- float* outPtr = outputVector;
- const float* inPtr = (float*)inputVector;
- const float invNormalizeFactor = 1.0 / normalizeFactor;
- unsigned int number;
- for ( number = 0; number < num_points; number++) {
- const float real = *inPtr++;
- const float imag = *inPtr++;
- *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
- }
+static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector,
+ const lv_32fc_t* inputVector,
+ const float normalizeFactor,
+ unsigned int num_points)
+{
+ float* outPtr = outputVector;
+ const float* inPtr = (float*)inputVector;
+ const float invNormalizeFactor = 1.0 / normalizeFactor;
+ unsigned int number;
+ for (number = 0; number < num_points; number++) {
+ const float real = *inPtr++;
+ const float imag = *inPtr++;
+ *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
+ }
}
#endif /* LV_HAVE_GENERIC */
-
-
#endif /* INCLUDED_volk_32fc_s32f_atan2_32f_a_H */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32fc_s32f_deinterleave_real_16i(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points)
- * \endcode
+ * void volk_32fc_s32f_deinterleave_real_16i(int16_t* iBuffer,
+ *                                           const lv_32fc_t* complexVector,
+ *                                           const float scalar,
+ *                                           unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li complexVector: The complex input vector.
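 *
 * \b Example
 * An illustrative sketch; the scale factor and buffers are assumptions. Each output
 * is the real part of the corresponding sample, multiplied by scalar and truncated
 * to int16_t:
 * \code
 *   // i_out[i] = (int16_t)(lv_creal(in[i]) * 32767.0f)
 *   volk_32fc_s32f_deinterleave_real_16i(i_out, in, 32767.0f, num_points);
 * \endcode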
#ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H
#define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H
-#include <volk/volk_common.h>
#include <inttypes.h>
#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void
-volk_32fc_s32f_deinterleave_real_16i_a_avx2(int16_t* iBuffer, const lv_32fc_t* complexVector,
- const float scalar, unsigned int num_points)
+volk_32fc_s32f_deinterleave_real_16i_a_avx2(int16_t* iBuffer,
+ const lv_32fc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- const float* complexVectorPtr = (float*)complexVector;
- int16_t* iBufferPtr = iBuffer;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- __m256 vScalar = _mm256_set1_ps(scalar);
+ const float* complexVectorPtr = (float*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
- __m256 cplxValue1, cplxValue2, iValue;
- __m256i a;
- __m128i b;
+ __m256 vScalar = _mm256_set1_ps(scalar);
- __m256i idx = _mm256_set_epi32(3,3,3,3,5,1,4,0);
+ __m256 cplxValue1, cplxValue2, iValue;
+ __m256i a;
+ __m128i b;
- for(;number < eighthPoints; number++){
- cplxValue1 = _mm256_load_ps(complexVectorPtr);
- complexVectorPtr += 8;
+ __m256i idx = _mm256_set_epi32(3, 3, 3, 3, 5, 1, 4, 0);
- cplxValue2 = _mm256_load_ps(complexVectorPtr);
- complexVectorPtr += 8;
+ for (; number < eighthPoints; number++) {
+ cplxValue1 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
- // Arrange in i1i2i3i4 format
- iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+ cplxValue2 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
- iValue = _mm256_mul_ps(iValue, vScalar);
+ // Arrange in i1i2i3i4 format
+ iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
- iValue = _mm256_round_ps(iValue, _MM_FROUND_TO_ZERO);
- a = _mm256_cvtps_epi32(iValue);
- a = _mm256_packs_epi32(a,a);
- a = _mm256_permutevar8x32_epi32(a,idx);
- b = _mm256_extracti128_si256(a,0);
+ iValue = _mm256_mul_ps(iValue, vScalar);
- _mm_store_si128((__m128i*)iBufferPtr,b);
- iBufferPtr += 8;
+ iValue = _mm256_round_ps(iValue, _MM_FROUND_TO_ZERO);
+ a = _mm256_cvtps_epi32(iValue);
+ a = _mm256_packs_epi32(a, a);
+ a = _mm256_permutevar8x32_epi32(a, idx);
+ b = _mm256_extracti128_si256(a, 0);
- }
+ _mm_store_si128((__m128i*)iBufferPtr, b);
+ iBufferPtr += 8;
+ }
- number = eighthPoints * 8;
- iBufferPtr = &iBuffer[number];
- for(; number < num_points; number++){
- *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
- complexVectorPtr++;
- }
+ number = eighthPoints * 8;
+ iBufferPtr = &iBuffer[number];
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
+ complexVectorPtr++;
+ }
}
#include <xmmintrin.h>
static inline void
-volk_32fc_s32f_deinterleave_real_16i_a_sse(int16_t* iBuffer, const lv_32fc_t* complexVector,
- const float scalar, unsigned int num_points)
+volk_32fc_s32f_deinterleave_real_16i_a_sse(int16_t* iBuffer,
+ const lv_32fc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- const float* complexVectorPtr = (float*)complexVector;
- int16_t* iBufferPtr = iBuffer;
+ const float* complexVectorPtr = (float*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
- __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 vScalar = _mm_set_ps1(scalar);
- __m128 cplxValue1, cplxValue2, iValue;
+ __m128 cplxValue1, cplxValue2, iValue;
- __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
- for(;number < quarterPoints; number++){
- cplxValue1 = _mm_load_ps(complexVectorPtr);
- complexVectorPtr += 4;
+ for (; number < quarterPoints; number++) {
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
- cplxValue2 = _mm_load_ps(complexVectorPtr);
- complexVectorPtr += 4;
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
- // Arrange in i1i2i3i4 format
- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
- iValue = _mm_mul_ps(iValue, vScalar);
+ iValue = _mm_mul_ps(iValue, vScalar);
- _mm_store_ps(floatBuffer, iValue);
- *iBufferPtr++ = (int16_t)(floatBuffer[0]);
- *iBufferPtr++ = (int16_t)(floatBuffer[1]);
- *iBufferPtr++ = (int16_t)(floatBuffer[2]);
- *iBufferPtr++ = (int16_t)(floatBuffer[3]);
- }
+ _mm_store_ps(floatBuffer, iValue);
+ *iBufferPtr++ = (int16_t)(floatBuffer[0]);
+ *iBufferPtr++ = (int16_t)(floatBuffer[1]);
+ *iBufferPtr++ = (int16_t)(floatBuffer[2]);
+ *iBufferPtr++ = (int16_t)(floatBuffer[3]);
+ }
- number = quarterPoints * 4;
- iBufferPtr = &iBuffer[number];
- for(; number < num_points; number++){
- *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
- complexVectorPtr++;
- }
+ number = quarterPoints * 4;
+ iBufferPtr = &iBuffer[number];
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
static inline void
-volk_32fc_s32f_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_32fc_t* complexVector,
- const float scalar, unsigned int num_points)
+volk_32fc_s32f_deinterleave_real_16i_generic(int16_t* iBuffer,
+ const lv_32fc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- const float* complexVectorPtr = (float*)complexVector;
- int16_t* iBufferPtr = iBuffer;
- unsigned int number = 0;
- for(number = 0; number < num_points; number++){
- *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
- complexVectorPtr++;
- }
+ const float* complexVectorPtr = (float*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ unsigned int number = 0;
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_u_H
#define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_u_H
-#include <volk/volk_common.h>
#include <inttypes.h>
#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void
-volk_32fc_s32f_deinterleave_real_16i_u_avx2(int16_t* iBuffer, const lv_32fc_t* complexVector,
- const float scalar, unsigned int num_points)
+volk_32fc_s32f_deinterleave_real_16i_u_avx2(int16_t* iBuffer,
+ const lv_32fc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- const float* complexVectorPtr = (float*)complexVector;
- int16_t* iBufferPtr = iBuffer;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- __m256 vScalar = _mm256_set1_ps(scalar);
+ const float* complexVectorPtr = (float*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
- __m256 cplxValue1, cplxValue2, iValue;
- __m256i a;
- __m128i b;
+ __m256 vScalar = _mm256_set1_ps(scalar);
- __m256i idx = _mm256_set_epi32(3,3,3,3,5,1,4,0);
+ __m256 cplxValue1, cplxValue2, iValue;
+ __m256i a;
+ __m128i b;
- for(;number < eighthPoints; number++){
- cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
- complexVectorPtr += 8;
+ __m256i idx = _mm256_set_epi32(3, 3, 3, 3, 5, 1, 4, 0);
- cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
- complexVectorPtr += 8;
+ for (; number < eighthPoints; number++) {
+ cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 8;
- // Arrange in i1i2i3i4 format
- iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+ cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 8;
- iValue = _mm256_mul_ps(iValue, vScalar);
+ // Arrange in i1i2i3i4 format
+ iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
- iValue = _mm256_round_ps(iValue, _MM_FROUND_TO_ZERO);
- a = _mm256_cvtps_epi32(iValue);
- a = _mm256_packs_epi32(a,a);
- a = _mm256_permutevar8x32_epi32(a,idx);
- b = _mm256_extracti128_si256(a,0);
+ iValue = _mm256_mul_ps(iValue, vScalar);
- _mm_storeu_si128((__m128i*)iBufferPtr,b);
- iBufferPtr += 8;
+ iValue = _mm256_round_ps(iValue, _MM_FROUND_TO_ZERO);
+ a = _mm256_cvtps_epi32(iValue);
+ a = _mm256_packs_epi32(a, a);
+ a = _mm256_permutevar8x32_epi32(a, idx);
+ b = _mm256_extracti128_si256(a, 0);
- }
+ _mm_storeu_si128((__m128i*)iBufferPtr, b);
+ iBufferPtr += 8;
+ }
- number = eighthPoints * 8;
- iBufferPtr = &iBuffer[number];
- for(; number < num_points; number++){
- *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
- complexVectorPtr++;
- }
+ number = eighthPoints * 8;
+ iBufferPtr = &iBuffer[number];
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_AVX2 */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32fc_s32f_magnitude_16i(int16_t* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points)
- * \endcode
+ * void volk_32fc_s32f_magnitude_16i(int16_t* magnitudeVector, const lv_32fc_t*
+ * complexVector, unsigned int num_points) \endcode
*
* \b Inputs
* \li complexVector: The complex input vector.
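As a concrete check on the arithmetic these kernels perform (note that the protokernels below also take a float scalar, even though the prototype excerpt above omits it), a single-sample reference with hypothetical values in the comment:

#include <math.h>
#include <stdint.h>

/* out = (int16_t)rintf(scalar * sqrtf(re*re + im*im)), matching the generic kernel below.
 * Example: re = 3.0f, im = 4.0f, scalar = 100.0f gives rintf(100.0f * 5.0f) = 500. */
static int16_t magnitude_16i_one_sample(float re, float im, float scalar)
{
    return (int16_t)rintf(scalar * sqrtf(re * re + im * im));
}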
#ifdef LV_HAVE_GENERIC
#include <volk/volk_common.h>
-static inline void
-volk_32fc_s32f_magnitude_16i_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32fc_s32f_magnitude_16i_generic(int16_t* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- const float* complexVectorPtr = (float*)complexVector;
- int16_t* magnitudeVectorPtr = magnitudeVector;
- unsigned int number = 0;
- for(number = 0; number < num_points; number++){
- __VOLK_VOLATILE float real = *complexVectorPtr++;
- __VOLK_VOLATILE float imag = *complexVectorPtr++;
- real *= real;
- imag *= imag;
- *magnitudeVectorPtr++ = (int16_t)rintf(scalar*sqrtf(real + imag));
- }
+ const float* complexVectorPtr = (float*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
+ unsigned int number = 0;
+ for (number = 0; number < num_points; number++) {
+ __VOLK_VOLATILE float real = *complexVectorPtr++;
+ __VOLK_VOLATILE float imag = *complexVectorPtr++;
+ real *= real;
+ imag *= imag;
+ *magnitudeVectorPtr++ = (int16_t)rintf(scalar * sqrtf(real + imag));
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a_H
#define INCLUDED_volk_32fc_s32f_magnitude_16i_a_H
-#include <volk/volk_common.h>
#include <inttypes.h>
-#include <stdio.h>
#include <math.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_32fc_s32f_magnitude_16i_a_avx2(int16_t* magnitudeVector, const lv_32fc_t* complexVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32fc_s32f_magnitude_16i_a_avx2(int16_t* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- const float* complexVectorPtr = (const float*)complexVector;
- int16_t* magnitudeVectorPtr = magnitudeVector;
+ const float* complexVectorPtr = (const float*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
- __m256 vScalar = _mm256_set1_ps(scalar);
- __m256i idx = _mm256_set_epi32(0,0,0,0,5,1,4,0);
- __m256 cplxValue1, cplxValue2, result;
- __m256i resultInt;
- __m128i resultShort;
+ __m256 vScalar = _mm256_set1_ps(scalar);
+ __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
+ __m256 cplxValue1, cplxValue2, result;
+ __m256i resultInt;
+ __m128i resultShort;
- for(;number < eighthPoints; number++){
- cplxValue1 = _mm256_load_ps(complexVectorPtr);
- complexVectorPtr += 8;
+ for (; number < eighthPoints; number++) {
+ cplxValue1 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
- cplxValue2 = _mm256_load_ps(complexVectorPtr);
- complexVectorPtr += 8;
+ cplxValue2 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
- cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
- cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
- result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+ result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
- result = _mm256_sqrt_ps(result);
+ result = _mm256_sqrt_ps(result);
- result = _mm256_mul_ps(result, vScalar);
+ result = _mm256_mul_ps(result, vScalar);
- resultInt = _mm256_cvtps_epi32(result);
- resultInt = _mm256_packs_epi32(resultInt, resultInt);
- resultInt = _mm256_permutevar8x32_epi32(resultInt, idx); //permute to compensate for shuffling in hadd and packs
- resultShort = _mm256_extracti128_si256(resultInt,0);
- _mm_store_si128((__m128i*)magnitudeVectorPtr,resultShort);
- magnitudeVectorPtr += 8;
- }
+ resultInt = _mm256_cvtps_epi32(result);
+ resultInt = _mm256_packs_epi32(resultInt, resultInt);
+ resultInt = _mm256_permutevar8x32_epi32(
+ resultInt, idx); // permute to compensate for shuffling in hadd and packs
+ resultShort = _mm256_extracti128_si256(resultInt, 0);
+ _mm_store_si128((__m128i*)magnitudeVectorPtr, resultShort);
+ magnitudeVectorPtr += 8;
+ }
- number = eighthPoints * 8;
- volk_32fc_s32f_magnitude_16i_generic(magnitudeVector+number, complexVector+number, scalar, num_points-number);
+ number = eighthPoints * 8;
+ volk_32fc_s32f_magnitude_16i_generic(
+ magnitudeVector + number, complexVector + number, scalar, num_points - number);
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
-static inline void
-volk_32fc_s32f_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32fc_s32f_magnitude_16i_a_sse3(int16_t* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- const float* complexVectorPtr = (const float*)complexVector;
- int16_t* magnitudeVectorPtr = magnitudeVector;
+ const float* complexVectorPtr = (const float*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
- __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 vScalar = _mm_set_ps1(scalar);
- __m128 cplxValue1, cplxValue2, result;
+ __m128 cplxValue1, cplxValue2, result;
- __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
- for(;number < quarterPoints; number++){
- cplxValue1 = _mm_load_ps(complexVectorPtr);
- complexVectorPtr += 4;
+ for (; number < quarterPoints; number++) {
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
- cplxValue2 = _mm_load_ps(complexVectorPtr);
- complexVectorPtr += 4;
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
- cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
- cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
- result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
- result = _mm_sqrt_ps(result);
+ result = _mm_sqrt_ps(result);
- result = _mm_mul_ps(result, vScalar);
+ result = _mm_mul_ps(result, vScalar);
- _mm_store_ps(floatBuffer, result);
- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
- }
+ _mm_store_ps(floatBuffer, result);
+ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
+ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
+ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
+ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
+ }
- number = quarterPoints * 4;
- volk_32fc_s32f_magnitude_16i_generic(magnitudeVector+number, complexVector+number, scalar, num_points-number);
+ number = quarterPoints * 4;
+ volk_32fc_s32f_magnitude_16i_generic(
+ magnitudeVector + number, complexVector + number, scalar, num_points - number);
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- const float* complexVectorPtr = (const float*)complexVector;
- int16_t* magnitudeVectorPtr = magnitudeVector;
+ const float* complexVectorPtr = (const float*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
- __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 vScalar = _mm_set_ps1(scalar);
- __m128 cplxValue1, cplxValue2, result;
- __m128 iValue, qValue;
+ __m128 cplxValue1, cplxValue2, result;
+ __m128 iValue, qValue;
- __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
- for(;number < quarterPoints; number++){
- cplxValue1 = _mm_load_ps(complexVectorPtr);
- complexVectorPtr += 4;
+ for (; number < quarterPoints; number++) {
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
- cplxValue2 = _mm_load_ps(complexVectorPtr);
- complexVectorPtr += 4;
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
- // Arrange in i1i2i3i4 format
- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
- // Arrange in q1q2q3q4 format
- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+ // Arrange in q1q2q3q4 format
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
- __VOLK_VOLATILE __m128 iValue2 = _mm_mul_ps(iValue, iValue); // Square the I values
- __VOLK_VOLATILE __m128 qValue2 = _mm_mul_ps(qValue, qValue); // Square the Q Values
+ __VOLK_VOLATILE __m128 iValue2 =
+ _mm_mul_ps(iValue, iValue); // Square the I values
+ __VOLK_VOLATILE __m128 qValue2 =
+ _mm_mul_ps(qValue, qValue); // Square the Q Values
- result = _mm_add_ps(iValue2, qValue2); // Add the I2 and Q2 values
+ result = _mm_add_ps(iValue2, qValue2); // Add the I2 and Q2 values
- result = _mm_sqrt_ps(result);
+ result = _mm_sqrt_ps(result);
- result = _mm_mul_ps(result, vScalar);
+ result = _mm_mul_ps(result, vScalar);
- _mm_store_ps(floatBuffer, result);
- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
- }
+ _mm_store_ps(floatBuffer, result);
+ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
+ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
+ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
+ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
+ }
- number = quarterPoints * 4;
- volk_32fc_s32f_magnitude_16i_generic(magnitudeVector+number, complexVector+number, scalar, num_points-number);
+ number = quarterPoints * 4;
+ volk_32fc_s32f_magnitude_16i_generic(
+ magnitudeVector + number, complexVector + number, scalar, num_points - number);
}
#endif /* LV_HAVE_SSE */
#ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_u_H
#define INCLUDED_volk_32fc_s32f_magnitude_16i_u_H
-#include <volk/volk_common.h>
#include <inttypes.h>
-#include <stdio.h>
#include <math.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_32fc_s32f_magnitude_16i_u_avx2(int16_t* magnitudeVector, const lv_32fc_t* complexVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32fc_s32f_magnitude_16i_u_avx2(int16_t* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
- const float* complexVectorPtr = (const float*)complexVector;
- int16_t* magnitudeVectorPtr = magnitudeVector;
+ const float* complexVectorPtr = (const float*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
- __m256 vScalar = _mm256_set1_ps(scalar);
- __m256i idx = _mm256_set_epi32(0,0,0,0,5,1,4,0);
- __m256 cplxValue1, cplxValue2, result;
- __m256i resultInt;
- __m128i resultShort;
+ __m256 vScalar = _mm256_set1_ps(scalar);
+ __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
+ __m256 cplxValue1, cplxValue2, result;
+ __m256i resultInt;
+ __m128i resultShort;
- for(;number < eighthPoints; number++){
- cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
- complexVectorPtr += 8;
+ for (; number < eighthPoints; number++) {
+ cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 8;
- cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
- complexVectorPtr += 8;
+ cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 8;
- cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
- cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
- result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+ result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
- result = _mm256_sqrt_ps(result);
+ result = _mm256_sqrt_ps(result);
- result = _mm256_mul_ps(result, vScalar);
+ result = _mm256_mul_ps(result, vScalar);
- resultInt = _mm256_cvtps_epi32(result);
- resultInt = _mm256_packs_epi32(resultInt, resultInt);
- resultInt = _mm256_permutevar8x32_epi32(resultInt, idx); //permute to compensate for shuffling in hadd and packs
- resultShort = _mm256_extracti128_si256(resultInt,0);
- _mm_storeu_si128((__m128i*)magnitudeVectorPtr,resultShort);
- magnitudeVectorPtr += 8;
- }
+ resultInt = _mm256_cvtps_epi32(result);
+ resultInt = _mm256_packs_epi32(resultInt, resultInt);
+ resultInt = _mm256_permutevar8x32_epi32(
+ resultInt, idx); // permute to compensate for shuffling in hadd and packs
+ resultShort = _mm256_extracti128_si256(resultInt, 0);
+ _mm_storeu_si128((__m128i*)magnitudeVectorPtr, resultShort);
+ magnitudeVectorPtr += 8;
+ }
- number = eighthPoints * 8;
- volk_32fc_s32f_magnitude_16i_generic(magnitudeVector+number, complexVector+number, scalar, num_points-number);
+ number = eighthPoints * 8;
+ volk_32fc_s32f_magnitude_16i_generic(
+ magnitudeVector + number, complexVector + number, scalar, num_points - number);
}
#endif /* LV_HAVE_AVX2 */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32fc_s32f_power_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points)
- * \endcode
+ * void volk_32fc_s32f_power_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
+ * float power, unsigned int num_points) \endcode
*
* \b Inputs
* \li aVector: The complex input vector.
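A minimal calling sketch, assuming the prototype above is exposed through the top-level <volk/volk.h> dispatcher; the exponent value is illustrative.

#include <volk/volk.h> // assumed dispatcher entry point

static void power_demo(lv_32fc_t* out, const lv_32fc_t* samples, unsigned int num_points)
{
    // Raise every complex sample to the 2nd power (any real exponent is allowed).
    volk_32fc_s32f_power_32fc(out, samples, 2.0f, num_points);
}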
#define INCLUDED_volk_32fc_s32f_power_32fc_a_H
#include <inttypes.h>
-#include <stdio.h>
#include <math.h>
+#include <stdio.h>
//! raise a complex float to a real float power
-static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a(const lv_32fc_t exp, const float power)
+static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a(const lv_32fc_t exp,
+ const float power)
{
- const float arg = power*atan2f(lv_creal(exp), lv_cimag(exp));
- const float mag = powf(lv_creal(exp)*lv_creal(exp) + lv_cimag(exp)*lv_cimag(exp), power/2);
- return mag*lv_cmake(-cosf(arg), sinf(arg));
+ const float arg = power * atan2f(lv_creal(exp), lv_cimag(exp));
+ const float mag =
+ powf(lv_creal(exp) * lv_creal(exp) + lv_cimag(exp) * lv_cimag(exp), power / 2);
+ return mag * lv_cmake(-cosf(arg), sinf(arg));
}
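For comparison, the textbook polar identity is z^p = |z|^p * (cos(p*theta) + i*sin(p*theta)) with theta = atan2f(im, re); the helper above passes (re, im) to atan2f and uses -cosf(arg) for the real part, so its convention differs from that identity. A cross-check sketch against the C99 cpowf(), assuming lv_32fc_t interoperates with float complex as lv_creal()/lv_cimag() suggest:

#include <complex.h>

/* Reference value for raising one complex sample to a real power via the C library. */
static float complex power_reference(float re, float im, float p)
{
    return cpowf(re + im * I, p);
}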
#ifdef LV_HAVE_SSE
#include <simdmath.h>
#endif /* LV_HAVE_LIB_SIMDMATH */
-static inline void
-volk_32fc_s32f_power_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const float power, unsigned int num_points)
+static inline void volk_32fc_s32f_power_32fc_a_sse(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const float power,
+ unsigned int num_points)
{
- unsigned int number = 0;
+ unsigned int number = 0;
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
#ifdef LV_HAVE_LIB_SIMDMATH
- const unsigned int quarterPoints = num_points / 4;
- __m128 vPower = _mm_set_ps1(power);
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 vPower = _mm_set_ps1(power);
- __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue;
- for(;number < quarterPoints; number++){
+ __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue;
+ for (; number < quarterPoints; number++) {
- cplxValue1 = _mm_load_ps((float*)aPtr);
- aPtr += 2;
+ cplxValue1 = _mm_load_ps((float*)aPtr);
+ aPtr += 2;
- cplxValue2 = _mm_load_ps((float*)aPtr);
- aPtr += 2;
+ cplxValue2 = _mm_load_ps((float*)aPtr);
+ aPtr += 2;
- // Convert to polar coordinates
+ // Convert to polar coordinates
- // Arrange in i1i2i3i4 format
- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
- // Arrange in q1q2q3q4 format
- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+ // Arrange in q1q2q3q4 format
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
- phase = atan2f4(qValue, iValue); // Calculate the Phase
+ phase = atan2f4(qValue, iValue); // Calculate the Phase
- magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iValue, iValue), _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square rooting the added I2 and Q2 values
+ magnitude = _mm_sqrt_ps(
+ _mm_add_ps(_mm_mul_ps(iValue, iValue),
+ _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square
+ // rooting the added I2 and Q2 values
- // Now calculate the power of the polar coordinate data
- magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power
+ // Now calculate the power of the polar coordinate data
+ magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power
- phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power
+ phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power
- // Convert back to cartesian coordinates
- iValue = _mm_mul_ps( cosf4(phase), magnitude); // Multiply the cos of the phase by the magnitude
- qValue = _mm_mul_ps( sinf4(phase), magnitude); // Multiply the sin of the phase by the magnitude
+ // Convert back to cartesian coordinates
+ iValue = _mm_mul_ps(cosf4(phase),
+ magnitude); // Multiply the cos of the phase by the magnitude
+ qValue = _mm_mul_ps(sinf4(phase),
+ magnitude); // Multiply the sin of the phase by the magnitude
- cplxValue1 = _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values
- cplxValue2 = _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values
+ cplxValue1 =
+ _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values
+ cplxValue2 =
+ _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values
- _mm_store_ps((float*)cPtr,cplxValue1); // Store the results back into the C container
+ _mm_store_ps((float*)cPtr,
+ cplxValue1); // Store the results back into the C container
- cPtr += 2;
+ cPtr += 2;
- _mm_store_ps((float*)cPtr,cplxValue2); // Store the results back into the C container
+ _mm_store_ps((float*)cPtr,
+ cplxValue2); // Store the results back into the C container
- cPtr += 2;
- }
+ cPtr += 2;
+ }
- number = quarterPoints * 4;
+ number = quarterPoints * 4;
#endif /* LV_HAVE_LIB_SIMDMATH */
- for(;number < num_points; number++){
- *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power);
- }
+ for (; number < num_points; number++) {
+ *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power);
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32fc_s32f_power_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const float power, unsigned int num_points)
+static inline void volk_32fc_s32f_power_32fc_generic(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const float power,
+ unsigned int num_points)
{
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
- unsigned int number = 0;
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ unsigned int number = 0;
- for(number = 0; number < num_points; number++){
- *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power);
- }
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power);
+ }
}
#endif /* LV_HAVE_GENERIC */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32fc_s32f_power_spectrum_32f(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points)
- * \endcode
+ * void volk_32fc_s32f_power_spectrum_32f(float* logPowerOutput, const lv_32fc_t*
+ * complexFFTInput, const float normalizationFactor, unsigned int num_points) \endcode
*
* \b Inputs
* \li complexFFTInput The complex data output from the FFT point.
- * \li normalizationFactor: This value is divided against all the input values before the power is calculated.
- * \li num_points: The number of fft data points.
+ * \li normalizationFactor: This value is divided against all the input values before the
+ * power is calculated. \li num_points: The number of fft data points.
*
* \b Outputs
* \li logPowerOutput: The 10.0 * log10(r*r + i*i) for each data point.
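A single-bin reference for the output documented above; the + 1e-20 term mirrors the kernels below and keeps log10f() finite for a zero bin, and the 50 ohm dBm comments in those kernels rest on the one-step simplification 2 * 50.0 * 0.001 = 0.1, i.e. dividing v^2 by the load term is the same as multiplying it by 10. A sketch, not a dispatcher call:

#include <math.h>

/* out = 10 * log10((re/norm)^2 + (im/norm)^2 + 1e-20), matching the generic kernel. */
static float power_spectrum_one_bin(float re, float im, float normalizationFactor)
{
    const float inv = 1.0f / normalizationFactor;
    re *= inv;
    im *= inv;
    return 10.0f * log10f(re * re + im * im + 1e-20f);
}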
#define INCLUDED_volk_32fc_s32f_power_spectrum_32f_a_H
#include <inttypes.h>
-#include <stdio.h>
#include <math.h>
+#include <stdio.h>
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#endif /* LV_HAVE_LIB_SIMDMATH */
static inline void
-volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput,
- const float normalizationFactor, unsigned int num_points)
+volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutput,
+ const lv_32fc_t* complexFFTInput,
+ const float normalizationFactor,
+ unsigned int num_points)
{
- const float* inputPtr = (const float*)complexFFTInput;
- float* destPtr = logPowerOutput;
- uint64_t number = 0;
- const float iNormalizationFactor = 1.0 / normalizationFactor;
+ const float* inputPtr = (const float*)complexFFTInput;
+ float* destPtr = logPowerOutput;
+ uint64_t number = 0;
+ const float iNormalizationFactor = 1.0 / normalizationFactor;
#ifdef LV_HAVE_LIB_SIMDMATH
- __m128 magScalar = _mm_set_ps1(10.0);
- magScalar = _mm_div_ps(magScalar, logf4(magScalar));
+ __m128 magScalar = _mm_set_ps1(10.0);
+ magScalar = _mm_div_ps(magScalar, logf4(magScalar));
- __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor);
+ __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor);
- __m128 power;
- __m128 input1, input2;
- const uint64_t quarterPoints = num_points / 4;
- for(;number < quarterPoints; number++){
- // Load the complex values
- input1 =_mm_load_ps(inputPtr);
- inputPtr += 4;
- input2 =_mm_load_ps(inputPtr);
- inputPtr += 4;
+ __m128 power;
+ __m128 input1, input2;
+ const uint64_t quarterPoints = num_points / 4;
+ for (; number < quarterPoints; number++) {
+ // Load the complex values
+ input1 = _mm_load_ps(inputPtr);
+ inputPtr += 4;
+ input2 = _mm_load_ps(inputPtr);
+ inputPtr += 4;
- // Apply the normalization factor
- input1 = _mm_mul_ps(input1, invNormalizationFactor);
- input2 = _mm_mul_ps(input2, invNormalizationFactor);
+ // Apply the normalization factor
+ input1 = _mm_mul_ps(input1, invNormalizationFactor);
+ input2 = _mm_mul_ps(input2, invNormalizationFactor);
- // Multiply each value by itself
- // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
- input1 = _mm_mul_ps(input1, input1);
- // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
- input2 = _mm_mul_ps(input2, input2);
+ // Multiply each value by itself
+ // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
+ input1 = _mm_mul_ps(input1, input1);
+ // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
+ input2 = _mm_mul_ps(input2, input2);
- // Horizontal add, to add (r*r) + (i*i) for each complex value
- // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
- power = _mm_hadd_ps(input1, input2);
+ // Horizontal add, to add (r*r) + (i*i) for each complex value
+ // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
+ power = _mm_hadd_ps(input1, input2);
- // Calculate the natural log power
- power = logf4(power);
+ // Calculate the natural log power
+ power = logf4(power);
- // Convert to log10 and multiply by 10.0
- power = _mm_mul_ps(power, magScalar);
+ // Convert to log10 and multiply by 10.0
+ power = _mm_mul_ps(power, magScalar);
- // Store the floating point results
- _mm_store_ps(destPtr, power);
+ // Store the floating point results
+ _mm_store_ps(destPtr, power);
- destPtr += 4;
- }
+ destPtr += 4;
+ }
- number = quarterPoints*4;
+ number = quarterPoints * 4;
#endif /* LV_HAVE_LIB_SIMDMATH */
- // Calculate the FFT for any remaining points
-
- for(; number < num_points; number++){
- // Calculate dBm
- // 50 ohm load assumption
- // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
- // 75 ohm load assumption
- // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+ // Calculate the FFT for any remaining points
- const float real = *inputPtr++ * iNormalizationFactor;
- const float imag = *inputPtr++ * iNormalizationFactor;
+ for (; number < num_points; number++) {
+ // Calculate dBm
+ // 50 ohm load assumption
+ // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+ // 75 ohm load assumption
+ // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
- *destPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20);
+ const float real = *inputPtr++ * iNormalizationFactor;
+ const float imag = *inputPtr++ * iNormalizationFactor;
- destPtr++;
- }
+ *destPtr = 10.0 * log10f(((real * real) + (imag * imag)) + 1e-20);
+ destPtr++;
+ }
}
#endif /* LV_HAVE_SSE3 */
#include <volk/volk_neon_intrinsics.h>
static inline void
-volk_32fc_s32f_power_spectrum_32f_neon(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points)
+volk_32fc_s32f_power_spectrum_32f_neon(float* logPowerOutput,
+ const lv_32fc_t* complexFFTInput,
+ const float normalizationFactor,
+ unsigned int num_points)
{
float* logPowerOutputPtr = logPowerOutput;
const lv_32fc_t* complexFFTInputPtr = complexFFTInput;
float32x4x2_t fft_vec;
float32x4_t log_pwr_vec;
float32x4_t mag_squared_vec;
-
+
const float inv_ln10_10 = 4.34294481903f; // 10.0/ln(10.)
-
- for(number = 0; number < quarter_points; number++) {
+
+ for (number = 0; number < quarter_points; number++) {
// Load
fft_vec = vld2q_f32((float*)complexFFTInputPtr);
// Prefetch next 4
- __VOLK_PREFETCH(complexFFTInputPtr+4);
+ __VOLK_PREFETCH(complexFFTInputPtr + 4);
// Normalize
fft_vec.val[0] = vmulq_n_f32(fft_vec.val[0], iNormalizationFactor);
fft_vec.val[1] = vmulq_n_f32(fft_vec.val[1], iNormalizationFactor);
// Store
vst1q_f32(logPowerOutputPtr, log_pwr_vec);
// Move pointers ahead
- complexFFTInputPtr+=4;
- logPowerOutputPtr+=4;
+ complexFFTInputPtr += 4;
+ logPowerOutputPtr += 4;
}
-
+
// deal with the rest
- for(number = quarter_points * 4; number < num_points; number++) {
+ for (number = quarter_points * 4; number < num_points; number++) {
const float real = lv_creal(*complexFFTInputPtr) * iNormalizationFactor;
const float imag = lv_cimag(*complexFFTInputPtr) * iNormalizationFactor;
*logPowerOutputPtr = 10.0 * log10f(((real * real) + (imag * imag)) + 1e-20);
#ifdef LV_HAVE_GENERIC
static inline void
-volk_32fc_s32f_power_spectrum_32f_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput,
- const float normalizationFactor, unsigned int num_points)
+volk_32fc_s32f_power_spectrum_32f_generic(float* logPowerOutput,
+ const lv_32fc_t* complexFFTInput,
+ const float normalizationFactor,
+ unsigned int num_points)
{
- // Calculate the Power of the complex point
- const float* inputPtr = (float*)complexFFTInput;
- float* realFFTDataPointsPtr = logPowerOutput;
- const float iNormalizationFactor = 1.0 / normalizationFactor;
- unsigned int point;
- for(point = 0; point < num_points; point++){
- // Calculate dBm
- // 50 ohm load assumption
- // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
- // 75 ohm load assumption
- // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
-
- const float real = *inputPtr++ * iNormalizationFactor;
- const float imag = *inputPtr++ * iNormalizationFactor;
-
- *realFFTDataPointsPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20);
- realFFTDataPointsPtr++;
- }
+ // Calculate the Power of the complex point
+ const float* inputPtr = (float*)complexFFTInput;
+ float* realFFTDataPointsPtr = logPowerOutput;
+ const float iNormalizationFactor = 1.0 / normalizationFactor;
+ unsigned int point;
+ for (point = 0; point < num_points; point++) {
+ // Calculate dBm
+ // 50 ohm load assumption
+ // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+ // 75 ohm load assumption
+ // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+
+ const float real = *inputPtr++ * iNormalizationFactor;
+ const float imag = *inputPtr++ * iNormalizationFactor;
+
+ *realFFTDataPointsPtr = 10.0 * log10f(((real * real) + (imag * imag)) + 1e-20);
+ realFFTDataPointsPtr++;
+ }
}
#endif /* LV_HAVE_GENERIC */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32fc_s32f_x2_power_spectral_density_32f(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points)
- * \endcode
+ * void volk_32fc_s32f_x2_power_spectral_density_32f(float* logPowerOutput, const
+ * lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned
+ * int num_points) \endcode
*
* \b Inputs
* \li complexFFTInput The complex data output from the FFT point.
- * \li normalizationFactor: This value is divided against all the input values before the power is calculated.
- * \li rbw: The resolution bandwidth of the fft spectrum
- * \li num_points: The number of fft data points.
+ * \li normalizationFactor: This value is divided against all the input values before the
+ * power is calculated. \li rbw: The resolution bandwidth of the fft spectrum \li
+ * num_points: The number of fft data points.
*
* \b Outputs
* \li logPowerOutput: The 10.0 * log10((r*r + i*i)/RBW) for each data point.
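A minimal calling sketch for the PSD kernel, assuming the prototype above is reachable through the <volk/volk.h> dispatcher; the normalization and RBW values are illustrative.

#include <volk/volk.h> // assumed dispatcher entry point

static void psd_demo(float* psd, const lv_32fc_t* fft_bins, unsigned int fft_len)
{
    const float normalization = (float)fft_len; // e.g. undo an unscaled forward FFT
    const float rbw = 1000.0f;                  // resolution bandwidth in Hz (illustrative)
    // psd[k] = 10 * log10(((re/norm)^2 + (im/norm)^2) / rbw), per the documentation above.
    volk_32fc_s32f_x2_power_spectral_density_32f(psd, fft_bins, normalization, rbw, fft_len);
}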
#define INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a_H
#include <inttypes.h>
-#include <stdio.h>
#include <math.h>
+#include <stdio.h>
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#endif /* LV_HAVE_LIB_SIMDMATH */
static inline void
-volk_32fc_s32f_x2_power_spectral_density_32f_a_avx(float* logPowerOutput, const lv_32fc_t* complexFFTInput,
- const float normalizationFactor, const float rbw,
+volk_32fc_s32f_x2_power_spectral_density_32f_a_avx(float* logPowerOutput,
+ const lv_32fc_t* complexFFTInput,
+ const float normalizationFactor,
+ const float rbw,
unsigned int num_points)
{
- const float* inputPtr = (const float*)complexFFTInput;
- float* destPtr = logPowerOutput;
- uint64_t number = 0;
- const float iRBW = 1.0 / rbw;
- const float iNormalizationFactor = 1.0 / normalizationFactor;
+ const float* inputPtr = (const float*)complexFFTInput;
+ float* destPtr = logPowerOutput;
+ uint64_t number = 0;
+ const float iRBW = 1.0 / rbw;
+ const float iNormalizationFactor = 1.0 / normalizationFactor;
#ifdef LV_HAVE_LIB_SIMDMATH
- __m256 magScalar = _mm256_set1_ps(10.0);
- magScalar = _mm256_div_ps(magScalar, logf4(magScalar));
+ __m256 magScalar = _mm256_set1_ps(10.0);
+ magScalar = _mm256_div_ps(magScalar, logf4(magScalar));
- __m256 invRBW = _mm256_set1_ps(iRBW);
+ __m256 invRBW = _mm256_set1_ps(iRBW);
- __m256 invNormalizationFactor = _mm256_set1_ps(iNormalizationFactor);
+ __m256 invNormalizationFactor = _mm256_set1_ps(iNormalizationFactor);
- __m256 power;
- __m256 input1, input2;
- const uint64_t eighthPoints = num_points / 8;
- for(;number < eighthPoints; number++){
- // Load the complex values
- input1 =_mm256_load_ps(inputPtr);
- inputPtr += 8;
- input2 =_mm256_load_ps(inputPtr);
- inputPtr += 8;
+ __m256 power;
+ __m256 input1, input2;
+ const uint64_t eighthPoints = num_points / 8;
+ for (; number < eighthPoints; number++) {
+ // Load the complex values
+ input1 = _mm256_load_ps(inputPtr);
+ inputPtr += 8;
+ input2 = _mm256_load_ps(inputPtr);
+ inputPtr += 8;
- // Apply the normalization factor
- input1 = _mm256_mul_ps(input1, invNormalizationFactor);
- input2 = _mm256_mul_ps(input2, invNormalizationFactor);
+ // Apply the normalization factor
+ input1 = _mm256_mul_ps(input1, invNormalizationFactor);
+ input2 = _mm256_mul_ps(input2, invNormalizationFactor);
- // Multiply each value by itself
- // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
- input1 = _mm256_mul_ps(input1, input1);
- // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
- input2 = _mm256_mul_ps(input2, input2);
+ // Multiply each value by itself
+ // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
+ input1 = _mm256_mul_ps(input1, input1);
+ // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
+ input2 = _mm256_mul_ps(input2, input2);
- // Horizontal add, to add (r*r) + (i*i) for each complex value
- // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
- inputVal1 = _mm256_permute2f128_ps(input1, input2, 0x20);
- inputVal2 = _mm256_permute2f128_ps(input1, input2, 0x31);
+ // Horizontal add, to add (r*r) + (i*i) for each complex value
+ // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
+ inputVal1 = _mm256_permute2f128_ps(input1, input2, 0x20);
+ inputVal2 = _mm256_permute2f128_ps(input1, input2, 0x31);
- power = _mm256_hadd_ps(inputVal1, inputVal2);
+ power = _mm256_hadd_ps(inputVal1, inputVal2);
- // Divide by the rbw
- power = _mm256_mul_ps(power, invRBW);
+ // Divide by the rbw
+ power = _mm256_mul_ps(power, invRBW);
- // Calculate the natural log power
- power = logf4(power);
+ // Calculate the natural log power
+ power = logf4(power);
- // Convert to log10 and multiply by 10.0
- power = _mm256_mul_ps(power, magScalar);
+ // Convert to log10 and multiply by 10.0
+ power = _mm256_mul_ps(power, magScalar);
- // Store the floating point results
- _mm256_store_ps(destPtr, power);
+ // Store the floating point results
+ _mm256_store_ps(destPtr, power);
- destPtr += 8;
- }
+ destPtr += 8;
+ }
- number = eighthPoints*8;
+ number = eighthPoints * 8;
#endif /* LV_HAVE_LIB_SIMDMATH */
- // Calculate the FFT for any remaining points
- for(; number < num_points; number++){
- // Calculate dBm
- // 50 ohm load assumption
- // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
- // 75 ohm load assumption
- // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
-
- const float real = *inputPtr++ * iNormalizationFactor;
- const float imag = *inputPtr++ * iNormalizationFactor;
-
- *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW);
- destPtr++;
- }
-
+ // Calculate the FFT for any remaining points
+ for (; number < num_points; number++) {
+ // Calculate dBm
+ // 50 ohm load assumption
+ // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+ // 75 ohm load assumption
+ // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+
+ const float real = *inputPtr++ * iNormalizationFactor;
+ const float imag = *inputPtr++ * iNormalizationFactor;
+
+ *destPtr = 10.0 * log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW);
+ destPtr++;
+ }
}
#endif /* LV_HAVE_AVX */
#include <pmmintrin.h>
-
#ifdef LV_HAVE_LIB_SIMDMATH
#include <simdmath.h>
#endif /* LV_HAVE_LIB_SIMDMATH */
static inline void
-volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput,
- const float normalizationFactor, const float rbw,
+volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* logPowerOutput,
+ const lv_32fc_t* complexFFTInput,
+ const float normalizationFactor,
+ const float rbw,
unsigned int num_points)
{
- const float* inputPtr = (const float*)complexFFTInput;
- float* destPtr = logPowerOutput;
- uint64_t number = 0;
- const float iRBW = 1.0 / rbw;
- const float iNormalizationFactor = 1.0 / normalizationFactor;
+ const float* inputPtr = (const float*)complexFFTInput;
+ float* destPtr = logPowerOutput;
+ uint64_t number = 0;
+ const float iRBW = 1.0 / rbw;
+ const float iNormalizationFactor = 1.0 / normalizationFactor;
#ifdef LV_HAVE_LIB_SIMDMATH
- __m128 magScalar = _mm_set_ps1(10.0);
- magScalar = _mm_div_ps(magScalar, logf4(magScalar));
+ __m128 magScalar = _mm_set_ps1(10.0);
+ magScalar = _mm_div_ps(magScalar, logf4(magScalar));
- __m128 invRBW = _mm_set_ps1(iRBW);
+ __m128 invRBW = _mm_set_ps1(iRBW);
- __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor);
+ __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor);
- __m128 power;
- __m128 input1, input2;
- const uint64_t quarterPoints = num_points / 4;
- for(;number < quarterPoints; number++){
- // Load the complex values
- input1 =_mm_load_ps(inputPtr);
- inputPtr += 4;
- input2 =_mm_load_ps(inputPtr);
- inputPtr += 4;
+ __m128 power;
+ __m128 input1, input2;
+ const uint64_t quarterPoints = num_points / 4;
+ for (; number < quarterPoints; number++) {
+ // Load the complex values
+ input1 = _mm_load_ps(inputPtr);
+ inputPtr += 4;
+ input2 = _mm_load_ps(inputPtr);
+ inputPtr += 4;
- // Apply the normalization factor
- input1 = _mm_mul_ps(input1, invNormalizationFactor);
- input2 = _mm_mul_ps(input2, invNormalizationFactor);
+ // Apply the normalization factor
+ input1 = _mm_mul_ps(input1, invNormalizationFactor);
+ input2 = _mm_mul_ps(input2, invNormalizationFactor);
- // Multiply each value by itself
- // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
- input1 = _mm_mul_ps(input1, input1);
- // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
- input2 = _mm_mul_ps(input2, input2);
+ // Multiply each value by itself
+ // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
+ input1 = _mm_mul_ps(input1, input1);
+ // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
+ input2 = _mm_mul_ps(input2, input2);
- // Horizontal add, to add (r*r) + (i*i) for each complex value
- // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
- power = _mm_hadd_ps(input1, input2);
+ // Horizontal add, to add (r*r) + (i*i) for each complex value
+ // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
+ power = _mm_hadd_ps(input1, input2);
- // Divide by the rbw
- power = _mm_mul_ps(power, invRBW);
+ // Divide by the rbw
+ power = _mm_mul_ps(power, invRBW);
- // Calculate the natural log power
- power = logf4(power);
+ // Calculate the natural log power
+ power = logf4(power);
- // Convert to log10 and multiply by 10.0
- power = _mm_mul_ps(power, magScalar);
+ // Convert to log10 and multiply by 10.0
+ power = _mm_mul_ps(power, magScalar);
- // Store the floating point results
- _mm_store_ps(destPtr, power);
+ // Store the floating point results
+ _mm_store_ps(destPtr, power);
- destPtr += 4;
- }
+ destPtr += 4;
+ }
- number = quarterPoints*4;
+ number = quarterPoints * 4;
#endif /* LV_HAVE_LIB_SIMDMATH */
- // Calculate the FFT for any remaining points
- for(; number < num_points; number++){
- // Calculate dBm
- // 50 ohm load assumption
- // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
- // 75 ohm load assumption
- // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
-
- const float real = *inputPtr++ * iNormalizationFactor;
- const float imag = *inputPtr++ * iNormalizationFactor;
-
- *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW);
- destPtr++;
- }
-
+ // Calculate the FFT for any remaining points
+ for (; number < num_points; number++) {
+ // Calculate dBm
+ // 50 ohm load assumption
+ // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+ // 75 ohm load assumption
+ // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+
+ const float real = *inputPtr++ * iNormalizationFactor;
+ const float imag = *inputPtr++ * iNormalizationFactor;
+
+ *destPtr = 10.0 * log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW);
+ destPtr++;
+ }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_GENERIC
static inline void
-volk_32fc_s32f_x2_power_spectral_density_32f_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput,
- const float normalizationFactor, const float rbw,
+volk_32fc_s32f_x2_power_spectral_density_32f_generic(float* logPowerOutput,
+ const lv_32fc_t* complexFFTInput,
+ const float normalizationFactor,
+ const float rbw,
unsigned int num_points)
{
- // Calculate the Power of the complex point
- const float* inputPtr = (float*)complexFFTInput;
- float* realFFTDataPointsPtr = logPowerOutput;
- unsigned int point;
- const float invRBW = 1.0 / rbw;
- const float iNormalizationFactor = 1.0 / normalizationFactor;
-
- for(point = 0; point < num_points; point++){
- // Calculate dBm
- // 50 ohm load assumption
- // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
- // 75 ohm load assumption
- // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
-
- const float real = *inputPtr++ * iNormalizationFactor;
- const float imag = *inputPtr++ * iNormalizationFactor;
-
- *realFFTDataPointsPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW);
-
- realFFTDataPointsPtr++;
- }
+ // Calculate the Power of the complex point
+ const float* inputPtr = (float*)complexFFTInput;
+ float* realFFTDataPointsPtr = logPowerOutput;
+ unsigned int point;
+ const float invRBW = 1.0 / rbw;
+ const float iNormalizationFactor = 1.0 / normalizationFactor;
+
+ for (point = 0; point < num_points; point++) {
+ // Calculate dBm
+ // 50 ohm load assumption
+ // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+ // 75 ohm load assumption
+ // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+
+ const float real = *inputPtr++ * iNormalizationFactor;
+ const float imag = *inputPtr++ * iNormalizationFactor;
+
+ *realFFTDataPointsPtr =
+ 10.0 * log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW);
+
+ realFFTDataPointsPtr++;
+ }
}
#endif /* LV_HAVE_GENERIC */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32fc_s32fc_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points);
- * \endcode
+ * void volk_32fc_s32fc_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
+ * lv_32fc_t scalar, unsigned int num_points); \endcode
*
* \b Inputs
* \li aVector: The input vector to be multiplied.
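A typical use of this kernel is rotating a block of samples by a constant phase; a minimal sketch, assuming the <volk/volk.h> dispatcher and lv_cmake() from <volk/volk_complex.h> (included by this file), with an illustrative angle.

#include <math.h>
#include <volk/volk.h> // assumed dispatcher entry point

static void rotate_by_constant(lv_32fc_t* out, const lv_32fc_t* in, unsigned int num_points)
{
    const float phi = 0.25f;                              // rotation angle in radians
    const lv_32fc_t rot = lv_cmake(cosf(phi), sinf(phi)); // unit-magnitude scalar e^{j*phi}
    volk_32fc_s32fc_multiply_32fc(out, in, rot, num_points);
}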
#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
#define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
+#include <float.h>
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>
-#include <float.h>
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>
-static inline void volk_32fc_s32fc_multiply_32fc_u_avx_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+static inline void volk_32fc_s32fc_multiply_32fc_u_avx_fma(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t scalar,
+ unsigned int num_points)
+{
unsigned int number = 0;
unsigned int i = 0;
const unsigned int quarterPoints = num_points / 4;
yl = _mm256_set1_ps(lv_creal(scalar));
yh = _mm256_set1_ps(lv_cimag(scalar));
- for(;number < quarterPoints; number++){
- x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ for (; number < quarterPoints; number++) {
+ x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
- tmp1 = x;
+ tmp1 = x;
- x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
- z = _mm256_fmaddsub_ps(tmp1, yl,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z = _mm256_fmaddsub_ps(
+ tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
- _mm256_storeu_ps((float*)c,z); // Store the results back into the C container
+ _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
- a += 4;
- c += 4;
+ a += 4;
+ c += 4;
}
- for(i = num_points-isodd; i < num_points; i++) {
+ for (i = num_points - isodd; i < num_points; i++) {
*c++ = (*a++) * scalar;
}
-
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t scalar,
+ unsigned int num_points)
+{
unsigned int number = 0;
unsigned int i = 0;
const unsigned int quarterPoints = num_points / 4;
yl = _mm256_set1_ps(lv_creal(scalar));
yh = _mm256_set1_ps(lv_cimag(scalar));
- for(;number < quarterPoints; number++){
- x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ for (; number < quarterPoints; number++) {
+ x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
- tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+ tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
- x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
- z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z = _mm256_addsub_ps(tmp1,
+ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
- _mm256_storeu_ps((float*)c,z); // Store the results back into the C container
+ _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
- a += 4;
- c += 4;
+ a += 4;
+ c += 4;
}
- for(i = num_points-isodd; i < num_points; i++) {
+ for (i = num_points - isodd; i < num_points; i++) {
*c++ = (*a++) * scalar;
}
-
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
-static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
- unsigned int number = 0;
+static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
const unsigned int halfPoints = num_points / 2;
__m128 x, yl, yh, z, tmp1, tmp2;
yl = _mm_set_ps1(lv_creal(scalar));
yh = _mm_set_ps1(lv_cimag(scalar));
- for(;number < halfPoints; number++){
+ for (; number < halfPoints; number++) {
- x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
- tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+ tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
- x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+ x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
- tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+ tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
- z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z = _mm_addsub_ps(tmp1,
+ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
- _mm_storeu_ps((float*)c,z); // Store the results back into the C container
+ _mm_storeu_ps((float*)c, z); // Store the results back into the C container
- a += 2;
- c += 2;
+ a += 2;
+ c += 2;
}
- if((num_points % 2) != 0) {
- *c = (*a) * scalar;
+ if ((num_points % 2) != 0) {
+ *c = (*a) * scalar;
}
}
#endif /* LV_HAVE_SSE */
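For readers following the shuffle/addsub pattern in the AVX and SSE3 protokernels above: with a sample a + bi and the scalar c + di, the multiply against the duplicated real part gives tmp1 = (a*c, b*c), the real/imaginary swap followed by the multiply against the duplicated imaginary part gives tmp2 = (b*d, a*d), and addsub (subtract in the even lane, add in the odd lane) produces (a*c - b*d, b*c + a*d), which is exactly the complex product (a + bi)(c + di) = (ac - bd) + (ad + bc)i; the FMA variant simply fuses the first multiply into the addsub.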
#ifdef LV_HAVE_GENERIC
-static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t scalar,
+ unsigned int num_points)
+{
lv_32fc_t* cPtr = cVector;
const lv_32fc_t* aPtr = aVector;
unsigned int number = num_points;
// unwrap loop
- while (number >= 8){
- *cPtr++ = (*aPtr++) * scalar;
- *cPtr++ = (*aPtr++) * scalar;
- *cPtr++ = (*aPtr++) * scalar;
- *cPtr++ = (*aPtr++) * scalar;
- *cPtr++ = (*aPtr++) * scalar;
- *cPtr++ = (*aPtr++) * scalar;
- *cPtr++ = (*aPtr++) * scalar;
- *cPtr++ = (*aPtr++) * scalar;
- number -= 8;
+ while (number >= 8) {
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ number -= 8;
}
// clean up any remaining
while (number-- > 0)
- *cPtr++ = *aPtr++ * scalar;
+ *cPtr++ = *aPtr++ * scalar;
}
#endif /* LV_HAVE_GENERIC */
#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
#define INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
+#include <float.h>
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>
-#include <float.h>
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>
-static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t scalar,
+ unsigned int num_points)
+{
unsigned int number = 0;
unsigned int i = 0;
const unsigned int quarterPoints = num_points / 4;
yl = _mm256_set1_ps(lv_creal(scalar));
yh = _mm256_set1_ps(lv_cimag(scalar));
- for(;number < quarterPoints; number++){
- x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ for (; number < quarterPoints; number++) {
+ x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
- tmp1 = x;
+ tmp1 = x;
- x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
- z = _mm256_fmaddsub_ps(tmp1, yl,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z = _mm256_fmaddsub_ps(
+ tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
- _mm256_store_ps((float*)c,z); // Store the results back into the C container
+ _mm256_store_ps((float*)c, z); // Store the results back into the C container
- a += 4;
- c += 4;
+ a += 4;
+ c += 4;
}
- for(i = num_points-isodd; i < num_points; i++) {
+ for (i = num_points - isodd; i < num_points; i++) {
*c++ = (*a++) * scalar;
}
-
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t scalar,
+ unsigned int num_points)
+{
unsigned int number = 0;
unsigned int i = 0;
const unsigned int quarterPoints = num_points / 4;
yl = _mm256_set1_ps(lv_creal(scalar));
yh = _mm256_set1_ps(lv_cimag(scalar));
- for(;number < quarterPoints; number++){
- x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ for (; number < quarterPoints; number++) {
+ x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
- tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+ tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
- x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
- z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z = _mm256_addsub_ps(tmp1,
+ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
- _mm256_store_ps((float*)c,z); // Store the results back into the C container
+ _mm256_store_ps((float*)c, z); // Store the results back into the C container
- a += 4;
- c += 4;
+ a += 4;
+ c += 4;
}
- for(i = num_points-isodd; i < num_points; i++) {
+ for (i = num_points - isodd; i < num_points; i++) {
*c++ = (*a++) * scalar;
}
-
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
-static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
- unsigned int number = 0;
+static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
const unsigned int halfPoints = num_points / 2;
__m128 x, yl, yh, z, tmp1, tmp2;
yl = _mm_set_ps1(lv_creal(scalar));
yh = _mm_set_ps1(lv_cimag(scalar));
- for(;number < halfPoints; number++){
+ for (; number < halfPoints; number++) {
- x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
- tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+ tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
- x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+ x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
- tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+ tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
- z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z = _mm_addsub_ps(tmp1,
+ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
- _mm_store_ps((float*)c,z); // Store the results back into the C container
+ _mm_store_ps((float*)c, z); // Store the results back into the C container
- a += 2;
- c += 2;
+ a += 2;
+ c += 2;
}
- if((num_points % 2) != 0) {
- *c = (*a) * scalar;
+ if ((num_points % 2) != 0) {
+ *c = (*a) * scalar;
}
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t scalar,
+ unsigned int num_points)
+{
lv_32fc_t* cPtr = cVector;
const lv_32fc_t* aPtr = aVector;
unsigned int number = num_points;
scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar);
scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1);
- for(number = 0; number < quarter_points; ++number) {
+ for (number = 0; number < quarter_points; ++number) {
a_val = vld2q_f32((float*)aPtr);
tmp_imag.val[1] = vmulq_f32(a_val.val[1], scalar_val.val[0]);
tmp_imag.val[0] = vmulq_f32(a_val.val[0], scalar_val.val[0]);
cPtr += 4;
}
- for(number = quarter_points*4; number < num_points; number++){
- *cPtr++ = *aPtr++ * scalar;
+ for (number = quarter_points * 4; number < num_points; number++) {
+ *cPtr++ = *aPtr++ * scalar;
}
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_GENERIC
-static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t scalar,
+ unsigned int num_points)
+{
lv_32fc_t* cPtr = cVector;
const lv_32fc_t* aPtr = aVector;
unsigned int number = num_points;
// unwrap loop
- while (number >= 8){
- *cPtr++ = (*aPtr++) * scalar;
- *cPtr++ = (*aPtr++) * scalar;
- *cPtr++ = (*aPtr++) * scalar;
- *cPtr++ = (*aPtr++) * scalar;
- *cPtr++ = (*aPtr++) * scalar;
- *cPtr++ = (*aPtr++) * scalar;
- *cPtr++ = (*aPtr++) * scalar;
- *cPtr++ = (*aPtr++) * scalar;
- number -= 8;
+ while (number >= 8) {
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ number -= 8;
}
// clean up any remaining
while (number-- > 0)
- *cPtr++ = *aPtr++ * scalar;
+ *cPtr++ = *aPtr++ * scalar;
}
#endif /* LV_HAVE_GENERIC */
#define INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H
-#include <volk/volk_complex.h>
#include <stdio.h>
#include <volk/volk_32fc_s32fc_x2_rotator_32fc.h>
+#include <volk/volk_complex.h>
#ifdef LV_HAVE_GENERIC
-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
- lv_32fc_t phase[1] = {lv_cmake(.3, 0.95393)};
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ unsigned int num_points)
+{
+ lv_32fc_t phase[1] = { lv_cmake(.3, 0.95393) };
(*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
- volk_32fc_s32fc_x2_rotator_32fc_generic(outVector, inVector, phase_inc_n, phase, num_points);
-
+ const lv_32fc_t phase_inc_n =
+ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+ volk_32fc_s32fc_x2_rotator_32fc_generic(
+ outVector, inVector, phase_inc_n, phase, num_points);
}
#endif /* LV_HAVE_GENERIC */
#include <arm_neon.h>
#include <volk/volk_neon_intrinsics.h>
-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_neon(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
- lv_32fc_t phase[1] = {lv_cmake(.3, 0.95393)};
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_neon(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ unsigned int num_points)
+{
+ lv_32fc_t phase[1] = { lv_cmake(.3, 0.95393) };
(*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
- volk_32fc_s32fc_x2_rotator_32fc_neon(outVector, inVector, phase_inc_n, phase, num_points);
-
+ const lv_32fc_t phase_inc_n =
+ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+ volk_32fc_s32fc_x2_rotator_32fc_neon(
+ outVector, inVector, phase_inc_n, phase, num_points);
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
- lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ unsigned int num_points)
+{
+ lv_32fc_t phase[1] = { lv_cmake(.3, .95393) };
(*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
- volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(outVector, inVector, phase_inc_n, phase, num_points);
-
+ const lv_32fc_t phase_inc_n =
+ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+ volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(
+ outVector, inVector, phase_inc_n, phase, num_points);
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
- lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_sse4_1(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ unsigned int num_points)
+{
+ lv_32fc_t phase[1] = { lv_cmake(.3, .95393) };
(*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
- volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(outVector, inVector, phase_inc_n, phase, num_points);
-
+ const lv_32fc_t phase_inc_n =
+ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+ volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(
+ outVector, inVector, phase_inc_n, phase, num_points);
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
- lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ unsigned int num_points)
+{
+ lv_32fc_t phase[1] = { lv_cmake(.3, .95393) };
(*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
- volk_32fc_s32fc_x2_rotator_32fc_a_avx(outVector, inVector, phase_inc_n, phase, num_points);
+ const lv_32fc_t phase_inc_n =
+ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+ volk_32fc_s32fc_x2_rotator_32fc_a_avx(
+ outVector, inVector, phase_inc_n, phase, num_points);
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
- lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ unsigned int num_points)
+{
+ lv_32fc_t phase[1] = { lv_cmake(.3, .95393) };
(*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
- volk_32fc_s32fc_x2_rotator_32fc_u_avx(outVector, inVector, phase_inc_n, phase, num_points);
+ const lv_32fc_t phase_inc_n =
+ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+ volk_32fc_s32fc_x2_rotator_32fc_u_avx(
+ outVector, inVector, phase_inc_n, phase, num_points);
}
#endif /* LV_HAVE_AVX */
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>
-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
- lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx_fma(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ unsigned int num_points)
+{
+ lv_32fc_t phase[1] = { lv_cmake(.3, .95393) };
(*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
- volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(outVector, inVector, phase_inc_n, phase, num_points);
+ const lv_32fc_t phase_inc_n =
+ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+ volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(
+ outVector, inVector, phase_inc_n, phase, num_points);
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA*/
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>
-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
- lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx_fma(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ unsigned int num_points)
+{
+ lv_32fc_t phase[1] = { lv_cmake(.3, .95393) };
(*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
- volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(outVector, inVector, phase_inc_n, phase, num_points);
+ const lv_32fc_t phase_inc_n =
+ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+ volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(
+ outVector, inVector, phase_inc_n, phase, num_points);
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA*/
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32fc_s32fc_x2_rotator_32fc(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
- * \endcode
+ * void volk_32fc_s32fc_x2_rotator_32fc(lv_32fc_t* outVector, const lv_32fc_t* inVector,
+ * const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points) \endcode
*
* \b Inputs
* \li inVector: Vector to be rotated.
* \li phase_inc: rotational velocity.
* \li phase: initial phase offset.
- * \li num_points: The number of values in inVector to be rotated and stored into outVector.
+ * \li num_points: The number of values in inVector to be rotated and stored into
+ * outVector.
*
* \b Outputs
* \li outVector: The vector where the results will be stored.
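 *
 * \b Example
 *
 * A minimal sketch of driving the dispatcher shown above; out, in and num_points
 * stand for caller-provided buffers and length, and cosf()/sinf() from <math.h>
 * are only used to build a unit-magnitude phase increment of 0.1 rad per sample.
 * \code
 *   lv_32fc_t phase_inc = lv_cmake(cosf(0.1f), sinf(0.1f));
 *   lv_32fc_t phase = lv_cmake(1.0f, 0.0f); // start at 0 rad
 *   volk_32fc_s32fc_x2_rotator_32fc(out, in, phase_inc, &phase, num_points);
 *   // 'phase' now carries the rotator state, so a following call continues seamlessly.
 * \endcode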
#define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
-#include <volk/volk_complex.h>
+#include <math.h>
#include <stdio.h>
#include <stdlib.h>
-#include <math.h>
+#include <volk/volk_complex.h>
#define ROTATOR_RELOAD 512
#ifdef LV_HAVE_GENERIC
-static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ lv_32fc_t* phase,
+ unsigned int num_points)
+{
unsigned int i = 0;
int j = 0;
- for(i = 0; i < (unsigned int)(num_points/ROTATOR_RELOAD); ++i) {
- for(j = 0; j < ROTATOR_RELOAD; ++j) {
+ for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
+ for (j = 0; j < ROTATOR_RELOAD; ++j) {
*outVector++ = *inVector++ * (*phase);
(*phase) *= phase_inc;
}
(*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
}
- for(i = 0; i < num_points%ROTATOR_RELOAD; ++i) {
+ for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) {
*outVector++ = *inVector++ * (*phase);
(*phase) *= phase_inc;
}
- if(i){
+ if (i) {
// Make sure, we normalize phase on every call!
(*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
}
#include <arm_neon.h>
#include <volk/volk_neon_intrinsics.h>
-static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
+static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ lv_32fc_t* phase,
+ unsigned int num_points)
{
lv_32fc_t* outputVectorPtr = outVector;
const lv_32fc_t* inputVectorPtr = inVector;
lv_32fc_t incr = 1;
- lv_32fc_t phasePtr[4] = {(*phase), (*phase), (*phase), (*phase)};
+ lv_32fc_t phasePtr[4] = { (*phase), (*phase), (*phase), (*phase) };
float32x4x2_t input_vec;
float32x4x2_t output_vec;
-
+
unsigned int i = 0, j = 0;
const unsigned int quarter_points = num_points / 4;
-
- for(i = 0; i < 4; ++i) {
+
+ for (i = 0; i < 4; ++i) {
phasePtr[i] *= incr;
incr *= (phase_inc);
}
-
+
    // Notice that incr has been incremented in the previous loop
- const lv_32fc_t incrPtr[4] = {incr, incr, incr, incr};
- const float32x4x2_t incr_vec = vld2q_f32((float*) incrPtr);
- float32x4x2_t phase_vec = vld2q_f32((float*) phasePtr);
-
- for(i = 0; i < (unsigned int)(quarter_points/ROTATOR_RELOAD); i++) {
- for(j = 0; j < ROTATOR_RELOAD; j++) {
- input_vec = vld2q_f32((float*) inputVectorPtr);
+ const lv_32fc_t incrPtr[4] = { incr, incr, incr, incr };
+ const float32x4x2_t incr_vec = vld2q_f32((float*)incrPtr);
+ float32x4x2_t phase_vec = vld2q_f32((float*)phasePtr);
+
+ for (i = 0; i < (unsigned int)(quarter_points / ROTATOR_RELOAD); i++) {
+ for (j = 0; j < ROTATOR_RELOAD; j++) {
+ input_vec = vld2q_f32((float*)inputVectorPtr);
// Prefetch next one, speeds things up
- __VOLK_PREFETCH(inputVectorPtr+4);
+ __VOLK_PREFETCH(inputVectorPtr + 4);
// Rotate
output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
// Increase phase
phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
// Store output
vst2q_f32((float*)outputVectorPtr, output_vec);
-
- outputVectorPtr+=4;
- inputVectorPtr+=4;
+
+ outputVectorPtr += 4;
+ inputVectorPtr += 4;
}
// normalize phase so magnitude doesn't grow because of
// floating point rounding error
phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
}
-
- for(i = 0; i < quarter_points % ROTATOR_RELOAD; i++) {
- input_vec = vld2q_f32((float*) inputVectorPtr);
+
+ for (i = 0; i < quarter_points % ROTATOR_RELOAD; i++) {
+ input_vec = vld2q_f32((float*)inputVectorPtr);
// Prefetch next one, speeds things up
- __VOLK_PREFETCH(inputVectorPtr+4);
+ __VOLK_PREFETCH(inputVectorPtr + 4);
// Rotate
output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
// Increase phase
phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
// Store output
vst2q_f32((float*)outputVectorPtr, output_vec);
-
- outputVectorPtr+=4;
- inputVectorPtr+=4;
+
+ outputVectorPtr += 4;
+ inputVectorPtr += 4;
}
// if(i) == true means we looped above
if (i) {
}
// Store current phase
vst2q_f32((float*)phasePtr, phase_vec);
-
+
// Deal with the rest
- for(i = 0; i < num_points % 4; i++) {
+ for (i = 0; i < num_points % 4; i++) {
*outputVectorPtr++ = *inputVectorPtr++ * phasePtr[0];
phasePtr[0] *= (phase_inc);
}
-
+
    // For continuous phase the next time we call this function
(*phase) = phasePtr[0];
}
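/* The ROTATOR_RELOAD blocks in the kernels above exist because repeatedly multiplying
 * by phase_inc in single precision lets the magnitude of the accumulated phase drift
 * away from 1. A minimal scalar sketch of the same guard (hypothetical helper name):
 */
static inline lv_32fc_t rotator_renormalize_phase(lv_32fc_t phase)
{
    // Rescale the accumulated phase back onto the unit circle.
    return phase / hypotf(lv_creal(phase), lv_cimag(phase));
}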
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
-static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ lv_32fc_t* phase,
+ unsigned int num_points)
+{
lv_32fc_t* cPtr = outVector;
const lv_32fc_t* aPtr = inVector;
lv_32fc_t incr = 1;
- lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)};
+ lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };
unsigned int i, j = 0;
- for(i = 0; i < 2; ++i) {
+ for (i = 0; i < 2; ++i) {
phase_Ptr[i] *= incr;
incr *= (phase_inc);
}
__m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
phase_Val = _mm_loadu_ps((float*)phase_Ptr);
- inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
+ inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));
const unsigned int halfPoints = num_points / 2;
- for(i = 0; i < (unsigned int)(halfPoints/ROTATOR_RELOAD); i++) {
- for(j = 0; j < ROTATOR_RELOAD; ++j) {
+ for (i = 0; i < (unsigned int)(halfPoints / ROTATOR_RELOAD); i++) {
+ for (j = 0; j < ROTATOR_RELOAD; ++j) {
aVal = _mm_load_ps((float*)aPtr);
tmp2 = _mm_sqrt_ps(tmp1);
phase_Val = _mm_div_ps(phase_Val, tmp2);
}
- for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) {
+ for (i = 0; i < halfPoints % ROTATOR_RELOAD; ++i) {
aVal = _mm_load_ps((float*)aPtr);
yl = _mm_moveldup_ps(phase_Val);
}
(*phase) = phase_Ptr[0];
-
}
#endif /* LV_HAVE_SSE4_1 for aligned */
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
-static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ lv_32fc_t* phase,
+ unsigned int num_points)
+{
lv_32fc_t* cPtr = outVector;
const lv_32fc_t* aPtr = inVector;
lv_32fc_t incr = 1;
- lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)};
+ lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };
unsigned int i, j = 0;
- for(i = 0; i < 2; ++i) {
+ for (i = 0; i < 2; ++i) {
phase_Ptr[i] *= incr;
incr *= (phase_inc);
}
__m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
phase_Val = _mm_loadu_ps((float*)phase_Ptr);
- inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
+ inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));
const unsigned int halfPoints = num_points / 2;
- for(i = 0; i < (unsigned int)(halfPoints/ROTATOR_RELOAD); i++) {
- for(j = 0; j < ROTATOR_RELOAD; ++j) {
+ for (i = 0; i < (unsigned int)(halfPoints / ROTATOR_RELOAD); i++) {
+ for (j = 0; j < ROTATOR_RELOAD; ++j) {
aVal = _mm_loadu_ps((float*)aPtr);
tmp2 = _mm_sqrt_ps(tmp1);
phase_Val = _mm_div_ps(phase_Val, tmp2);
}
- for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) {
+ for (i = 0; i < halfPoints % ROTATOR_RELOAD; ++i) {
aVal = _mm_loadu_ps((float*)aPtr);
yl = _mm_moveldup_ps(phase_Val);
}
(*phase) = phase_Ptr[0];
-
}
#endif /* LV_HAVE_SSE4_1 */
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>
-static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ lv_32fc_t* phase,
+ unsigned int num_points)
+{
lv_32fc_t* cPtr = outVector;
const lv_32fc_t* aPtr = inVector;
lv_32fc_t incr = lv_cmake(1.0, 0.0);
- lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};
+ lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
unsigned int i, j = 0;
- for(i = 0; i < 4; ++i) {
+ for (i = 0; i < 4; ++i) {
phase_Ptr[i] *= incr;
incr *= (phase_inc);
}
__m256 aVal, phase_Val, z;
phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
-
- const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),
- lv_cimag(incr), lv_creal(incr),
- lv_cimag(incr), lv_creal(incr),
- lv_cimag(incr), lv_creal(incr));
+
+ const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr));
const unsigned int fourthPoints = num_points / 4;
- for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) {
- for(j = 0; j < ROTATOR_RELOAD; ++j) {
+ for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); i++) {
+ for (j = 0; j < ROTATOR_RELOAD; ++j) {
aVal = _mm256_load_ps((float*)aPtr);
}
phase_Val = _mm256_normalize_ps(phase_Val);
}
-
- for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
+
+ for (i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) {
aVal = _mm256_load_ps((float*)aPtr);
z = _mm256_complexmul_ps(aVal, phase_Val);
if (i) {
phase_Val = _mm256_normalize_ps(phase_Val);
}
-
+
_mm256_storeu_ps((float*)phase_Ptr, phase_Val);
(*phase) = phase_Ptr[0];
- volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points%4);
+ volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4);
}
#endif /* LV_HAVE_AVX for aligned */
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>
-static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ lv_32fc_t* phase,
+ unsigned int num_points)
+{
lv_32fc_t* cPtr = outVector;
const lv_32fc_t* aPtr = inVector;
lv_32fc_t incr = lv_cmake(1.0, 0.0);
- lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};
+ lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
unsigned int i, j = 0;
- for(i = 0; i < 4; ++i) {
+ for (i = 0; i < 4; ++i) {
phase_Ptr[i] *= incr;
incr *= (phase_inc);
}
__m256 aVal, phase_Val, z;
phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
-
- const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),
- lv_cimag(incr), lv_creal(incr),
- lv_cimag(incr), lv_creal(incr),
- lv_cimag(incr), lv_creal(incr));
-
+
+ const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr));
+
const unsigned int fourthPoints = num_points / 4;
- for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); ++i) {
- for(j = 0; j < ROTATOR_RELOAD; ++j) {
+ for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); ++i) {
+ for (j = 0; j < ROTATOR_RELOAD; ++j) {
aVal = _mm256_loadu_ps((float*)aPtr);
-
+
z = _mm256_complexmul_ps(aVal, phase_Val);
phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
cPtr += 4;
}
phase_Val = _mm256_normalize_ps(phase_Val);
-
}
-
- for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
+
+    for (i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) {
aVal = _mm256_loadu_ps((float*)aPtr);
z = _mm256_complexmul_ps(aVal, phase_Val);
_mm256_storeu_ps((float*)phase_Ptr, phase_Val);
(*phase) = phase_Ptr[0];
- volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points%4);
+ volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4);
}
#endif /* LV_HAVE_AVX */
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>
-static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ lv_32fc_t* phase,
+ unsigned int num_points)
+{
lv_32fc_t* cPtr = outVector;
const lv_32fc_t* aPtr = inVector;
lv_32fc_t incr = 1;
- __VOLK_ATTR_ALIGNED(32) lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};
+ __VOLK_ATTR_ALIGNED(32)
+ lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
unsigned int i, j = 0;
- for(i = 0; i < 4; ++i) {
+ for (i = 0; i < 4; ++i) {
phase_Ptr[i] *= incr;
incr *= (phase_inc);
}
__m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
phase_Val = _mm256_load_ps((float*)phase_Ptr);
- inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
+ inc_Val = _mm256_set_ps(lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr));
const unsigned int fourthPoints = num_points / 4;
- for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) {
- for(j = 0; j < ROTATOR_RELOAD; ++j) {
+ for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); i++) {
+ for (j = 0; j < ROTATOR_RELOAD; ++j) {
aVal = _mm256_load_ps((float*)aPtr);
tmp2 = _mm256_sqrt_ps(tmp1);
phase_Val = _mm256_div_ps(phase_Val, tmp2);
}
- for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
+ for (i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) {
aVal = _mm256_load_ps((float*)aPtr);
yl = _mm256_moveldup_ps(phase_Val);
}
_mm256_store_ps((float*)phase_Ptr, phase_Val);
- for(i = 0; i < num_points%4; ++i) {
+ for (i = 0; i < num_points % 4; ++i) {
*cPtr++ = *aPtr++ * phase_Ptr[0];
phase_Ptr[0] *= (phase_inc);
}
(*phase) = phase_Ptr[0];
-
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned*/
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>
-static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ lv_32fc_t* phase,
+ unsigned int num_points)
+{
lv_32fc_t* cPtr = outVector;
const lv_32fc_t* aPtr = inVector;
lv_32fc_t incr = 1;
- lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};
+ lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
unsigned int i, j = 0;
- for(i = 0; i < 4; ++i) {
+ for (i = 0; i < 4; ++i) {
phase_Ptr[i] *= incr;
incr *= (phase_inc);
}
__m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
- inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
+ inc_Val = _mm256_set_ps(lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr));
const unsigned int fourthPoints = num_points / 4;
- for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) {
- for(j = 0; j < ROTATOR_RELOAD; ++j) {
+ for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); i++) {
+ for (j = 0; j < ROTATOR_RELOAD; ++j) {
aVal = _mm256_loadu_ps((float*)aPtr);
tmp2 = _mm256_sqrt_ps(tmp1);
phase_Val = _mm256_div_ps(phase_Val, tmp2);
}
- for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
+ for (i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) {
aVal = _mm256_loadu_ps((float*)aPtr);
yl = _mm256_moveldup_ps(phase_Val);
}
_mm256_storeu_ps((float*)phase_Ptr, phase_Val);
- for(i = 0; i < num_points%4; ++i) {
+ for (i = 0; i < num_points % 4; ++i) {
*cPtr++ = *aPtr++ * phase_Ptr[0];
phase_Ptr[0] *= (phase_inc);
}
(*phase) = phase_Ptr[0];
-
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA*/
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32fc_x2_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points)
- * \endcode
+ * void volk_32fc_x2_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
+ * lv_32fc_t* bVector, unsigned int num_points) \endcode
*
* \b Inputs
* \li aVector: First vector of input points.
*
* \b Example
*
- * The following example adds the increasing and decreasing vectors such that the result of every summation pair is 10
+ * The following example adds the increasing and decreasing vectors such that the result of
+ * every summation pair is 10
*
* \code
* int N = 10;
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32fc_x2_add_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_add_32fc_u_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
- const lv_32fc_t* bPtr= bVector;
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr = bVector;
- __m256 aVal, bVal, cVal;
- for(;number < quarterPoints; number++){
+ __m256 aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
- aVal = _mm256_loadu_ps((float *) aPtr);
- bVal = _mm256_loadu_ps((float *) bPtr);
+ aVal = _mm256_loadu_ps((float*)aPtr);
+ bVal = _mm256_loadu_ps((float*)bPtr);
- cVal = _mm256_add_ps(aVal, bVal);
+ cVal = _mm256_add_ps(aVal, bVal);
- _mm256_storeu_ps((float *) cPtr,cVal); // Store the results back into the C container
+ _mm256_storeu_ps((float*)cPtr,
+ cVal); // Store the results back into the C container
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) + (*bPtr++);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_32fc_x2_add_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_add_32fc_a_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
- const lv_32fc_t* bPtr= bVector;
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr = bVector;
- __m256 aVal, bVal, cVal;
- for(;number < quarterPoints; number++){
+ __m256 aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
- aVal = _mm256_load_ps((float*) aPtr);
- bVal = _mm256_load_ps((float*) bPtr);
+ aVal = _mm256_load_ps((float*)aPtr);
+ bVal = _mm256_load_ps((float*)bPtr);
- cVal = _mm256_add_ps(aVal, bVal);
+ cVal = _mm256_add_ps(aVal, bVal);
- _mm256_store_ps((float*) cPtr,cVal); // Store the results back into the C container
+ _mm256_store_ps((float*)cPtr,
+ cVal); // Store the results back into the C container
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) + (*bPtr++);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_32fc_x2_add_32fc_u_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_add_32fc_u_sse(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int halfPoints = num_points / 2;
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
- const lv_32fc_t* bPtr= bVector;
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr = bVector;
- __m128 aVal, bVal, cVal;
- for(;number < halfPoints; number++){
+ __m128 aVal, bVal, cVal;
+ for (; number < halfPoints; number++) {
- aVal = _mm_loadu_ps((float *) aPtr);
- bVal = _mm_loadu_ps((float *) bPtr);
+ aVal = _mm_loadu_ps((float*)aPtr);
+ bVal = _mm_loadu_ps((float*)bPtr);
- cVal = _mm_add_ps(aVal, bVal);
+ cVal = _mm_add_ps(aVal, bVal);
- _mm_storeu_ps((float*) cPtr, cVal); // Store the results back into the C container
+ _mm_storeu_ps((float*)cPtr, cVal); // Store the results back into the C container
- aPtr += 2;
- bPtr += 2;
- cPtr += 2;
- }
+ aPtr += 2;
+ bPtr += 2;
+ cPtr += 2;
+ }
- number = halfPoints * 2;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) + (*bPtr++);
- }
+ number = halfPoints * 2;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32fc_x2_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_add_32fc_generic(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
{
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
- const lv_32fc_t* bPtr= bVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *cPtr++ = (*aPtr++) + (*bPtr++);
- }
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_32fc_x2_add_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_add_32fc_a_sse(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int halfPoints = num_points / 2;
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
- const lv_32fc_t* bPtr= bVector;
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr = bVector;
- __m128 aVal, bVal, cVal;
- for(;number < halfPoints; number++){
- aVal = _mm_load_ps((float *) aPtr);
- bVal = _mm_load_ps((float *) bPtr);
+ __m128 aVal, bVal, cVal;
+ for (; number < halfPoints; number++) {
+ aVal = _mm_load_ps((float*)aPtr);
+ bVal = _mm_load_ps((float*)bPtr);
- cVal = _mm_add_ps(aVal, bVal);
+ cVal = _mm_add_ps(aVal, bVal);
- _mm_store_ps((float *) cPtr,cVal); // Store the results back into the C container
+ _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container
- aPtr += 2;
- bPtr += 2;
- cPtr += 2;
- }
+ aPtr += 2;
+ bPtr += 2;
+ cPtr += 2;
+ }
- number = halfPoints * 2;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) + (*bPtr++);
- }
+ number = halfPoints * 2;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_32fc_x2_add_32fc_u_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_add_32fc_u_neon(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int halfPoints = num_points / 2;
-
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
- const lv_32fc_t* bPtr= bVector;
- float32x4_t aVal, bVal, cVal;
- for(number=0; number < halfPoints; number++){
- // Load in to NEON registers
- aVal = vld1q_f32((const float32_t*)(aPtr));
- bVal = vld1q_f32((const float32_t*)(bPtr));
- __VOLK_PREFETCH(aPtr+2);
- __VOLK_PREFETCH(bPtr+2);
-
- // vector add
- cVal = vaddq_f32(aVal, bVal);
- // Store the results back into the C container
- vst1q_f32((float*)(cPtr),cVal);
-
- aPtr += 2; // q uses quadwords, 4 lv_32fc_ts per vadd
- bPtr += 2;
- cPtr += 2;
- }
-
- number = halfPoints * 2; // should be = num_points
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) + (*bPtr++);
- }
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr = bVector;
+ float32x4_t aVal, bVal, cVal;
+ for (number = 0; number < halfPoints; number++) {
+ // Load in to NEON registers
+ aVal = vld1q_f32((const float32_t*)(aPtr));
+ bVal = vld1q_f32((const float32_t*)(bPtr));
+ __VOLK_PREFETCH(aPtr + 2);
+ __VOLK_PREFETCH(bPtr + 2);
+
+ // vector add
+ cVal = vaddq_f32(aVal, bVal);
+ // Store the results back into the C container
+ vst1q_f32((float*)(cPtr), cVal);
+
+ aPtr += 2; // q uses quadwords, 4 lv_32fc_ts per vadd
+ bPtr += 2;
+ cPtr += 2;
+ }
+
+ number = halfPoints * 2; // should be = num_points
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
}
#endif /* LV_HAVE_NEON */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32fc_x2_conjugate_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points)
- * \endcode
+ * void volk_32fc_x2_conjugate_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input,
+ * const lv_32fc_t* taps, unsigned int num_points) \endcode
*
* \b Inputs
* \li input: vector of complex floats.
#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
-#include<volk/volk_complex.h>
+#include <volk/volk_complex.h>
#ifdef LV_HAVE_GENERIC
-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
- const unsigned int num_bytes = num_points*8;
+ const unsigned int num_bytes = num_points * 8;
- float * res = (float*) result;
- float * in = (float*) input;
- float * tp = (float*) taps;
- unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
+ float* res = (float*)result;
+ float* in = (float*)input;
+ float* tp = (float*)taps;
+ unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
- float sum0[2] = {0,0};
- float sum1[2] = {0,0};
- unsigned int i = 0;
+ float sum0[2] = { 0, 0 };
+ float sum1[2] = { 0, 0 };
+ unsigned int i = 0;
- for(i = 0; i < n_2_ccomplex_blocks; ++i) {
- sum0[0] += in[0] * tp[0] + in[1] * tp[1];
- sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
- sum1[0] += in[2] * tp[2] + in[3] * tp[3];
- sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
+ for (i = 0; i < n_2_ccomplex_blocks; ++i) {
+ sum0[0] += in[0] * tp[0] + in[1] * tp[1];
+ sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
+ sum1[0] += in[2] * tp[2] + in[3] * tp[3];
+ sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
- in += 4;
- tp += 4;
- }
+ in += 4;
+ tp += 4;
+ }
- res[0] = sum0[0] + sum1[0];
- res[1] = sum0[1] + sum1[1];
+ res[0] = sum0[0] + sum1[0];
+ res[1] = sum0[1] + sum1[1];
- if (num_bytes >> 3 & 1) {
- *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
- }
+ if (num_bytes >> 3 & 1) {
+ *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
+ }
}
#endif /*LV_HAVE_GENERIC*/
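/* For reference, the manually unrolled accumulation above computes the same value as
 * this direct form; a minimal sketch with a hypothetical helper name, not part of the
 * kernel API:
 *
 *   *result = sum over i of input[i] * conj(taps[i])
 */
static inline lv_32fc_t conjugate_dot_prod_reference(const lv_32fc_t* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{
    lv_32fc_t acc = lv_cmake(0.0f, 0.0f);
    for (unsigned int i = 0; i < num_points; ++i) {
        acc += input[i] * lv_conj(taps[i]);
    }
    return acc;
}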
#include <immintrin.h>
static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_avx(lv_32fc_t* result,
- const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points)
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
{
- // Partial sums for indices i, i+1, i+2 and i+3.
- __m256 sum_a_mult_b_real = _mm256_setzero_ps();
- __m256 sum_a_mult_b_imag = _mm256_setzero_ps();
-
- for (long unsigned i = 0; i < (num_points & ~3u); i += 4) {
-    /* Four complex elements at a time are processed.
-     * (ar + j⋅ai)*conj(br + j⋅bi) =
- * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
- */
+ // Partial sums for indices i, i+1, i+2 and i+3.
+ __m256 sum_a_mult_b_real = _mm256_setzero_ps();
+ __m256 sum_a_mult_b_imag = _mm256_setzero_ps();
+
+ for (long unsigned i = 0; i < (num_points & ~3u); i += 4) {
+        /* Four complex elements at a time are processed.
+         * (ar + j⋅ai)*conj(br + j⋅bi) =
+ * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
+ */
+
+        /* Load input and taps, split and duplicate real and imaginary parts of taps.
+ * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
+ * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
+ * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
+ * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
+ */
+ __m256 a = _mm256_loadu_ps((const float*)&input[i]);
+ __m256 b = _mm256_loadu_ps((const float*)&taps[i]);
+ __m256 b_real = _mm256_moveldup_ps(b);
+ __m256 b_imag = _mm256_movehdup_ps(b);
+
+ // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
+ sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real));
+ // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
+ sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag));
+ }
-    /* Load input and taps, split and duplicate real and imaginary parts of taps.
- * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
- * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
- * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
- * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
+ // Swap position of −ar⋅bi and ai⋅bi.
+ sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1));
+ // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains four such partial sums.
+ __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
+ /* Sum the four partial sums: Add high half of vector sum to the low one, i.e.
+ * s1 + s3 and s0 + s2 …
*/
- __m256 a = _mm256_loadu_ps((const float *) &input[i]);
- __m256 b = _mm256_loadu_ps((const float *) &taps[i]);
- __m256 b_real = _mm256_moveldup_ps(b);
- __m256 b_imag = _mm256_movehdup_ps(b);
-
- // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
- sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real));
- // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
- sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag));
- }
-
- // Swap position of −ar⋅bi and ai⋅bi.
- sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1));
- // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains four such partial sums.
- __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
- /* Sum the four partial sums: Add high half of vector sum to the low one, i.e.
- * s1 + s3 and s0 + s2 …
- */
- sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01));
- // … and now (s0 + s2) + (s1 + s3)
- sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2)));
- // Store result.
- __m128 lower = _mm256_extractf128_ps(sum, 0);
- _mm_storel_pi((__m64 *) result, lower);
-
- // Handle the last elements if num_points mod 4 is bigger than 0.
- for (long unsigned i = num_points & ~3u; i < num_points; ++i) {
- *result += lv_cmake(
- lv_creal(input[i]) * lv_creal(taps[i]) + lv_cimag(input[i]) * lv_cimag(taps[i]),
- lv_cimag(input[i]) * lv_creal(taps[i]) - lv_creal(input[i]) * lv_cimag(taps[i]));
- }
+ sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01));
+ // … and now (s0 + s2) + (s1 + s3)
+ sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2)));
+ // Store result.
+ __m128 lower = _mm256_extractf128_ps(sum, 0);
+ _mm_storel_pi((__m64*)result, lower);
+
+ // Handle the last elements if num_points mod 4 is bigger than 0.
+ for (long unsigned i = num_points & ~3u; i < num_points; ++i) {
+ *result += lv_cmake(lv_creal(input[i]) * lv_creal(taps[i]) +
+ lv_cimag(input[i]) * lv_cimag(taps[i]),
+ lv_cimag(input[i]) * lv_creal(taps[i]) -
+ lv_creal(input[i]) * lv_cimag(taps[i]));
+ }
}
#endif /* LV_HAVE_AVX */
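/* The tail loop above just expands input[i] * conj(taps[i]) into real and imaginary
 * parts by hand; a minimal sketch of the identity it relies on (hypothetical helper
 * name, illustrative only):
 *
 *   (ar + j*ai) * (br - j*bi) = (ar*br + ai*bi) + j*(ai*br - ar*bi)
 */
static inline lv_32fc_t conj_multiply_expanded(lv_32fc_t a, lv_32fc_t b)
{
    return lv_cmake(lv_creal(a) * lv_creal(b) + lv_cimag(a) * lv_cimag(b),
                    lv_cimag(a) * lv_creal(b) - lv_creal(a) * lv_cimag(b));
}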
#ifdef LV_HAVE_SSE3
-#include <xmmintrin.h>
#include <pmmintrin.h>
+#include <xmmintrin.h>
static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result,
- const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points)
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
{
- // Partial sums for indices i and i+1.
- __m128 sum_a_mult_b_real = _mm_setzero_ps();
- __m128 sum_a_mult_b_imag = _mm_setzero_ps();
-
- for (long unsigned i = 0; i < (num_points & ~1u); i += 2) {
-    /* Two complex elements at a time are processed.
-     * (ar + j⋅ai)*conj(br + j⋅bi) =
- * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
- */
+ // Partial sums for indices i and i+1.
+ __m128 sum_a_mult_b_real = _mm_setzero_ps();
+ __m128 sum_a_mult_b_imag = _mm_setzero_ps();
+
+ for (long unsigned i = 0; i < (num_points & ~1u); i += 2) {
+        /* Two complex elements at a time are processed.
+         * (ar + j⋅ai)*conj(br + j⋅bi) =
+ * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
+ */
+
+        /* Load input and taps, split and duplicate real and imaginary parts of taps.
+ * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
+ * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
+ * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
+ * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
+ */
+ __m128 a = _mm_loadu_ps((const float*)&input[i]);
+ __m128 b = _mm_loadu_ps((const float*)&taps[i]);
+ __m128 b_real = _mm_moveldup_ps(b);
+ __m128 b_imag = _mm_movehdup_ps(b);
+
+        // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
+ sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real));
+ // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
+ sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag));
+ }
-    /* Load input and taps, split and duplicate real and imaginary parts of taps.
- * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
- * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
- * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
- * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
- */
- __m128 a = _mm_loadu_ps((const float *) &input[i]);
- __m128 b = _mm_loadu_ps((const float *) &taps[i]);
- __m128 b_real = _mm_moveldup_ps(b);
- __m128 b_imag = _mm_movehdup_ps(b);
-
-    // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
- sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real));
- // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
- sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag));
- }
-
- // Swap position of −ar⋅bi and ai⋅bi.
- sum_a_mult_b_imag = _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag,
- _MM_SHUFFLE(2, 3, 0, 1));
- // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains two such partial sums.
- __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
- // Sum the two partial sums.
- sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2)));
- // Store result.
- _mm_storel_pi((__m64 *) result, sum);
-
- // Handle the last element if num_points mod 2 is 1.
- if (num_points & 1u) {
- *result += lv_cmake(
- lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) +
- lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]),
- lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) -
- lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1]));
- }
+ // Swap position of −ar⋅bi and ai⋅bi.
+ sum_a_mult_b_imag =
+ _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1));
+ // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains two such partial sums.
+ __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
+ // Sum the two partial sums.
+ sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2)));
+ // Store result.
+ _mm_storel_pi((__m64*)result, sum);
+
+ // Handle the last element if num_points mod 2 is 1.
+ if (num_points & 1u) {
+ *result += lv_cmake(
+ lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) +
+ lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]),
+ lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) -
+ lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1]));
+ }
}
#endif /*LV_HAVE_SSE3*/
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
unsigned int quarter_points = num_points / 4;
unsigned int number;
- lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
- lv_32fc_t* b_ptr = (lv_32fc_t*) input;
+ lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
+ lv_32fc_t* b_ptr = (lv_32fc_t*)input;
// for 2-lane vectors, 1st lane holds the real part,
// 2nd lane holds the imaginary part
float32x4x2_t a_val, b_val, accumulator;
accumulator.val[0] = vdupq_n_f32(0);
accumulator.val[1] = vdupq_n_f32(0);
- for(number = 0; number < quarter_points; ++number) {
+ for (number = 0; number < quarter_points; ++number) {
a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __VOLK_PREFETCH(a_ptr+8);
- __VOLK_PREFETCH(b_ptr+8);
+ __VOLK_PREFETCH(a_ptr + 8);
+ __VOLK_PREFETCH(b_ptr + 8);
// do the first multiply
tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
*result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
// tail case
- for(number = quarter_points*4; number < num_points; ++number) {
- *result += (*a_ptr++) * lv_conj(*b_ptr++);
+ for (number = quarter_points * 4; number < num_points; ++number) {
+ *result += (*a_ptr++) * lv_conj(*b_ptr++);
}
*result = lv_conj(*result);
-
}
#endif /*LV_HAVE_NEON*/
#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
+#include <stdio.h>
#include <volk/volk_common.h>
-#include<volk/volk_complex.h>
-#include<stdio.h>
+#include <volk/volk_complex.h>
#ifdef LV_HAVE_AVX
#include <immintrin.h>
static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_avx(lv_32fc_t* result,
- const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points)
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
{
- // Partial sums for indices i, i+1, i+2 and i+3.
- __m256 sum_a_mult_b_real = _mm256_setzero_ps();
- __m256 sum_a_mult_b_imag = _mm256_setzero_ps();
-
- for (long unsigned i = 0; i < (num_points & ~3u); i += 4) {
-    /* Four complex elements at a time are processed.
-     * (ar + j⋅ai)*conj(br + j⋅bi) =
- * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
- */
+ // Partial sums for indices i, i+1, i+2 and i+3.
+ __m256 sum_a_mult_b_real = _mm256_setzero_ps();
+ __m256 sum_a_mult_b_imag = _mm256_setzero_ps();
+
+ for (long unsigned i = 0; i < (num_points & ~3u); i += 4) {
+        /* Four complex elements at a time are processed.
+         * (ar + j⋅ai)*conj(br + j⋅bi) =
+ * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
+ */
+
+        /* Load input and taps, split and duplicate real and imaginary parts of taps.
+ * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
+ * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
+ * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
+ * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
+ */
+ __m256 a = _mm256_load_ps((const float*)&input[i]);
+ __m256 b = _mm256_load_ps((const float*)&taps[i]);
+ __m256 b_real = _mm256_moveldup_ps(b);
+ __m256 b_imag = _mm256_movehdup_ps(b);
+
+ // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
+ sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real));
+ // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
+ sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag));
+ }
-    /* Load input and taps, split and duplicate real and imaginary parts of taps.
- * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
- * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
- * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
- * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
+ // Swap position of −ar⋅bi and ai⋅bi.
+ sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1));
+ // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains four such partial sums.
+ __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
+ /* Sum the four partial sums: Add high half of vector sum to the low one, i.e.
+ * s1 + s3 and s0 + s2 …
*/
- __m256 a = _mm256_load_ps((const float *) &input[i]);
- __m256 b = _mm256_load_ps((const float *) &taps[i]);
- __m256 b_real = _mm256_moveldup_ps(b);
- __m256 b_imag = _mm256_movehdup_ps(b);
-
- // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
- sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real));
- // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
- sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag));
- }
-
- // Swap position of −ar⋅bi and ai⋅bi.
- sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1));
- // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains four such partial sums.
- __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
- /* Sum the four partial sums: Add high half of vector sum to the low one, i.e.
- * s1 + s3 and s0 + s2 …
- */
- sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01));
- // … and now (s0 + s2) + (s1 + s3)
- sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2)));
- // Store result.
- __m128 lower = _mm256_extractf128_ps(sum, 0);
- _mm_storel_pi((__m64 *) result, lower);
-
- // Handle the last elements if num_points mod 4 is bigger than 0.
- for (long unsigned i = num_points & ~3u; i < num_points; ++i) {
- *result += lv_cmake(
- lv_creal(input[i]) * lv_creal(taps[i]) + lv_cimag(input[i]) * lv_cimag(taps[i]),
- lv_cimag(input[i]) * lv_creal(taps[i]) - lv_creal(input[i]) * lv_cimag(taps[i]));
- }
+ sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01));
+ // … and now (s0 + s2) + (s1 + s3)
+ sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2)));
+ // Store result.
+ __m128 lower = _mm256_extractf128_ps(sum, 0);
+ _mm_storel_pi((__m64*)result, lower);
+
+ // Handle the last elements if num_points mod 4 is bigger than 0.
+ for (long unsigned i = num_points & ~3u; i < num_points; ++i) {
+ *result += lv_cmake(lv_creal(input[i]) * lv_creal(taps[i]) +
+ lv_cimag(input[i]) * lv_cimag(taps[i]),
+ lv_cimag(input[i]) * lv_creal(taps[i]) -
+ lv_creal(input[i]) * lv_cimag(taps[i]));
+ }
}
#endif /* LV_HAVE_AVX */
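For reference, the conjugate dot product computed by the kernels above follows the identity a*conj(b) = (ar*br + ai*bi) + j*(ai*br - ar*bi); a minimal scalar sketch (the function name is illustrative, not a VOLK kernel) simply mirrors the tail loop of the AVX version:

#include <volk/volk_complex.h>

/* Scalar reference for the conjugate dot product: result = sum(a[i] * conj(b[i])).
 * Mirrors the tail loop of the AVX kernel above; useful only as a cross-check. */
static inline lv_32fc_t scalar_conjugate_dot(const lv_32fc_t* a,
                                             const lv_32fc_t* b,
                                             unsigned int num_points)
{
    lv_32fc_t acc = lv_cmake(0.0f, 0.0f);
    for (unsigned int i = 0; i < num_points; ++i) {
        acc += lv_cmake(lv_creal(a[i]) * lv_creal(b[i]) + lv_cimag(a[i]) * lv_cimag(b[i]),
                        lv_cimag(a[i]) * lv_creal(b[i]) - lv_creal(a[i]) * lv_cimag(b[i]));
    }
    return acc;
}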
#ifdef LV_HAVE_SSE3
-#include <xmmintrin.h>
#include <pmmintrin.h>
+#include <xmmintrin.h>
static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse3(lv_32fc_t* result,
- const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points)
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
{
- // Partial sums for indices i and i+1.
- __m128 sum_a_mult_b_real = _mm_setzero_ps();
- __m128 sum_a_mult_b_imag = _mm_setzero_ps();
-
- for (long unsigned i = 0; i < (num_points & ~1u); i += 2) {
- /* Two complex elements a time are processed.
- * (ar + j⋅ai)*conj(br + j⋅bi) =
- * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
- */
+ // Partial sums for indices i and i+1.
+ __m128 sum_a_mult_b_real = _mm_setzero_ps();
+ __m128 sum_a_mult_b_imag = _mm_setzero_ps();
+
+ for (long unsigned i = 0; i < (num_points & ~1u); i += 2) {
+ /* Two complex elements at a time are processed.
+ * (ar + j⋅ai)*conj(br + j⋅bi) =
+ * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
+ */
+
+ /* Load input and taps, split and duplicate real and imaginary parts of taps.
+ * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
+ * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
+ * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
+ * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
+ */
+ __m128 a = _mm_load_ps((const float*)&input[i]);
+ __m128 b = _mm_load_ps((const float*)&taps[i]);
+ __m128 b_real = _mm_moveldup_ps(b);
+ __m128 b_imag = _mm_movehdup_ps(b);
+
+ // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
+ sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real));
+ // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
+ sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag));
+ }
- /* Load input and taps, split and duplicate real und imaginary parts of taps.
- * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
- * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
- * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
- * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
- */
- __m128 a = _mm_load_ps((const float *) &input[i]);
- __m128 b = _mm_load_ps((const float *) &taps[i]);
- __m128 b_real = _mm_moveldup_ps(b);
- __m128 b_imag = _mm_movehdup_ps(b);
-
- // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
- sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real));
- // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
- sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag));
- }
-
- // Swap position of −ar⋅bi and ai⋅bi.
- sum_a_mult_b_imag = _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag,
- _MM_SHUFFLE(2, 3, 0, 1));
- // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains two such partial sums.
- __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
- // Sum the two partial sums.
- sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2)));
- // Store result.
- _mm_storel_pi((__m64 *) result, sum);
-
- // Handle the last element if num_points mod 2 is 1.
- if (num_points & 1u) {
- *result += lv_cmake(
- lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) +
- lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]),
- lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) -
- lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1]));
- }
+ // Swap position of −ar⋅bi and ai⋅bi.
+ sum_a_mult_b_imag =
+ _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1));
+ // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains two such partial sums.
+ __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
+ // Sum the two partial sums.
+ sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2)));
+ // Store result.
+ _mm_storel_pi((__m64*)result, sum);
+
+ // Handle the last element if num_points mod 2 is 1.
+ if (num_points & 1u) {
+ *result += lv_cmake(
+ lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) +
+ lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]),
+ lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) -
+ lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1]));
+ }
}
#endif /*LV_HAVE_SSE3*/
#ifdef LV_HAVE_GENERIC
-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
- const unsigned int num_bytes = num_points*8;
+ const unsigned int num_bytes = num_points * 8;
- float * res = (float*) result;
- float * in = (float*) input;
- float * tp = (float*) taps;
- unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
+ float* res = (float*)result;
+ float* in = (float*)input;
+ float* tp = (float*)taps;
+ unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
- float sum0[2] = {0,0};
- float sum1[2] = {0,0};
- unsigned int i = 0;
+ float sum0[2] = { 0, 0 };
+ float sum1[2] = { 0, 0 };
+ unsigned int i = 0;
- for(i = 0; i < n_2_ccomplex_blocks; ++i) {
- sum0[0] += in[0] * tp[0] + in[1] * tp[1];
- sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
- sum1[0] += in[2] * tp[2] + in[3] * tp[3];
- sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
+ for (i = 0; i < n_2_ccomplex_blocks; ++i) {
+ sum0[0] += in[0] * tp[0] + in[1] * tp[1];
+ sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
+ sum1[0] += in[2] * tp[2] + in[3] * tp[3];
+ sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
- in += 4;
- tp += 4;
- }
+ in += 4;
+ tp += 4;
+ }
- res[0] = sum0[0] + sum1[0];
- res[1] = sum0[1] + sum1[1];
+ res[0] = sum0[0] + sum1[0];
+ res[1] = sum0[1] + sum1[1];
- if (num_bytes >> 3 & 1) {
- *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
- }
+ if (num_bytes >> 3 & 1) {
+ *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
+ }
}
#endif /*LV_HAVE_GENERIC*/
#if LV_HAVE_SSE && LV_HAVE_64
-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
-
- const unsigned int num_bytes = num_points*8;
-
- __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
-
- __VOLK_ASM __VOLK_VOLATILE
- (
- "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t"
- "# const float *taps, unsigned num_bytes)\n\t"
- "# float sum0 = 0;\n\t"
- "# float sum1 = 0;\n\t"
- "# float sum2 = 0;\n\t"
- "# float sum3 = 0;\n\t"
- "# do {\n\t"
- "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
- "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
- "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
- "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
- "# input += 4;\n\t"
- "# taps += 4; \n\t"
- "# } while (--n_2_ccomplex_blocks != 0);\n\t"
- "# result[0] = sum0 + sum2;\n\t"
- "# result[1] = sum1 + sum3;\n\t"
- "# TODO: prefetch and better scheduling\n\t"
- " xor %%r9, %%r9\n\t"
- " xor %%r10, %%r10\n\t"
- " movq %[conjugator], %%r9\n\t"
- " movq %%rcx, %%rax\n\t"
- " movaps 0(%%r9), %%xmm8\n\t"
- " movq %%rcx, %%r8\n\t"
- " movq %[rsi], %%r9\n\t"
- " movq %[rdx], %%r10\n\t"
- " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
- " movaps 0(%%r9), %%xmm0\n\t"
- " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
- " movups 0(%%r10), %%xmm2\n\t"
- " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
- " shr $4, %%r8\n\t"
- " xorps %%xmm8, %%xmm2\n\t"
- " jmp .%=L1_test\n\t"
- " # 4 taps / loop\n\t"
- " # something like ?? cycles / loop\n\t"
- ".%=Loop1: \n\t"
- "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
- "# movaps (%%r9), %%xmmA\n\t"
- "# movaps (%%r10), %%xmmB\n\t"
- "# movaps %%xmmA, %%xmmZ\n\t"
- "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
- "# mulps %%xmmB, %%xmmA\n\t"
- "# mulps %%xmmZ, %%xmmB\n\t"
- "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
- "# xorps %%xmmPN, %%xmmA\n\t"
- "# movaps %%xmmA, %%xmmZ\n\t"
- "# unpcklps %%xmmB, %%xmmA\n\t"
- "# unpckhps %%xmmB, %%xmmZ\n\t"
- "# movaps %%xmmZ, %%xmmY\n\t"
- "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
- "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
- "# addps %%xmmZ, %%xmmA\n\t"
- "# addps %%xmmA, %%xmmC\n\t"
- "# A=xmm0, B=xmm2, Z=xmm4\n\t"
- "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
- " movaps 16(%%r9), %%xmm1\n\t"
- " movaps %%xmm0, %%xmm4\n\t"
- " mulps %%xmm2, %%xmm0\n\t"
- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
- " movaps 16(%%r10), %%xmm3\n\t"
- " movaps %%xmm1, %%xmm5\n\t"
- " xorps %%xmm8, %%xmm3\n\t"
- " addps %%xmm0, %%xmm6\n\t"
- " mulps %%xmm3, %%xmm1\n\t"
- " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
- " addps %%xmm1, %%xmm6\n\t"
- " mulps %%xmm4, %%xmm2\n\t"
- " movaps 32(%%r9), %%xmm0\n\t"
- " addps %%xmm2, %%xmm7\n\t"
- " mulps %%xmm5, %%xmm3\n\t"
- " add $32, %%r9\n\t"
- " movaps 32(%%r10), %%xmm2\n\t"
- " addps %%xmm3, %%xmm7\n\t"
- " add $32, %%r10\n\t"
- " xorps %%xmm8, %%xmm2\n\t"
- ".%=L1_test:\n\t"
- " dec %%rax\n\t"
- " jge .%=Loop1\n\t"
- " # We've handled the bulk of multiplies up to here.\n\t"
- " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
- " # If so, we've got 2 more taps to do.\n\t"
- " and $1, %%r8\n\t"
- " je .%=Leven\n\t"
- " # The count was odd, do 2 more taps.\n\t"
- " # Note that we've already got mm0/mm2 preloaded\n\t"
- " # from the main loop.\n\t"
- " movaps %%xmm0, %%xmm4\n\t"
- " mulps %%xmm2, %%xmm0\n\t"
- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
- " addps %%xmm0, %%xmm6\n\t"
- " mulps %%xmm4, %%xmm2\n\t"
- " addps %%xmm2, %%xmm7\n\t"
- ".%=Leven:\n\t"
- " # neg inversor\n\t"
- " xorps %%xmm1, %%xmm1\n\t"
- " mov $0x80000000, %%r9\n\t"
- " movd %%r9, %%xmm1\n\t"
- " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
- " # pfpnacc\n\t"
- " xorps %%xmm1, %%xmm6\n\t"
- " movaps %%xmm6, %%xmm2\n\t"
- " unpcklps %%xmm7, %%xmm6\n\t"
- " unpckhps %%xmm7, %%xmm2\n\t"
- " movaps %%xmm2, %%xmm3\n\t"
- " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
- " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
- " addps %%xmm2, %%xmm6\n\t"
- " # xmm6 = r1 i2 r3 i4\n\t"
- " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
- " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
- " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
- :
- :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator)
- :"rax", "r8", "r9", "r10"
- );
-
- int getem = num_bytes % 16;
-
- for(; getem > 0; getem -= 8) {
- *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]));
- }
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ const unsigned int num_bytes = num_points * 8;
+
+ __VOLK_ATTR_ALIGNED(16)
+ static const uint32_t conjugator[4] = {
+ 0x00000000, 0x80000000, 0x00000000, 0x80000000
+ };
+
+ __VOLK_ASM __VOLK_VOLATILE(
+ "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t"
+ "# const float *taps, unsigned num_bytes)\n\t"
+ "# float sum0 = 0;\n\t"
+ "# float sum1 = 0;\n\t"
+ "# float sum2 = 0;\n\t"
+ "# float sum3 = 0;\n\t"
+ "# do {\n\t"
+ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+ "# input += 4;\n\t"
+ "# taps += 4; \n\t"
+ "# } while (--n_2_ccomplex_blocks != 0);\n\t"
+ "# result[0] = sum0 + sum2;\n\t"
+ "# result[1] = sum1 + sum3;\n\t"
+ "# TODO: prefetch and better scheduling\n\t"
+ " xor %%r9, %%r9\n\t"
+ " xor %%r10, %%r10\n\t"
+ " movq %[conjugator], %%r9\n\t"
+ " movq %%rcx, %%rax\n\t"
+ " movaps 0(%%r9), %%xmm8\n\t"
+ " movq %%rcx, %%r8\n\t"
+ " movq %[rsi], %%r9\n\t"
+ " movq %[rdx], %%r10\n\t"
+ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+ " movaps 0(%%r9), %%xmm0\n\t"
+ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+ " movups 0(%%r10), %%xmm2\n\t"
+ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
+ " shr $4, %%r8\n\t"
+ " xorps %%xmm8, %%xmm2\n\t"
+ " jmp .%=L1_test\n\t"
+ " # 4 taps / loop\n\t"
+ " # something like ?? cycles / loop\n\t"
+ ".%=Loop1: \n\t"
+ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+ "# movaps (%%r9), %%xmmA\n\t"
+ "# movaps (%%r10), %%xmmB\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+ "# mulps %%xmmB, %%xmmA\n\t"
+ "# mulps %%xmmZ, %%xmmB\n\t"
+ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+ "# xorps %%xmmPN, %%xmmA\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# unpcklps %%xmmB, %%xmmA\n\t"
+ "# unpckhps %%xmmB, %%xmmZ\n\t"
+ "# movaps %%xmmZ, %%xmmY\n\t"
+ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+ "# addps %%xmmZ, %%xmmA\n\t"
+ "# addps %%xmmA, %%xmmC\n\t"
+ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+ " movaps 16(%%r9), %%xmm1\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " movaps 16(%%r10), %%xmm3\n\t"
+ " movaps %%xmm1, %%xmm5\n\t"
+ " xorps %%xmm8, %%xmm3\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm3, %%xmm1\n\t"
+ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+ " addps %%xmm1, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " movaps 32(%%r9), %%xmm0\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ " mulps %%xmm5, %%xmm3\n\t"
+ " add $32, %%r9\n\t"
+ " movaps 32(%%r10), %%xmm2\n\t"
+ " addps %%xmm3, %%xmm7\n\t"
+ " add $32, %%r10\n\t"
+ " xorps %%xmm8, %%xmm2\n\t"
+ ".%=L1_test:\n\t"
+ " dec %%rax\n\t"
+ " jge .%=Loop1\n\t"
+ " # We've handled the bulk of multiplies up to here.\n\t"
+ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+ " # If so, we've got 2 more taps to do.\n\t"
+ " and $1, %%r8\n\t"
+ " je .%=Leven\n\t"
+ " # The count was odd, do 2 more taps.\n\t"
+ " # Note that we've already got mm0/mm2 preloaded\n\t"
+ " # from the main loop.\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ ".%=Leven:\n\t"
+ " # neg inversor\n\t"
+ " xorps %%xmm1, %%xmm1\n\t"
+ " mov $0x80000000, %%r9\n\t"
+ " movd %%r9, %%xmm1\n\t"
+ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+ " # pfpnacc\n\t"
+ " xorps %%xmm1, %%xmm6\n\t"
+ " movaps %%xmm6, %%xmm2\n\t"
+ " unpcklps %%xmm7, %%xmm6\n\t"
+ " unpckhps %%xmm7, %%xmm2\n\t"
+ " movaps %%xmm2, %%xmm3\n\t"
+ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+ " addps %%xmm2, %%xmm6\n\t"
+ " # xmm6 = r1 i2 r3 i4\n\t"
+ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) "
+ "to memory\n\t"
+ :
+ : [rsi] "r"(input),
+ [rdx] "r"(taps),
+ "c"(num_bytes),
+ [rdi] "r"(result),
+ [conjugator] "r"(conjugator)
+ : "rax", "r8", "r9", "r10");
+
+ int getem = num_bytes % 16;
+
+ for (; getem > 0; getem -= 8) {
+ *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]));
+ }
}
#endif
#if LV_HAVE_SSE && LV_HAVE_32
-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
-
- const unsigned int num_bytes = num_points*8;
-
- __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
-
- int bound = num_bytes >> 4;
- int leftovers = num_bytes % 16;
-
- __VOLK_ASM __VOLK_VOLATILE
- (
- " #pushl %%ebp\n\t"
- " #movl %%esp, %%ebp\n\t"
- " #movl 12(%%ebp), %%eax # input\n\t"
- " #movl 16(%%ebp), %%edx # taps\n\t"
- " #movl 20(%%ebp), %%ecx # n_bytes\n\t"
- " movaps 0(%[conjugator]), %%xmm1\n\t"
- " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
- " movaps 0(%[eax]), %%xmm0\n\t"
- " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
- " movaps 0(%[edx]), %%xmm2\n\t"
- " movl %[ecx], (%[out])\n\t"
- " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t"
-
- " xorps %%xmm1, %%xmm2\n\t"
- " jmp .%=L1_test\n\t"
- " # 4 taps / loop\n\t"
- " # something like ?? cycles / loop\n\t"
- ".%=Loop1: \n\t"
- "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
- "# movaps (%[eax]), %%xmmA\n\t"
- "# movaps (%[edx]), %%xmmB\n\t"
- "# movaps %%xmmA, %%xmmZ\n\t"
- "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
- "# mulps %%xmmB, %%xmmA\n\t"
- "# mulps %%xmmZ, %%xmmB\n\t"
- "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
- "# xorps %%xmmPN, %%xmmA\n\t"
- "# movaps %%xmmA, %%xmmZ\n\t"
- "# unpcklps %%xmmB, %%xmmA\n\t"
- "# unpckhps %%xmmB, %%xmmZ\n\t"
- "# movaps %%xmmZ, %%xmmY\n\t"
- "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
- "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
- "# addps %%xmmZ, %%xmmA\n\t"
- "# addps %%xmmA, %%xmmC\n\t"
- "# A=xmm0, B=xmm2, Z=xmm4\n\t"
- "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
- " movaps 16(%[edx]), %%xmm3\n\t"
- " movaps %%xmm0, %%xmm4\n\t"
- " xorps %%xmm1, %%xmm3\n\t"
- " mulps %%xmm2, %%xmm0\n\t"
- " movaps 16(%[eax]), %%xmm1\n\t"
- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
- " movaps %%xmm1, %%xmm5\n\t"
- " addps %%xmm0, %%xmm6\n\t"
- " mulps %%xmm3, %%xmm1\n\t"
- " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
- " addps %%xmm1, %%xmm6\n\t"
- " movaps 0(%[conjugator]), %%xmm1\n\t"
- " mulps %%xmm4, %%xmm2\n\t"
- " movaps 32(%[eax]), %%xmm0\n\t"
- " addps %%xmm2, %%xmm7\n\t"
- " mulps %%xmm5, %%xmm3\n\t"
- " addl $32, %[eax]\n\t"
- " movaps 32(%[edx]), %%xmm2\n\t"
- " addps %%xmm3, %%xmm7\n\t"
- " xorps %%xmm1, %%xmm2\n\t"
- " addl $32, %[edx]\n\t"
- ".%=L1_test:\n\t"
- " decl %[ecx]\n\t"
- " jge .%=Loop1\n\t"
- " # We've handled the bulk of multiplies up to here.\n\t"
- " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
- " # If so, we've got 2 more taps to do.\n\t"
- " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t"
- " shrl $4, %[ecx]\n\t"
- " andl $1, %[ecx]\n\t"
- " je .%=Leven\n\t"
- " # The count was odd, do 2 more taps.\n\t"
- " # Note that we've already got mm0/mm2 preloaded\n\t"
- " # from the main loop.\n\t"
- " movaps %%xmm0, %%xmm4\n\t"
- " mulps %%xmm2, %%xmm0\n\t"
- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
- " addps %%xmm0, %%xmm6\n\t"
- " mulps %%xmm4, %%xmm2\n\t"
- " addps %%xmm2, %%xmm7\n\t"
- ".%=Leven:\n\t"
- " # neg inversor\n\t"
- " #movl 8(%%ebp), %[eax] \n\t"
- " xorps %%xmm1, %%xmm1\n\t"
- " movl $0x80000000, (%[out])\n\t"
- " movss (%[out]), %%xmm1\n\t"
- " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
- " # pfpnacc\n\t"
- " xorps %%xmm1, %%xmm6\n\t"
- " movaps %%xmm6, %%xmm2\n\t"
- " unpcklps %%xmm7, %%xmm6\n\t"
- " unpckhps %%xmm7, %%xmm2\n\t"
- " movaps %%xmm2, %%xmm3\n\t"
- " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
- " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
- " addps %%xmm2, %%xmm6\n\t"
- " # xmm6 = r1 i2 r3 i4\n\t"
- " #movl 8(%%ebp), %[eax] # @result\n\t"
- " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
- " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
- " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) to memory\n\t"
- " #popl %%ebp\n\t"
- :
- : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator)
- );
-
- for(; leftovers > 0; leftovers -= 8) {
- *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)]));
- }
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ const unsigned int num_bytes = num_points * 8;
+
+ __VOLK_ATTR_ALIGNED(16)
+ static const uint32_t conjugator[4] = {
+ 0x00000000, 0x80000000, 0x00000000, 0x80000000
+ };
+
+ int bound = num_bytes >> 4;
+ int leftovers = num_bytes % 16;
+
+ __VOLK_ASM __VOLK_VOLATILE(
+ " #pushl %%ebp\n\t"
+ " #movl %%esp, %%ebp\n\t"
+ " #movl 12(%%ebp), %%eax # input\n\t"
+ " #movl 16(%%ebp), %%edx # taps\n\t"
+ " #movl 20(%%ebp), %%ecx # n_bytes\n\t"
+ " movaps 0(%[conjugator]), %%xmm1\n\t"
+ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+ " movaps 0(%[eax]), %%xmm0\n\t"
+ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+ " movaps 0(%[edx]), %%xmm2\n\t"
+ " movl %[ecx], (%[out])\n\t"
+ " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t"
+
+ " xorps %%xmm1, %%xmm2\n\t"
+ " jmp .%=L1_test\n\t"
+ " # 4 taps / loop\n\t"
+ " # something like ?? cycles / loop\n\t"
+ ".%=Loop1: \n\t"
+ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+ "# movaps (%[eax]), %%xmmA\n\t"
+ "# movaps (%[edx]), %%xmmB\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+ "# mulps %%xmmB, %%xmmA\n\t"
+ "# mulps %%xmmZ, %%xmmB\n\t"
+ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+ "# xorps %%xmmPN, %%xmmA\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# unpcklps %%xmmB, %%xmmA\n\t"
+ "# unpckhps %%xmmB, %%xmmZ\n\t"
+ "# movaps %%xmmZ, %%xmmY\n\t"
+ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+ "# addps %%xmmZ, %%xmmA\n\t"
+ "# addps %%xmmA, %%xmmC\n\t"
+ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+ " movaps 16(%[edx]), %%xmm3\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " xorps %%xmm1, %%xmm3\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " movaps 16(%[eax]), %%xmm1\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " movaps %%xmm1, %%xmm5\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm3, %%xmm1\n\t"
+ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+ " addps %%xmm1, %%xmm6\n\t"
+ " movaps 0(%[conjugator]), %%xmm1\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " movaps 32(%[eax]), %%xmm0\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ " mulps %%xmm5, %%xmm3\n\t"
+ " addl $32, %[eax]\n\t"
+ " movaps 32(%[edx]), %%xmm2\n\t"
+ " addps %%xmm3, %%xmm7\n\t"
+ " xorps %%xmm1, %%xmm2\n\t"
+ " addl $32, %[edx]\n\t"
+ ".%=L1_test:\n\t"
+ " decl %[ecx]\n\t"
+ " jge .%=Loop1\n\t"
+ " # We've handled the bulk of multiplies up to here.\n\t"
+ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+ " # If so, we've got 2 more taps to do.\n\t"
+ " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t"
+ " shrl $4, %[ecx]\n\t"
+ " andl $1, %[ecx]\n\t"
+ " je .%=Leven\n\t"
+ " # The count was odd, do 2 more taps.\n\t"
+ " # Note that we've already got mm0/mm2 preloaded\n\t"
+ " # from the main loop.\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ ".%=Leven:\n\t"
+ " # neg inversor\n\t"
+ " #movl 8(%%ebp), %[eax] \n\t"
+ " xorps %%xmm1, %%xmm1\n\t"
+ " movl $0x80000000, (%[out])\n\t"
+ " movss (%[out]), %%xmm1\n\t"
+ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+ " # pfpnacc\n\t"
+ " xorps %%xmm1, %%xmm6\n\t"
+ " movaps %%xmm6, %%xmm2\n\t"
+ " unpcklps %%xmm7, %%xmm6\n\t"
+ " unpckhps %%xmm7, %%xmm2\n\t"
+ " movaps %%xmm2, %%xmm3\n\t"
+ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+ " addps %%xmm2, %%xmm6\n\t"
+ " # xmm6 = r1 i2 r3 i4\n\t"
+ " #movl 8(%%ebp), %[eax] # @result\n\t"
+ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+ " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) "
+ "to memory\n\t"
+ " #popl %%ebp\n\t"
+ :
+ : [eax] "r"(input),
+ [edx] "r"(taps),
+ [ecx] "r"(num_bytes),
+ [out] "r"(result),
+ [conjugator] "r"(conjugator));
+
+ for (; leftovers > 0; leftovers -= 8) {
+ *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)]));
+ }
}
#endif /*LV_HAVE_SSE*/
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32fc_x2_divide_32fc(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector, const lv_32fc_t* denumeratorVector, unsigned int num_points);
- * \endcode
+ * void volk_32fc_x2_divide_32fc(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
+ * const lv_32fc_t* denumeratorVector, unsigned int num_points); \endcode
*
* \b Inputs
* \li numeratorVector: The numerator complex values.
* \li outputVector: The output vector complex floats.
*
* \b Example
- * divide a complex vector by itself, demonstrating the result should be pretty close to 1+0j.
+ * divide a complex vector by itself, demonstrating that the result should be close
+ * to 1+0j.
*
* \code
* int N = 10;
#ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H
#define INCLUDED_volk_32fc_x2_divide_32fc_u_H
+#include <float.h>
#include <inttypes.h>
#include <volk/volk_complex.h>
-#include <float.h>
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>
-static inline void
-volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
- const lv_32fc_t* denumeratorVector, unsigned int num_points)
+static inline void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector,
+ const lv_32fc_t* numeratorVector,
+ const lv_32fc_t* denumeratorVector,
+ unsigned int num_points)
{
/*
* we'll do the "classical"
* --- = -------
* b |b|^2
* */
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- __m128 num01, num23, den01, den23, norm, result;
- lv_32fc_t* c = cVector;
- const lv_32fc_t* a = numeratorVector;
- const lv_32fc_t* b = denumeratorVector;
-
- for(; number < quarterPoints; number++){
- num01 = _mm_loadu_ps((float*) a); // first pair
- den01 = _mm_loadu_ps((float*) b); // first pair
- num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
- a += 2;
- b += 2;
-
- num23 = _mm_loadu_ps((float*) a); // second pair
- den23 = _mm_loadu_ps((float*) b); // second pair
- num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
- a += 2;
- b += 2;
-
- norm = _mm_magnitudesquared_ps_sse3(den01, den23);
- den01 = _mm_unpacklo_ps(norm,norm);
- den23 = _mm_unpackhi_ps(norm,norm);
-
- result = _mm_div_ps(num01, den01);
- _mm_storeu_ps((float*) c, result); // Store the results back into the C container
- c += 2;
- result = _mm_div_ps(num23, den23);
- _mm_storeu_ps((float*) c, result); // Store the results back into the C container
- c += 2;
- }
-
- number *= 4;
- for(;number < num_points; number++){
- *c = (*a) / (*b);
- a++; b++; c++;
- }
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 num01, num23, den01, den23, norm, result;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = numeratorVector;
+ const lv_32fc_t* b = denumeratorVector;
+
+ for (; number < quarterPoints; number++) {
+ num01 = _mm_loadu_ps((float*)a); // first pair
+ den01 = _mm_loadu_ps((float*)b); // first pair
+ num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
+ a += 2;
+ b += 2;
+
+ num23 = _mm_loadu_ps((float*)a); // second pair
+ den23 = _mm_loadu_ps((float*)b); // second pair
+ num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
+ a += 2;
+ b += 2;
+
+ norm = _mm_magnitudesquared_ps_sse3(den01, den23);
+ den01 = _mm_unpacklo_ps(norm, norm);
+ den23 = _mm_unpackhi_ps(norm, norm);
+
+ result = _mm_div_ps(num01, den01);
+ _mm_storeu_ps((float*)c, result); // Store the results back into the C container
+ c += 2;
+ result = _mm_div_ps(num23, den23);
+ _mm_storeu_ps((float*)c, result); // Store the results back into the C container
+ c += 2;
+ }
+
+ number *= 4;
+ for (; number < num_points; number++) {
+ *c = (*a) / (*b);
+ a++;
+ b++;
+ c++;
+ }
}
#endif /* LV_HAVE_SSE3 */
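The SSE3 kernel above relies on the "classical" rewrite a/b = a*conj(b)/|b|^2; a scalar sketch of that identity (illustrative name, not part of VOLK) is:

#include <volk/volk_complex.h>

/* Scalar sketch of the classical complex division used above: a / b = a*conj(b) / |b|^2. */
static inline lv_32fc_t divide_via_conjugate(lv_32fc_t a, lv_32fc_t b)
{
    const float mag_sq = lv_creal(b) * lv_creal(b) + lv_cimag(b) * lv_cimag(b); /* |b|^2 */
    return lv_cmake((lv_creal(a) * lv_creal(b) + lv_cimag(a) * lv_cimag(b)) / mag_sq,
                    (lv_cimag(a) * lv_creal(b) - lv_creal(a) * lv_cimag(b)) / mag_sq);
}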
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>
-static inline void
-volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
- const lv_32fc_t* denumeratorVector, unsigned int num_points)
+static inline void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* numeratorVector,
+ const lv_32fc_t* denumeratorVector,
+ unsigned int num_points)
{
/*
* we'll do the "classical"
const lv_32fc_t* a = numeratorVector;
const lv_32fc_t* b = denumeratorVector;
- for(; number < quarterPoints; number++){
- num = _mm256_loadu_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
- denum = _mm256_loadu_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+ for (; number < quarterPoints; number++) {
+ num = _mm256_loadu_ps(
+ (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+ denum = _mm256_loadu_ps(
+ (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
mul_conj = _mm256_complexconjugatemul_ps(num, denum);
sq = _mm256_mul_ps(denum, denum); // Square the values
- mag_sq_un = _mm256_hadd_ps(sq,sq); // obtain the actual squared magnitude, although out of order
+ mag_sq_un = _mm256_hadd_ps(
+ sq, sq); // obtain the actual squared magnitude, although out of order
mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them
- // best guide I found on using these functions: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
- div = _mm256_div_ps(mul_conj,mag_sq);
+ // best guide I found on using these functions:
+ // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
+ div = _mm256_div_ps(mul_conj, mag_sq);
- _mm256_storeu_ps((float*) c, div); // Store the results back into the C container
+ _mm256_storeu_ps((float*)c, div); // Store the results back into the C container
a += 4;
b += 4;
number = quarterPoints * 4;
- for(; number < num_points; number++){
+ for (; number < num_points; number++) {
*c++ = (*a++) / (*b++);
}
-
}
#endif /* LV_HAVE_AVX */
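In the AVX kernel, _mm256_hadd_ps on the squared denominator followed by _mm256_permute_ps(..., 0xd8) leaves each |b|^2 duplicated so it lines up with the real and imaginary lanes of a*conj(b). A scalar view of that layout (names are illustrative) is:

/* Scalar view of the mag_sq layout built by the hadd/permute pair above:
 * each |b|^2 is written twice so it divides both the real and imaginary lane. */
static void magnitude_squared_duplicated(const float* b_interleaved, /* re,im,re,im,... */
                                         float* mag_sq_out,          /* same length */
                                         unsigned int num_complex)
{
    for (unsigned int k = 0; k < num_complex; ++k) {
        const float re = b_interleaved[2 * k];
        const float im = b_interleaved[2 * k + 1];
        const float mag_sq = re * re + im * im;
        mag_sq_out[2 * k] = mag_sq;     /* divides the real lane */
        mag_sq_out[2 * k + 1] = mag_sq; /* divides the imaginary lane */
    }
}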
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
{
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
- const lv_32fc_t* bPtr= bVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *cPtr++ = (*aPtr++) / (*bPtr++);
- }
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) / (*bPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
-
#endif /* INCLUDED_volk_32fc_x2_divide_32fc_u_H */
#ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H
#define INCLUDED_volk_32fc_x2_divide_32fc_a_H
+#include <float.h>
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>
-#include <float.h>
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>
-static inline void
-volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
- const lv_32fc_t* denumeratorVector, unsigned int num_points)
+static inline void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector,
+ const lv_32fc_t* numeratorVector,
+ const lv_32fc_t* denumeratorVector,
+ unsigned int num_points)
{
/*
* we'll do the "classical"
* --- = -------
* b |b|^2
* */
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- __m128 num01, num23, den01, den23, norm, result;
- lv_32fc_t* c = cVector;
- const lv_32fc_t* a = numeratorVector;
- const lv_32fc_t* b = denumeratorVector;
-
- for(; number < quarterPoints; number++){
- num01 = _mm_load_ps((float*) a); // first pair
- den01 = _mm_load_ps((float*) b); // first pair
- num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
- a += 2;
- b += 2;
-
- num23 = _mm_load_ps((float*) a); // second pair
- den23 = _mm_load_ps((float*) b); // second pair
- num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
- a += 2;
- b += 2;
-
- norm = _mm_magnitudesquared_ps_sse3(den01, den23);
-
- den01 = _mm_unpacklo_ps(norm,norm); // select the lower floats twice
- den23 = _mm_unpackhi_ps(norm,norm); // select the upper floats twice
-
- result = _mm_div_ps(num01, den01);
- _mm_store_ps((float*) c, result); // Store the results back into the C container
- c += 2;
- result = _mm_div_ps(num23, den23);
- _mm_store_ps((float*) c, result); // Store the results back into the C container
- c += 2;
- }
-
- number *= 4;
- for(;number < num_points; number++){
- *c = (*a) / (*b);
- a++; b++; c++;
- }
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 num01, num23, den01, den23, norm, result;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = numeratorVector;
+ const lv_32fc_t* b = denumeratorVector;
+
+ for (; number < quarterPoints; number++) {
+ num01 = _mm_load_ps((float*)a); // first pair
+ den01 = _mm_load_ps((float*)b); // first pair
+ num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
+ a += 2;
+ b += 2;
+
+ num23 = _mm_load_ps((float*)a); // second pair
+ den23 = _mm_load_ps((float*)b); // second pair
+ num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
+ a += 2;
+ b += 2;
+
+ norm = _mm_magnitudesquared_ps_sse3(den01, den23);
+
+ den01 = _mm_unpacklo_ps(norm, norm); // select the lower floats twice
+ den23 = _mm_unpackhi_ps(norm, norm); // select the upper floats twice
+
+ result = _mm_div_ps(num01, den01);
+ _mm_store_ps((float*)c, result); // Store the results back into the C container
+ c += 2;
+ result = _mm_div_ps(num23, den23);
+ _mm_store_ps((float*)c, result); // Store the results back into the C container
+ c += 2;
+ }
+
+ number *= 4;
+ for (; number < num_points; number++) {
+ *c = (*a) / (*b);
+ a++;
+ b++;
+ c++;
+ }
}
#endif /* LV_HAVE_SSE */
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>
-static inline void
-volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
- const lv_32fc_t* denumeratorVector, unsigned int num_points)
+static inline void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* numeratorVector,
+ const lv_32fc_t* denumeratorVector,
+ unsigned int num_points)
{
/*
* we'll do the "classical"
const lv_32fc_t* a = numeratorVector;
const lv_32fc_t* b = denumeratorVector;
- for(; number < quarterPoints; number++){
- num = _mm256_load_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
- denum = _mm256_load_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+ for (; number < quarterPoints; number++) {
+ num =
+ _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+ denum =
+ _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
mul_conj = _mm256_complexconjugatemul_ps(num, denum);
sq = _mm256_mul_ps(denum, denum); // Square the values
- mag_sq_un = _mm256_hadd_ps(sq,sq); // obtain the actual squared magnitude, although out of order
+ mag_sq_un = _mm256_hadd_ps(
+ sq, sq); // obtain the actual squared magnitude, although out of order
mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them
- // best guide I found on using these functions: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
- div = _mm256_div_ps(mul_conj,mag_sq);
+ // best guide I found on using these functions:
+ // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
+ div = _mm256_div_ps(mul_conj, mag_sq);
- _mm256_store_ps((float*) c, div); // Store the results back into the C container
+ _mm256_store_ps((float*)c, div); // Store the results back into the C container
a += 4;
b += 4;
number = quarterPoints * 4;
- for(; number < num_points; number++){
+ for (; number < num_points; number++) {
*c++ = (*a++) / (*b++);
}
-
-
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
{
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
- const lv_32fc_t* bPtr = bVector;
-
- float32x4x2_t aVal, bVal, cVal;
- float32x4_t bAbs, bAbsInv;
-
- const unsigned int quarterPoints = num_points / 4;
- unsigned int number = 0;
- for(; number < quarterPoints; number++){
- aVal = vld2q_f32((const float*)(aPtr));
- bVal = vld2q_f32((const float*)(bPtr));
- aPtr += 4;
- bPtr += 4;
- __VOLK_PREFETCH(aPtr+4);
- __VOLK_PREFETCH(bPtr+4);
-
- bAbs = vmulq_f32( bVal.val[0], bVal.val[0]);
- bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]);
-
- bAbsInv = vrecpeq_f32(bAbs);
- bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
- bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
-
- cVal.val[0] = vmulq_f32( aVal.val[0], bVal.val[0]);
- cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]);
- cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv);
-
- cVal.val[1] = vmulq_f32( aVal.val[1], bVal.val[0]);
- cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]);
- cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv);
-
- vst2q_f32((float*)(cPtr), cVal);
- cPtr += 4;
- }
-
- for(number = quarterPoints * 4; number < num_points; number++){
- *cPtr++ = (*aPtr++) / (*bPtr++);
- }
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr = bVector;
+
+ float32x4x2_t aVal, bVal, cVal;
+ float32x4_t bAbs, bAbsInv;
+
+ const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ for (; number < quarterPoints; number++) {
+ aVal = vld2q_f32((const float*)(aPtr));
+ bVal = vld2q_f32((const float*)(bPtr));
+ aPtr += 4;
+ bPtr += 4;
+ __VOLK_PREFETCH(aPtr + 4);
+ __VOLK_PREFETCH(bPtr + 4);
+
+ bAbs = vmulq_f32(bVal.val[0], bVal.val[0]);
+ bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]);
+
+ bAbsInv = vrecpeq_f32(bAbs);
+ bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
+ bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
+
+ cVal.val[0] = vmulq_f32(aVal.val[0], bVal.val[0]);
+ cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]);
+ cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv);
+
+ cVal.val[1] = vmulq_f32(aVal.val[1], bVal.val[0]);
+ cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]);
+ cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv);
+
+ vst2q_f32((float*)(cPtr), cVal);
+ cPtr += 4;
+ }
+
+ for (number = quarterPoints * 4; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) / (*bPtr++);
+ }
}
#endif /* LV_HAVE_NEON */
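The NEON kernel avoids a hardware divide by refining the rough estimate from vrecpeq_f32 with two vrecpsq_f32 steps; since vrecpsq_f32(x, d) returns 2 - d*x, each step is one Newton-Raphson iteration x <- x*(2 - d*x). A scalar sketch of the refinement (illustrative name):

/* Scalar sketch of the reciprocal refinement used in the NEON kernel above.
 * Two Newton-Raphson steps are typically enough to reach single precision. */
static inline float refine_reciprocal(float estimate, float d)
{
    estimate = estimate * (2.0f - d * estimate); /* first refinement step  */
    estimate = estimate * (2.0f - d * estimate); /* second refinement step */
    return estimate;                             /* approximately 1.0f / d */
}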
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32fc_x2_divide_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_divide_32fc_a_generic(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
{
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
- const lv_32fc_t* bPtr= bVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *cPtr++ = (*aPtr++) / (*bPtr++);
- }
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) / (*bPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32fc_x2_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points)
- * \endcode
+ * void volk_32fc_x2_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const
+ * lv_32fc_t* taps, unsigned int num_points) \endcode
*
* \b Inputs
* \li input: vector of complex floats.
#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
#define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
-#include <volk/volk_common.h>
-#include <volk/volk_complex.h>
#include <stdio.h>
#include <string.h>
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
#ifdef LV_HAVE_GENERIC
-static inline void volk_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
- float * res = (float*) result;
- float * in = (float*) input;
- float * tp = (float*) taps;
- unsigned int n_2_ccomplex_blocks = num_points/2;
+ float* res = (float*)result;
+ float* in = (float*)input;
+ float* tp = (float*)taps;
+ unsigned int n_2_ccomplex_blocks = num_points / 2;
- float sum0[2] = {0,0};
- float sum1[2] = {0,0};
- unsigned int i = 0;
+ float sum0[2] = { 0, 0 };
+ float sum1[2] = { 0, 0 };
+ unsigned int i = 0;
- for(i = 0; i < n_2_ccomplex_blocks; ++i) {
- sum0[0] += in[0] * tp[0] - in[1] * tp[1];
- sum0[1] += in[0] * tp[1] + in[1] * tp[0];
- sum1[0] += in[2] * tp[2] - in[3] * tp[3];
- sum1[1] += in[2] * tp[3] + in[3] * tp[2];
+ for (i = 0; i < n_2_ccomplex_blocks; ++i) {
+ sum0[0] += in[0] * tp[0] - in[1] * tp[1];
+ sum0[1] += in[0] * tp[1] + in[1] * tp[0];
+ sum1[0] += in[2] * tp[2] - in[3] * tp[3];
+ sum1[1] += in[2] * tp[3] + in[3] * tp[2];
- in += 4;
- tp += 4;
- }
+ in += 4;
+ tp += 4;
+ }
- res[0] = sum0[0] + sum1[0];
- res[1] = sum0[1] + sum1[1];
+ res[0] = sum0[0] + sum1[0];
+ res[1] = sum0[1] + sum1[1];
- // Cleanup if we had an odd number of points
- if (num_points & 1) {
- *result += input[num_points - 1] * taps[num_points - 1];
- }
+ // Cleanup if we had an odd number of points
+ if (num_points & 1) {
+ *result += input[num_points - 1] * taps[num_points - 1];
+ }
}
#endif /*LV_HAVE_GENERIC*/
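The generic kernel above keeps two interleaved partial sums (sum0/sum1) and handles an odd trailing point separately; a straightforward single-accumulator reference that should produce the same value (name illustrative, kept only as a cross-check) is:

#include <volk/volk_complex.h>

/* Naive reference for the dot product: result = sum(input[i] * taps[i]). */
static inline void dot_prod_32fc_reference(lv_32fc_t* result,
                                           const lv_32fc_t* input,
                                           const lv_32fc_t* taps,
                                           unsigned int num_points)
{
    lv_32fc_t acc = lv_cmake(0.0f, 0.0f);
    for (unsigned int i = 0; i < num_points; ++i) {
        acc += input[i] * taps[i]; /* complex multiply-accumulate, no unrolling */
    }
    *result = acc;
}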
-
#if LV_HAVE_SSE && LV_HAVE_64
-static inline void volk_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
-
- const unsigned int num_bytes = num_points*8;
- unsigned int isodd = num_points & 1;
-
- __VOLK_ASM
- (
- "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
- "# const float *taps, unsigned num_bytes)\n\t"
- "# float sum0 = 0;\n\t"
- "# float sum1 = 0;\n\t"
- "# float sum2 = 0;\n\t"
- "# float sum3 = 0;\n\t"
- "# do {\n\t"
- "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
- "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
- "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
- "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
- "# input += 4;\n\t"
- "# taps += 4; \n\t"
- "# } while (--n_2_ccomplex_blocks != 0);\n\t"
- "# result[0] = sum0 + sum2;\n\t"
- "# result[1] = sum1 + sum3;\n\t"
- "# TODO: prefetch and better scheduling\n\t"
- " xor %%r9, %%r9\n\t"
- " xor %%r10, %%r10\n\t"
- " movq %%rcx, %%rax\n\t"
- " movq %%rcx, %%r8\n\t"
- " movq %[rsi], %%r9\n\t"
- " movq %[rdx], %%r10\n\t"
- " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
- " movups 0(%%r9), %%xmm0\n\t"
- " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
- " movups 0(%%r10), %%xmm2\n\t"
- " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
- " shr $4, %%r8\n\t"
- " jmp .%=L1_test\n\t"
- " # 4 taps / loop\n\t"
- " # something like ?? cycles / loop\n\t"
- ".%=Loop1: \n\t"
- "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
- "# movups (%%r9), %%xmmA\n\t"
- "# movups (%%r10), %%xmmB\n\t"
- "# movups %%xmmA, %%xmmZ\n\t"
- "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
- "# mulps %%xmmB, %%xmmA\n\t"
- "# mulps %%xmmZ, %%xmmB\n\t"
- "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
- "# xorps %%xmmPN, %%xmmA\n\t"
- "# movups %%xmmA, %%xmmZ\n\t"
- "# unpcklps %%xmmB, %%xmmA\n\t"
- "# unpckhps %%xmmB, %%xmmZ\n\t"
- "# movups %%xmmZ, %%xmmY\n\t"
- "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
- "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
- "# addps %%xmmZ, %%xmmA\n\t"
- "# addps %%xmmA, %%xmmC\n\t"
- "# A=xmm0, B=xmm2, Z=xmm4\n\t"
- "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
- " movups 16(%%r9), %%xmm1\n\t"
- " movups %%xmm0, %%xmm4\n\t"
- " mulps %%xmm2, %%xmm0\n\t"
- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
- " movups 16(%%r10), %%xmm3\n\t"
- " movups %%xmm1, %%xmm5\n\t"
- " addps %%xmm0, %%xmm6\n\t"
- " mulps %%xmm3, %%xmm1\n\t"
- " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
- " addps %%xmm1, %%xmm6\n\t"
- " mulps %%xmm4, %%xmm2\n\t"
- " movups 32(%%r9), %%xmm0\n\t"
- " addps %%xmm2, %%xmm7\n\t"
- " mulps %%xmm5, %%xmm3\n\t"
- " add $32, %%r9\n\t"
- " movups 32(%%r10), %%xmm2\n\t"
- " addps %%xmm3, %%xmm7\n\t"
- " add $32, %%r10\n\t"
- ".%=L1_test:\n\t"
- " dec %%rax\n\t"
- " jge .%=Loop1\n\t"
- " # We've handled the bulk of multiplies up to here.\n\t"
- " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
- " # If so, we've got 2 more taps to do.\n\t"
- " and $1, %%r8\n\t"
- " je .%=Leven\n\t"
- " # The count was odd, do 2 more taps.\n\t"
- " # Note that we've already got mm0/mm2 preloaded\n\t"
- " # from the main loop.\n\t"
- " movups %%xmm0, %%xmm4\n\t"
- " mulps %%xmm2, %%xmm0\n\t"
- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
- " addps %%xmm0, %%xmm6\n\t"
- " mulps %%xmm4, %%xmm2\n\t"
- " addps %%xmm2, %%xmm7\n\t"
- ".%=Leven:\n\t"
- " # neg inversor\n\t"
- " xorps %%xmm1, %%xmm1\n\t"
- " mov $0x80000000, %%r9\n\t"
- " movd %%r9, %%xmm1\n\t"
- " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
- " # pfpnacc\n\t"
- " xorps %%xmm1, %%xmm6\n\t"
- " movups %%xmm6, %%xmm2\n\t"
- " unpcklps %%xmm7, %%xmm6\n\t"
- " unpckhps %%xmm7, %%xmm2\n\t"
- " movups %%xmm2, %%xmm3\n\t"
- " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
- " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
- " addps %%xmm2, %%xmm6\n\t"
- " # xmm6 = r1 i2 r3 i4\n\t"
- " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
- " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
- " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
- :
- :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
- :"rax", "r8", "r9", "r10"
- );
-
-
- if(isodd) {
- *result += input[num_points - 1] * taps[num_points - 1];
- }
-
- return;
+static inline void volk_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ const unsigned int num_bytes = num_points * 8;
+ unsigned int isodd = num_points & 1;
+
+ __VOLK_ASM(
+ "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
+ "# const float *taps, unsigned num_bytes)\n\t"
+ "# float sum0 = 0;\n\t"
+ "# float sum1 = 0;\n\t"
+ "# float sum2 = 0;\n\t"
+ "# float sum3 = 0;\n\t"
+ "# do {\n\t"
+ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+ "# input += 4;\n\t"
+ "# taps += 4; \n\t"
+ "# } while (--n_2_ccomplex_blocks != 0);\n\t"
+ "# result[0] = sum0 + sum2;\n\t"
+ "# result[1] = sum1 + sum3;\n\t"
+ "# TODO: prefetch and better scheduling\n\t"
+ " xor %%r9, %%r9\n\t"
+ " xor %%r10, %%r10\n\t"
+ " movq %%rcx, %%rax\n\t"
+ " movq %%rcx, %%r8\n\t"
+ " movq %[rsi], %%r9\n\t"
+ " movq %[rdx], %%r10\n\t"
+ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+ " movups 0(%%r9), %%xmm0\n\t"
+ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+ " movups 0(%%r10), %%xmm2\n\t"
+ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
+ " shr $4, %%r8\n\t"
+ " jmp .%=L1_test\n\t"
+ " # 4 taps / loop\n\t"
+ " # something like ?? cycles / loop\n\t"
+ ".%=Loop1: \n\t"
+ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+ "# movups (%%r9), %%xmmA\n\t"
+ "# movups (%%r10), %%xmmB\n\t"
+ "# movups %%xmmA, %%xmmZ\n\t"
+ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+ "# mulps %%xmmB, %%xmmA\n\t"
+ "# mulps %%xmmZ, %%xmmB\n\t"
+ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+ "# xorps %%xmmPN, %%xmmA\n\t"
+ "# movups %%xmmA, %%xmmZ\n\t"
+ "# unpcklps %%xmmB, %%xmmA\n\t"
+ "# unpckhps %%xmmB, %%xmmZ\n\t"
+ "# movups %%xmmZ, %%xmmY\n\t"
+ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+ "# addps %%xmmZ, %%xmmA\n\t"
+ "# addps %%xmmA, %%xmmC\n\t"
+ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+ " movups 16(%%r9), %%xmm1\n\t"
+ " movups %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " movups 16(%%r10), %%xmm3\n\t"
+ " movups %%xmm1, %%xmm5\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm3, %%xmm1\n\t"
+ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+ " addps %%xmm1, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " movups 32(%%r9), %%xmm0\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ " mulps %%xmm5, %%xmm3\n\t"
+ " add $32, %%r9\n\t"
+ " movups 32(%%r10), %%xmm2\n\t"
+ " addps %%xmm3, %%xmm7\n\t"
+ " add $32, %%r10\n\t"
+ ".%=L1_test:\n\t"
+ " dec %%rax\n\t"
+ " jge .%=Loop1\n\t"
+ " # We've handled the bulk of multiplies up to here.\n\t"
+ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+ " # If so, we've got 2 more taps to do.\n\t"
+ " and $1, %%r8\n\t"
+ " je .%=Leven\n\t"
+ " # The count was odd, do 2 more taps.\n\t"
+ " # Note that we've already got mm0/mm2 preloaded\n\t"
+ " # from the main loop.\n\t"
+ " movups %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ ".%=Leven:\n\t"
+ " # neg inversor\n\t"
+ " xorps %%xmm1, %%xmm1\n\t"
+ " mov $0x80000000, %%r9\n\t"
+ " movd %%r9, %%xmm1\n\t"
+ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+ " # pfpnacc\n\t"
+ " xorps %%xmm1, %%xmm6\n\t"
+ " movups %%xmm6, %%xmm2\n\t"
+ " unpcklps %%xmm7, %%xmm6\n\t"
+ " unpckhps %%xmm7, %%xmm2\n\t"
+ " movups %%xmm2, %%xmm3\n\t"
+ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+ " addps %%xmm2, %%xmm6\n\t"
+ " # xmm6 = r1 i2 r3 i4\n\t"
+ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) "
+ "to memory\n\t"
+ :
+ : [rsi] "r"(input), [rdx] "r"(taps), "c"(num_bytes), [rdi] "r"(result)
+ : "rax", "r8", "r9", "r10");
+
+
+ if (isodd) {
+ *result += input[num_points - 1] * taps[num_points - 1];
+ }
+ return;
}
#endif /* LV_HAVE_SSE && LV_HAVE_64 */
-
-
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
-static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
- lv_32fc_t dotProduct;
- memset(&dotProduct, 0x0, 2*sizeof(float));
+ lv_32fc_t dotProduct;
+ memset(&dotProduct, 0x0, 2 * sizeof(float));
- unsigned int number = 0;
- const unsigned int halfPoints = num_points/2;
- unsigned int isodd = num_points & 1;
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+ unsigned int isodd = num_points & 1;
- __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+ __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
- const lv_32fc_t* a = input;
- const lv_32fc_t* b = taps;
+ const lv_32fc_t* a = input;
+ const lv_32fc_t* b = taps;
- dotProdVal = _mm_setzero_ps();
+ dotProdVal = _mm_setzero_ps();
- for(;number < halfPoints; number++){
+ for (; number < halfPoints; number++) {
- x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
- y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
- yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
- yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
- tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+ tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
- x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+ x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
- tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+ tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
- z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z = _mm_addsub_ps(tmp1,
+ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
- dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
+ dotProdVal =
+ _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
- a += 2;
- b += 2;
- }
+ a += 2;
+ b += 2;
+ }
- __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
- _mm_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)dotProductVector,
+ dotProdVal); // Store the results back into the dot product vector
- dotProduct += ( dotProductVector[0] + dotProductVector[1] );
+ dotProduct += (dotProductVector[0] + dotProductVector[1]);
- if(isodd) {
- dotProduct += input[num_points - 1] * taps[num_points - 1];
- }
+ if (isodd) {
+ dotProduct += input[num_points - 1] * taps[num_points - 1];
+ }
- *result = dotProduct;
+ *result = dotProduct;
}
#endif /*LV_HAVE_SSE3*/
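The SSE3 kernel multiplies complex pairs with the moveldup/movehdup/shuffle/addsub pattern; a scalar view of one such product (illustrative helper, not a VOLK API) shows why addsub yields a*b directly:

/* Scalar view of the addsub-based complex multiply used above:
 *   tmp1 = { ar*br, ai*br }   (x times the duplicated real part of y)
 *   tmp2 = { ai*bi, ar*bi }   (swapped x times the duplicated imaginary part of y)
 *   addsub(tmp1, tmp2) = { ar*br - ai*bi, ai*br + ar*bi } = a*b
 */
static void complex_mul_addsub_view(const float a[2], const float b[2], float out[2])
{
    const float tmp1_re = a[0] * b[0], tmp1_im = a[1] * b[0];
    const float tmp2_re = a[1] * b[1], tmp2_im = a[0] * b[1];
    out[0] = tmp1_re - tmp2_re; /* addsub subtracts in even lanes */
    out[1] = tmp1_im + tmp2_im; /* and adds in odd lanes          */
}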
#include <smmintrin.h>
-static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
- unsigned int i = 0;
- const unsigned int qtr_points = num_points/4;
- const unsigned int isodd = num_points & 3;
+ unsigned int i = 0;
+ const unsigned int qtr_points = num_points / 4;
+ const unsigned int isodd = num_points & 3;
- __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
- float *p_input, *p_taps;
- __m64 *p_result;
+ __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
+ float *p_input, *p_taps;
+ __m64* p_result;
- p_result = (__m64*)result;
- p_input = (float*)input;
- p_taps = (float*)taps;
+ p_result = (__m64*)result;
+ p_input = (float*)input;
+ p_taps = (float*)taps;
- static const __m128i neg = {0x000000000000000080000000};
+ static const __m128i neg = { 0x000000000000000080000000 };
- real0 = _mm_setzero_ps();
- real1 = _mm_setzero_ps();
- im0 = _mm_setzero_ps();
- im1 = _mm_setzero_ps();
+ real0 = _mm_setzero_ps();
+ real1 = _mm_setzero_ps();
+ im0 = _mm_setzero_ps();
+ im1 = _mm_setzero_ps();
- for(; i < qtr_points; ++i) {
- xmm0 = _mm_loadu_ps(p_input);
- xmm1 = _mm_loadu_ps(p_taps);
+ for (; i < qtr_points; ++i) {
+ xmm0 = _mm_loadu_ps(p_input);
+ xmm1 = _mm_loadu_ps(p_taps);
- p_input += 4;
- p_taps += 4;
+ p_input += 4;
+ p_taps += 4;
- xmm2 = _mm_loadu_ps(p_input);
- xmm3 = _mm_loadu_ps(p_taps);
+ xmm2 = _mm_loadu_ps(p_input);
+ xmm3 = _mm_loadu_ps(p_taps);
- p_input += 4;
- p_taps += 4;
+ p_input += 4;
+ p_taps += 4;
- xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
- xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
- xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
- xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
+ xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
+ xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
+ xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
+ xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
- //imaginary vector from input
- xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
- //real vector from input
- xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
- //imaginary vector from taps
- xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
- //real vector from taps
- xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
+ // imaginary vector from input
+ xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
+ // real vector from input
+ xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
+ // imaginary vector from taps
+ xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
+ // real vector from taps
+ xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
- xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
- xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
+ xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
+ xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
- xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
- xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
+ xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
+ xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
- real0 = _mm_add_ps(xmm4, real0);
- real1 = _mm_add_ps(xmm5, real1);
- im0 = _mm_add_ps(xmm6, im0);
- im1 = _mm_add_ps(xmm7, im1);
- }
+ real0 = _mm_add_ps(xmm4, real0);
+ real1 = _mm_add_ps(xmm5, real1);
+ im0 = _mm_add_ps(xmm6, im0);
+ im1 = _mm_add_ps(xmm7, im1);
+ }
- real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);
+ real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);
- im0 = _mm_add_ps(im0, im1);
- real0 = _mm_add_ps(real0, real1);
+ im0 = _mm_add_ps(im0, im1);
+ real0 = _mm_add_ps(real0, real1);
- im0 = _mm_add_ps(im0, real0);
+ im0 = _mm_add_ps(im0, real0);
- _mm_storel_pi(p_result, im0);
+ _mm_storel_pi(p_result, im0);
- for(i = num_points-isodd; i < num_points; i++) {
- *result += input[i] * taps[i];
- }
+ for (i = num_points - isodd; i < num_points; i++) {
+ *result += input[i] * taps[i];
+ }
}
#endif /*LV_HAVE_SSE4_1*/
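The SSE4.1 kernel above first de-interleaves real and imaginary parts and then forms four real dot products with _mm_dp_ps. A minimal scalar sketch of that decomposition follows; it is not part of the patch, dot_prod_split is a hypothetical helper, and lv_cmake is assumed to come from <volk/volk_complex.h>.

/* Re(result) = sum(xr*yr) - sum(xi*yi);  Im(result) = sum(xr*yi) + sum(xi*yr) */
static inline lv_32fc_t dot_prod_split(const float* xr, const float* xi,
                                       const float* yr, const float* yi,
                                       unsigned int n)
{
    float rr = 0.f, ii = 0.f, ri = 0.f, ir = 0.f;
    for (unsigned int k = 0; k < n; k++) {
        rr += xr[k] * yr[k]; /* like _mm_dp_ps(xmm3, xmm2, 0xf1) above */
        ii += xi[k] * yi[k]; /* like _mm_dp_ps(xmm1, xmm0, 0xf1) above */
        ri += xr[k] * yi[k]; /* like _mm_dp_ps(xmm3, xmm0, 0xf2) above */
        ir += xi[k] * yr[k]; /* like _mm_dp_ps(xmm1, xmm2, 0xf2) above */
    }
    return lv_cmake(rr - ii, ri + ir);
}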
#include <immintrin.h>
-static inline void volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
- unsigned int isodd = num_points & 3;
- unsigned int i = 0;
- lv_32fc_t dotProduct;
- memset(&dotProduct, 0x0, 2*sizeof(float));
+ unsigned int isodd = num_points & 3;
+ unsigned int i = 0;
+ lv_32fc_t dotProduct;
+ memset(&dotProduct, 0x0, 2 * sizeof(float));
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+ __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
- const lv_32fc_t* a = input;
- const lv_32fc_t* b = taps;
+ const lv_32fc_t* a = input;
+ const lv_32fc_t* b = taps;
- dotProdVal = _mm256_setzero_ps();
+ dotProdVal = _mm256_setzero_ps();
- for(;number < quarterPoints; number++){
- x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
- y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
+ for (; number < quarterPoints; number++) {
+ x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
+ y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
- yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
- yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
- tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
+ tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
- x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
+ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
+ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
- z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z = _mm256_addsub_ps(tmp1,
+ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
- dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex multiplication results together
+ dotProdVal = _mm256_add_ps(dotProdVal,
+ z); // Add the complex multiplication results together
- a += 4;
- b += 4;
- }
+ a += 4;
+ b += 4;
+ }
- __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
- _mm256_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
+ _mm256_storeu_ps((float*)dotProductVector,
+ dotProdVal); // Store the results back into the dot product vector
- dotProduct += ( dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + dotProductVector[3]);
+ dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
+ dotProductVector[3]);
- for(i = num_points-isodd; i < num_points; i++) {
- dotProduct += input[i] * taps[i];
- }
+ for (i = num_points - isodd; i < num_points; i++) {
+ dotProduct += input[i] * taps[i];
+ }
- *result = dotProduct;
+ *result = dotProduct;
}
#endif /*LV_HAVE_AVX*/
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>
-static inline void volk_32fc_x2_dot_prod_32fc_u_avx_fma(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_dot_prod_32fc_u_avx_fma(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
- unsigned int isodd = num_points & 3;
- unsigned int i = 0;
- lv_32fc_t dotProduct;
- memset(&dotProduct, 0x0, 2*sizeof(float));
+ unsigned int isodd = num_points & 3;
+ unsigned int i = 0;
+ lv_32fc_t dotProduct;
+ memset(&dotProduct, 0x0, 2 * sizeof(float));
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+ __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
- const lv_32fc_t* a = input;
- const lv_32fc_t* b = taps;
+ const lv_32fc_t* a = input;
+ const lv_32fc_t* b = taps;
- dotProdVal = _mm256_setzero_ps();
+ dotProdVal = _mm256_setzero_ps();
- for(;number < quarterPoints; number++){
+ for (; number < quarterPoints; number++) {
- x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
- y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
+ x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
+ y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
- yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
- yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
- tmp1 = x;
+ tmp1 = x;
- x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
+ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
+ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
- z = _mm256_fmaddsub_ps(tmp1, yl,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z = _mm256_fmaddsub_ps(
+ tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
- dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex multiplication results together
+ dotProdVal = _mm256_add_ps(dotProdVal,
+ z); // Add the complex multiplication results together
- a += 4;
- b += 4;
- }
+ a += 4;
+ b += 4;
+ }
- __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
- _mm256_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
+ _mm256_storeu_ps((float*)dotProductVector,
+ dotProdVal); // Store the results back into the dot product vector
- dotProduct += ( dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + dotProductVector[3]);
+ dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
+ dotProductVector[3]);
- for(i = num_points-isodd; i < num_points; i++) {
- dotProduct += input[i] * taps[i];
- }
+ for (i = num_points - isodd; i < num_points; i++) {
+ dotProduct += input[i] * taps[i];
+ }
- *result = dotProduct;
+ *result = dotProduct;
}
#endif /*LV_HAVE_AVX && LV_HAVE_FMA*/
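The only arithmetic change in the FMA variant above is that the _mm256_mul_ps plus _mm256_addsub_ps pair collapses into a single _mm256_fmaddsub_ps. A per-lane sketch of the identity being used; not part of the patch, and fmaddsub_lane is a hypothetical helper.

/* fmaddsub(a, b, c): even lanes compute a*b - c, odd lanes compute a*b + c */
static inline float fmaddsub_lane(float a, float b, float c, int lane_is_odd)
{
    return lane_is_odd ? (a * b + c) : (a * b - c);
}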
#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
#define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
-#include <volk/volk_common.h>
-#include <volk/volk_complex.h>
#include <stdio.h>
#include <string.h>
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
#ifdef LV_HAVE_GENERIC
-static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
- const unsigned int num_bytes = num_points*8;
+ const unsigned int num_bytes = num_points * 8;
- float * res = (float*) result;
- float * in = (float*) input;
- float * tp = (float*) taps;
- unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
+ float* res = (float*)result;
+ float* in = (float*)input;
+ float* tp = (float*)taps;
+ unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
- float sum0[2] = {0,0};
- float sum1[2] = {0,0};
- unsigned int i = 0;
+ float sum0[2] = { 0, 0 };
+ float sum1[2] = { 0, 0 };
+ unsigned int i = 0;
- for(i = 0; i < n_2_ccomplex_blocks; ++i) {
- sum0[0] += in[0] * tp[0] - in[1] * tp[1];
- sum0[1] += in[0] * tp[1] + in[1] * tp[0];
- sum1[0] += in[2] * tp[2] - in[3] * tp[3];
- sum1[1] += in[2] * tp[3] + in[3] * tp[2];
+ for (i = 0; i < n_2_ccomplex_blocks; ++i) {
+ sum0[0] += in[0] * tp[0] - in[1] * tp[1];
+ sum0[1] += in[0] * tp[1] + in[1] * tp[0];
+ sum1[0] += in[2] * tp[2] - in[3] * tp[3];
+ sum1[1] += in[2] * tp[3] + in[3] * tp[2];
- in += 4;
- tp += 4;
- }
+ in += 4;
+ tp += 4;
+ }
- res[0] = sum0[0] + sum1[0];
- res[1] = sum0[1] + sum1[1];
+ res[0] = sum0[0] + sum1[0];
+ res[1] = sum0[1] + sum1[1];
- if (num_points & 1) {
- *result += input[num_points - 1] * taps[num_points - 1];
- }
+ if (num_points & 1) {
+ *result += input[num_points - 1] * taps[num_points - 1];
+ }
}
#endif /*LV_HAVE_GENERIC*/
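The generic kernel above keeps two partial sums (sum0, sum1) to shorten the dependency chain; apart from floating-point rounding order it computes the same value as this single-accumulator reference. This is a sketch only, not part of the patch: dot_prod_reference is a hypothetical helper, and lv_cmake comes from <volk/volk_complex.h> included above.

static inline lv_32fc_t dot_prod_reference(const lv_32fc_t* input,
                                           const lv_32fc_t* taps,
                                           unsigned int num_points)
{
    lv_32fc_t acc = lv_cmake(0.0f, 0.0f);
    for (unsigned int i = 0; i < num_points; i++) {
        acc += input[i] * taps[i]; /* complex multiply-accumulate */
    }
    return acc;
}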
#if LV_HAVE_SSE && LV_HAVE_64
-static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
-
- const unsigned int num_bytes = num_points*8;
- unsigned int isodd = num_points & 1;
-
- __VOLK_ASM
- (
- "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
- "# const float *taps, unsigned num_bytes)\n\t"
- "# float sum0 = 0;\n\t"
- "# float sum1 = 0;\n\t"
- "# float sum2 = 0;\n\t"
- "# float sum3 = 0;\n\t"
- "# do {\n\t"
- "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
- "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
- "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
- "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
- "# input += 4;\n\t"
- "# taps += 4; \n\t"
- "# } while (--n_2_ccomplex_blocks != 0);\n\t"
- "# result[0] = sum0 + sum2;\n\t"
- "# result[1] = sum1 + sum3;\n\t"
- "# TODO: prefetch and better scheduling\n\t"
- " xor %%r9, %%r9\n\t"
- " xor %%r10, %%r10\n\t"
- " movq %%rcx, %%rax\n\t"
- " movq %%rcx, %%r8\n\t"
- " movq %[rsi], %%r9\n\t"
- " movq %[rdx], %%r10\n\t"
- " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
- " movaps 0(%%r9), %%xmm0\n\t"
- " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
- " movaps 0(%%r10), %%xmm2\n\t"
- " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
- " shr $4, %%r8\n\t"
- " jmp .%=L1_test\n\t"
- " # 4 taps / loop\n\t"
- " # something like ?? cycles / loop\n\t"
- ".%=Loop1: \n\t"
- "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
- "# movaps (%%r9), %%xmmA\n\t"
- "# movaps (%%r10), %%xmmB\n\t"
- "# movaps %%xmmA, %%xmmZ\n\t"
- "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
- "# mulps %%xmmB, %%xmmA\n\t"
- "# mulps %%xmmZ, %%xmmB\n\t"
- "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
- "# xorps %%xmmPN, %%xmmA\n\t"
- "# movaps %%xmmA, %%xmmZ\n\t"
- "# unpcklps %%xmmB, %%xmmA\n\t"
- "# unpckhps %%xmmB, %%xmmZ\n\t"
- "# movaps %%xmmZ, %%xmmY\n\t"
- "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
- "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
- "# addps %%xmmZ, %%xmmA\n\t"
- "# addps %%xmmA, %%xmmC\n\t"
- "# A=xmm0, B=xmm2, Z=xmm4\n\t"
- "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
- " movaps 16(%%r9), %%xmm1\n\t"
- " movaps %%xmm0, %%xmm4\n\t"
- " mulps %%xmm2, %%xmm0\n\t"
- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
- " movaps 16(%%r10), %%xmm3\n\t"
- " movaps %%xmm1, %%xmm5\n\t"
- " addps %%xmm0, %%xmm6\n\t"
- " mulps %%xmm3, %%xmm1\n\t"
- " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
- " addps %%xmm1, %%xmm6\n\t"
- " mulps %%xmm4, %%xmm2\n\t"
- " movaps 32(%%r9), %%xmm0\n\t"
- " addps %%xmm2, %%xmm7\n\t"
- " mulps %%xmm5, %%xmm3\n\t"
- " add $32, %%r9\n\t"
- " movaps 32(%%r10), %%xmm2\n\t"
- " addps %%xmm3, %%xmm7\n\t"
- " add $32, %%r10\n\t"
- ".%=L1_test:\n\t"
- " dec %%rax\n\t"
- " jge .%=Loop1\n\t"
- " # We've handled the bulk of multiplies up to here.\n\t"
- " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
- " # If so, we've got 2 more taps to do.\n\t"
- " and $1, %%r8\n\t"
- " je .%=Leven\n\t"
- " # The count was odd, do 2 more taps.\n\t"
- " # Note that we've already got mm0/mm2 preloaded\n\t"
- " # from the main loop.\n\t"
- " movaps %%xmm0, %%xmm4\n\t"
- " mulps %%xmm2, %%xmm0\n\t"
- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
- " addps %%xmm0, %%xmm6\n\t"
- " mulps %%xmm4, %%xmm2\n\t"
- " addps %%xmm2, %%xmm7\n\t"
- ".%=Leven:\n\t"
- " # neg inversor\n\t"
- " xorps %%xmm1, %%xmm1\n\t"
- " mov $0x80000000, %%r9\n\t"
- " movd %%r9, %%xmm1\n\t"
- " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
- " # pfpnacc\n\t"
- " xorps %%xmm1, %%xmm6\n\t"
- " movaps %%xmm6, %%xmm2\n\t"
- " unpcklps %%xmm7, %%xmm6\n\t"
- " unpckhps %%xmm7, %%xmm2\n\t"
- " movaps %%xmm2, %%xmm3\n\t"
- " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
- " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
- " addps %%xmm2, %%xmm6\n\t"
- " # xmm6 = r1 i2 r3 i4\n\t"
- " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
- " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
- " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
- :
- :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
- :"rax", "r8", "r9", "r10"
- );
-
-
- if(isodd) {
- *result += input[num_points - 1] * taps[num_points - 1];
- }
-
- return;
+static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ const unsigned int num_bytes = num_points * 8;
+ unsigned int isodd = num_points & 1;
+
+ __VOLK_ASM(
+ "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
+ "# const float *taps, unsigned num_bytes)\n\t"
+ "# float sum0 = 0;\n\t"
+ "# float sum1 = 0;\n\t"
+ "# float sum2 = 0;\n\t"
+ "# float sum3 = 0;\n\t"
+ "# do {\n\t"
+ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+ "# input += 4;\n\t"
+ "# taps += 4; \n\t"
+ "# } while (--n_2_ccomplex_blocks != 0);\n\t"
+ "# result[0] = sum0 + sum2;\n\t"
+ "# result[1] = sum1 + sum3;\n\t"
+ "# TODO: prefetch and better scheduling\n\t"
+ " xor %%r9, %%r9\n\t"
+ " xor %%r10, %%r10\n\t"
+ " movq %%rcx, %%rax\n\t"
+ " movq %%rcx, %%r8\n\t"
+ " movq %[rsi], %%r9\n\t"
+ " movq %[rdx], %%r10\n\t"
+ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+ " movaps 0(%%r9), %%xmm0\n\t"
+ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+ " movaps 0(%%r10), %%xmm2\n\t"
+ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
+ " shr $4, %%r8\n\t"
+ " jmp .%=L1_test\n\t"
+ " # 4 taps / loop\n\t"
+ " # something like ?? cycles / loop\n\t"
+ ".%=Loop1: \n\t"
+ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+ "# movaps (%%r9), %%xmmA\n\t"
+ "# movaps (%%r10), %%xmmB\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+ "# mulps %%xmmB, %%xmmA\n\t"
+ "# mulps %%xmmZ, %%xmmB\n\t"
+ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+ "# xorps %%xmmPN, %%xmmA\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# unpcklps %%xmmB, %%xmmA\n\t"
+ "# unpckhps %%xmmB, %%xmmZ\n\t"
+ "# movaps %%xmmZ, %%xmmY\n\t"
+ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+ "# addps %%xmmZ, %%xmmA\n\t"
+ "# addps %%xmmA, %%xmmC\n\t"
+ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+ " movaps 16(%%r9), %%xmm1\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " movaps 16(%%r10), %%xmm3\n\t"
+ " movaps %%xmm1, %%xmm5\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm3, %%xmm1\n\t"
+ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+ " addps %%xmm1, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " movaps 32(%%r9), %%xmm0\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ " mulps %%xmm5, %%xmm3\n\t"
+ " add $32, %%r9\n\t"
+ " movaps 32(%%r10), %%xmm2\n\t"
+ " addps %%xmm3, %%xmm7\n\t"
+ " add $32, %%r10\n\t"
+ ".%=L1_test:\n\t"
+ " dec %%rax\n\t"
+ " jge .%=Loop1\n\t"
+ " # We've handled the bulk of multiplies up to here.\n\t"
+ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+ " # If so, we've got 2 more taps to do.\n\t"
+ " and $1, %%r8\n\t"
+ " je .%=Leven\n\t"
+ " # The count was odd, do 2 more taps.\n\t"
+ " # Note that we've already got mm0/mm2 preloaded\n\t"
+ " # from the main loop.\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ ".%=Leven:\n\t"
+ " # neg inversor\n\t"
+ " xorps %%xmm1, %%xmm1\n\t"
+ " mov $0x80000000, %%r9\n\t"
+ " movd %%r9, %%xmm1\n\t"
+ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+ " # pfpnacc\n\t"
+ " xorps %%xmm1, %%xmm6\n\t"
+ " movaps %%xmm6, %%xmm2\n\t"
+ " unpcklps %%xmm7, %%xmm6\n\t"
+ " unpckhps %%xmm7, %%xmm2\n\t"
+ " movaps %%xmm2, %%xmm3\n\t"
+ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+ " addps %%xmm2, %%xmm6\n\t"
+ " # xmm6 = r1 i2 r3 i4\n\t"
+ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) "
+ "to memory\n\t"
+ :
+ : [rsi] "r"(input), [rdx] "r"(taps), "c"(num_bytes), [rdi] "r"(result)
+ : "rax", "r8", "r9", "r10");
+
+
+ if (isodd) {
+ *result += input[num_points - 1] * taps[num_points - 1];
+ }
+ return;
}
#endif
#if LV_HAVE_SSE && LV_HAVE_32
-static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
- volk_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points);
+ volk_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points);
#if 0
const unsigned int num_bytes = num_points*8;
#include <pmmintrin.h>
-static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
- const unsigned int num_bytes = num_points*8;
- unsigned int isodd = num_points & 1;
+ const unsigned int num_bytes = num_points * 8;
+ unsigned int isodd = num_points & 1;
- lv_32fc_t dotProduct;
- memset(&dotProduct, 0x0, 2*sizeof(float));
+ lv_32fc_t dotProduct;
+ memset(&dotProduct, 0x0, 2 * sizeof(float));
- unsigned int number = 0;
- const unsigned int halfPoints = num_bytes >> 4;
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_bytes >> 4;
- __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+ __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
- const lv_32fc_t* a = input;
- const lv_32fc_t* b = taps;
+ const lv_32fc_t* a = input;
+ const lv_32fc_t* b = taps;
- dotProdVal = _mm_setzero_ps();
+ dotProdVal = _mm_setzero_ps();
- for(;number < halfPoints; number++){
+ for (; number < halfPoints; number++) {
- x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
- y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
- yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
- yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
- tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+ tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
- x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+ x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
- tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+ tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
- z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z = _mm_addsub_ps(tmp1,
+ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
- dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
+ dotProdVal =
+ _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
- a += 2;
- b += 2;
- }
+ a += 2;
+ b += 2;
+ }
- __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
- _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
+ _mm_store_ps((float*)dotProductVector,
+ dotProdVal); // Store the results back into the dot product vector
- dotProduct += ( dotProductVector[0] + dotProductVector[1] );
+ dotProduct += (dotProductVector[0] + dotProductVector[1]);
- if(isodd) {
- dotProduct += input[num_points - 1] * taps[num_points - 1];
- }
+ if (isodd) {
+ dotProduct += input[num_points - 1] * taps[num_points - 1];
+ }
- *result = dotProduct;
+ *result = dotProduct;
}
#endif /*LV_HAVE_SSE3*/
#include <smmintrin.h>
-static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
- unsigned int i = 0;
- const unsigned int qtr_points = num_points/4;
- const unsigned int isodd = num_points & 3;
+ unsigned int i = 0;
+ const unsigned int qtr_points = num_points / 4;
+ const unsigned int isodd = num_points & 3;
- __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
- float *p_input, *p_taps;
- __m64 *p_result;
+ __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
+ float *p_input, *p_taps;
+ __m64* p_result;
- static const __m128i neg = {0x000000000000000080000000};
+ static const __m128i neg = { 0x000000000000000080000000 };
- p_result = (__m64*)result;
- p_input = (float*)input;
- p_taps = (float*)taps;
+ p_result = (__m64*)result;
+ p_input = (float*)input;
+ p_taps = (float*)taps;
- real0 = _mm_setzero_ps();
- real1 = _mm_setzero_ps();
- im0 = _mm_setzero_ps();
- im1 = _mm_setzero_ps();
+ real0 = _mm_setzero_ps();
+ real1 = _mm_setzero_ps();
+ im0 = _mm_setzero_ps();
+ im1 = _mm_setzero_ps();
- for(; i < qtr_points; ++i) {
- xmm0 = _mm_load_ps(p_input);
- xmm1 = _mm_load_ps(p_taps);
+ for (; i < qtr_points; ++i) {
+ xmm0 = _mm_load_ps(p_input);
+ xmm1 = _mm_load_ps(p_taps);
- p_input += 4;
- p_taps += 4;
+ p_input += 4;
+ p_taps += 4;
- xmm2 = _mm_load_ps(p_input);
- xmm3 = _mm_load_ps(p_taps);
+ xmm2 = _mm_load_ps(p_input);
+ xmm3 = _mm_load_ps(p_taps);
- p_input += 4;
- p_taps += 4;
+ p_input += 4;
+ p_taps += 4;
- xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
- xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
- xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
- xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
+ xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
+ xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
+ xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
+ xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
- //imaginary vector from input
- xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
- //real vector from input
- xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
- //imaginary vector from taps
- xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
- //real vector from taps
- xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
+ // imaginary vector from input
+ xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
+ // real vector from input
+ xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
+ // imaginary vector from taps
+ xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
+ // real vector from taps
+ xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
- xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
- xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
+ xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
+ xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
- xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
- xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
+ xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
+ xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
- real0 = _mm_add_ps(xmm4, real0);
- real1 = _mm_add_ps(xmm5, real1);
- im0 = _mm_add_ps(xmm6, im0);
- im1 = _mm_add_ps(xmm7, im1);
- }
+ real0 = _mm_add_ps(xmm4, real0);
+ real1 = _mm_add_ps(xmm5, real1);
+ im0 = _mm_add_ps(xmm6, im0);
+ im1 = _mm_add_ps(xmm7, im1);
+ }
- real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);
+ real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);
- im0 = _mm_add_ps(im0, im1);
- real0 = _mm_add_ps(real0, real1);
+ im0 = _mm_add_ps(im0, im1);
+ real0 = _mm_add_ps(real0, real1);
- im0 = _mm_add_ps(im0, real0);
+ im0 = _mm_add_ps(im0, real0);
- _mm_storel_pi(p_result, im0);
+ _mm_storel_pi(p_result, im0);
- for(i = num_points-isodd; i < num_points; i++) {
- *result += input[i] * taps[i];
- }
+ for (i = num_points - isodd; i < num_points; i++) {
+ *result += input[i] * taps[i];
+ }
}
#endif /*LV_HAVE_SSE4_1*/
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
unsigned int quarter_points = num_points / 4;
unsigned int number;
- lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
- lv_32fc_t* b_ptr = (lv_32fc_t*) input;
+ lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
+ lv_32fc_t* b_ptr = (lv_32fc_t*)input;
// for 2-lane vectors, 1st lane holds the real part,
// 2nd lane holds the imaginary part
float32x4x2_t a_val, b_val, c_val, accumulator;
accumulator.val[0] = vdupq_n_f32(0);
accumulator.val[1] = vdupq_n_f32(0);
- for(number = 0; number < quarter_points; ++number) {
+ for (number = 0; number < quarter_points; ++number) {
a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __VOLK_PREFETCH(a_ptr+8);
- __VOLK_PREFETCH(b_ptr+8);
+ __VOLK_PREFETCH(a_ptr + 8);
+ __VOLK_PREFETCH(b_ptr + 8);
// multiply the real*real and imag*imag to get real result
// a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
*result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
// tail case
- for(number = quarter_points*4; number < num_points; ++number) {
+ for (number = quarter_points * 4; number < num_points; ++number) {
*result += (*a_ptr++) * (*b_ptr++);
}
-
}
#endif /*LV_HAVE_NEON*/
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
unsigned int quarter_points = num_points / 4;
unsigned int number;
- lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
- lv_32fc_t* b_ptr = (lv_32fc_t*) input;
+ lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
+ lv_32fc_t* b_ptr = (lv_32fc_t*)input;
// for 2-lane vectors, 1st lane holds the real part,
// 2nd lane holds the imaginary part
float32x4x2_t a_val, b_val, accumulator;
accumulator.val[0] = vdupq_n_f32(0);
accumulator.val[1] = vdupq_n_f32(0);
- for(number = 0; number < quarter_points; ++number) {
+ for (number = 0; number < quarter_points; ++number) {
a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __VOLK_PREFETCH(a_ptr+8);
- __VOLK_PREFETCH(b_ptr+8);
+ __VOLK_PREFETCH(a_ptr + 8);
+ __VOLK_PREFETCH(b_ptr + 8);
// do the first multiply
tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
*result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
// tail case
- for(number = quarter_points*4; number < num_points; ++number) {
+ for (number = quarter_points * 4; number < num_points; ++number) {
*result += (*a_ptr++) * (*b_ptr++);
}
-
}
#endif /*LV_HAVE_NEON*/
#ifdef LV_HAVE_NEON
-static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
unsigned int quarter_points = num_points / 4;
unsigned int number;
- lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
- lv_32fc_t* b_ptr = (lv_32fc_t*) input;
+ lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
+ lv_32fc_t* b_ptr = (lv_32fc_t*)input;
// for 2-lane vectors, 1st lane holds the real part,
// 2nd lane holds the imaginary part
float32x4x2_t a_val, b_val, accumulator1, accumulator2;
accumulator2.val[0] = vdupq_n_f32(0);
accumulator2.val[1] = vdupq_n_f32(0);
- for(number = 0; number < quarter_points; ++number) {
+ for (number = 0; number < quarter_points; ++number) {
a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __VOLK_PREFETCH(a_ptr+8);
- __VOLK_PREFETCH(b_ptr+8);
+ __VOLK_PREFETCH(a_ptr + 8);
+ __VOLK_PREFETCH(b_ptr + 8);
// use 2 accumulators to remove inter-instruction data dependencies
accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
*result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
// tail case
- for(number = quarter_points*4; number < num_points; ++number) {
+ for (number = quarter_points * 4; number < num_points; ++number) {
*result += (*a_ptr++) * (*b_ptr++);
}
-
}
#endif /*LV_HAVE_NEON*/
#ifdef LV_HAVE_NEON
-static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
-// NOTE: GCC does a poor job with this kernel, but the equivalent ASM code is very fast
+static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+ // NOTE: GCC does a poor job with this kernel, but the equivalent ASM code is very
+ // fast
unsigned int quarter_points = num_points / 8;
unsigned int number;
- lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
- lv_32fc_t* b_ptr = (lv_32fc_t*) input;
+ lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
+ lv_32fc_t* b_ptr = (lv_32fc_t*)input;
// for 2-lane vectors, 1st lane holds the real part,
// 2nd lane holds the imaginary part
float32x4x4_t a_val, b_val, accumulator1, accumulator2;
accumulator2.val[3] = vdupq_n_f32(0);
// 8 input regs, 8 accumulators -> 16/16 neon regs are used
- for(number = 0; number < quarter_points; ++number) {
+ for (number = 0; number < quarter_points; ++number) {
a_val = vld4q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld4q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __VOLK_PREFETCH(a_ptr+8);
- __VOLK_PREFETCH(b_ptr+8);
+ __VOLK_PREFETCH(a_ptr + 8);
+ __VOLK_PREFETCH(b_ptr + 8);
// use 2 accumulators to remove inter-instruction data dependencies
accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
*result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
// tail case
- for(number = quarter_points*8; number < num_points; ++number) {
+ for (number = quarter_points * 8; number < num_points; ++number) {
*result += (*a_ptr++) * (*b_ptr++);
}
-
}
#endif /*LV_HAVE_NEON*/
#include <immintrin.h>
-static inline void volk_32fc_x2_dot_prod_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_dot_prod_32fc_a_avx(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
- unsigned int isodd = num_points & 3;
- unsigned int i = 0;
- lv_32fc_t dotProduct;
- memset(&dotProduct, 0x0, 2*sizeof(float));
+ unsigned int isodd = num_points & 3;
+ unsigned int i = 0;
+ lv_32fc_t dotProduct;
+ memset(&dotProduct, 0x0, 2 * sizeof(float));
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+ __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
- const lv_32fc_t* a = input;
- const lv_32fc_t* b = taps;
+ const lv_32fc_t* a = input;
+ const lv_32fc_t* b = taps;
- dotProdVal = _mm256_setzero_ps();
+ dotProdVal = _mm256_setzero_ps();
- for(;number < quarterPoints; number++){
+ for (; number < quarterPoints; number++) {
- x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
- y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
+ x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
+ y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
- yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
- yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
- tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
+ tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
- x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
+ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
+ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
- z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z = _mm256_addsub_ps(tmp1,
+ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
- dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex multiplication results together
+ dotProdVal = _mm256_add_ps(dotProdVal,
+ z); // Add the complex multiplication results together
- a += 4;
- b += 4;
- }
+ a += 4;
+ b += 4;
+ }
- __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
- _mm256_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
+ _mm256_store_ps((float*)dotProductVector,
+ dotProdVal); // Store the results back into the dot product vector
- dotProduct += ( dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + dotProductVector[3]);
+ dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
+ dotProductVector[3]);
- for(i = num_points-isodd; i < num_points; i++) {
- dotProduct += input[i] * taps[i];
- }
+ for (i = num_points - isodd; i < num_points; i++) {
+ dotProduct += input[i] * taps[i];
+ }
- *result = dotProduct;
+ *result = dotProduct;
}
#endif /*LV_HAVE_AVX*/
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>
-static inline void volk_32fc_x2_dot_prod_32fc_a_avx_fma(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_dot_prod_32fc_a_avx_fma(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
- unsigned int isodd = num_points & 3;
- unsigned int i = 0;
- lv_32fc_t dotProduct;
- memset(&dotProduct, 0x0, 2*sizeof(float));
+ unsigned int isodd = num_points & 3;
+ unsigned int i = 0;
+ lv_32fc_t dotProduct;
+ memset(&dotProduct, 0x0, 2 * sizeof(float));
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+ __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
- const lv_32fc_t* a = input;
- const lv_32fc_t* b = taps;
+ const lv_32fc_t* a = input;
+ const lv_32fc_t* b = taps;
- dotProdVal = _mm256_setzero_ps();
+ dotProdVal = _mm256_setzero_ps();
- for(;number < quarterPoints; number++){
+ for (; number < quarterPoints; number++) {
- x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
- y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
+ x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
+ y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
- yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
- yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
- tmp1 = x;
+ tmp1 = x;
- x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
+ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
+ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
- z = _mm256_fmaddsub_ps(tmp1, yl,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z = _mm256_fmaddsub_ps(
+ tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
- dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex multiplication results together
+ dotProdVal = _mm256_add_ps(dotProdVal,
+ z); // Add the complex multiplication results together
- a += 4;
- b += 4;
- }
+ a += 4;
+ b += 4;
+ }
- __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
- _mm256_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
+ _mm256_store_ps((float*)dotProductVector,
+ dotProdVal); // Store the results back into the dot product vector
- dotProduct += ( dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + dotProductVector[3]);
+ dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
+ dotProductVector[3]);
- for(i = num_points-isodd; i < num_points; i++) {
- dotProduct += input[i] * taps[i];
- }
+ for (i = num_points - isodd; i < num_points; i++) {
+ dotProduct += input[i] * taps[i];
+ }
- *result = dotProduct;
+ *result = dotProduct;
}
#endif /*LV_HAVE_AVX && LV_HAVE_FMA*/
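A usage sketch of the dot-product kernels collected above, going through the volk_32fc_x2_dot_prod_32fc dispatcher so the fastest available variant is selected at run time. It is not part of the patch; dot_prod_example is a hypothetical function name, and the caller is assumed to provide num_points complex floats in each buffer.

#include <stdio.h>
#include <volk/volk.h>

void dot_prod_example(const lv_32fc_t* input, const lv_32fc_t* taps,
                      unsigned int num_points)
{
    lv_32fc_t result;
    volk_32fc_x2_dot_prod_32fc(&result, input, taps, num_points);
    printf("dot product = %f + %fi\n", lv_creal(result), lv_cimag(result));
}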
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32fc_x2_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points);
- * \endcode
+ * void volk_32fc_x2_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
+ * lv_32fc_t* bVector, unsigned int num_points); \endcode
*
* \b Inputs
* \li aVector: The first input vector of complex floats.
#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H
#define INCLUDED_volk_32fc_x2_multiply_32fc_u_H
+#include <float.h>
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>
-#include <float.h>
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
/*!
- \brief Multiplies the two input complex vectors and stores their results in the third vector
- \param cVector The vector where the results will be stored
- \param aVector One of the vectors to be multiplied
- \param bVector One of the vectors to be multiplied
- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ \brief Multiplies the two input complex vectors and stores their results in the third
+ vector \param cVector The vector where the results will be stored \param aVector One of
+ the vectors to be multiplied \param bVector One of the vectors to be multiplied \param
+ num_points The number of complex values in aVector and bVector to be multiplied together
+ and stored into cVector
*/
-static inline void volk_32fc_x2_multiply_32fc_u_avx2_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+static inline void volk_32fc_x2_multiply_32fc_u_avx2_fma(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- lv_32fc_t* c = cVector;
- const lv_32fc_t* a = aVector;
- const lv_32fc_t* b = bVector;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
- for(;number < quarterPoints; number++){
+ for (; number < quarterPoints; number++) {
- const __m256 x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
- const __m256 y = _mm256_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+ const __m256 x =
+ _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ const __m256 y =
+ _mm256_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
- const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
- const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
+ const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
- const __m256 tmp2x = _mm256_permute_ps(x,0xB1); // Re-arrange x to be ai,ar,bi,br
+ const __m256 tmp2x = _mm256_permute_ps(x, 0xB1); // Re-arrange x to be ai,ar,bi,br
- const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+ const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
- const __m256 z = _mm256_fmaddsub_ps(x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ const __m256 z = _mm256_fmaddsub_ps(
+ x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
- _mm256_storeu_ps((float*)c,z); // Store the results back into the C container
+ _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
- a += 4;
- b += 4;
- c += 4;
- }
+ a += 4;
+ b += 4;
+ c += 4;
+ }
- _mm256_zeroupper();
+ _mm256_zeroupper();
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *c++ = (*a++) * (*b++);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *c++ = (*a++) * (*b++);
+ }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>
-static inline void
-volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- __m256 x, y, z;
- lv_32fc_t* c = cVector;
- const lv_32fc_t* a = aVector;
- const lv_32fc_t* b = bVector;
-
- for(; number < quarterPoints; number++){
- x = _mm256_loadu_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
- y = _mm256_loadu_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
- z = _mm256_complexmul_ps(x, y);
- _mm256_storeu_ps((float*) c, z); // Store the results back into the C container
-
- a += 4;
- b += 4;
- c += 4;
- }
-
- number = quarterPoints * 4;
-
- for(; number < num_points; number++){
- *c++ = (*a++) * (*b++);
- }
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 x, y, z;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ for (; number < quarterPoints; number++) {
+ x = _mm256_loadu_ps(
+ (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+ y = _mm256_loadu_ps(
+ (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+ z = _mm256_complexmul_ps(x, y);
+ _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+ number = quarterPoints * 4;
+
+ for (; number < num_points; number++) {
+ *c++ = (*a++) * (*b++);
+ }
}
#endif /* LV_HAVE_AVX */
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>
-static inline void
-volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int halfPoints = num_points / 2;
-
- __m128 x, y, z;
- lv_32fc_t* c = cVector;
- const lv_32fc_t* a = aVector;
- const lv_32fc_t* b = bVector;
-
- for(; number < halfPoints; number++){
- x = _mm_loadu_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi
- y = _mm_loadu_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di
- z = _mm_complexmul_ps(x, y);
- _mm_storeu_ps((float*) c, z); // Store the results back into the C container
-
- a += 2;
- b += 2;
- c += 2;
- }
-
- if((num_points % 2) != 0){
- *c = (*a) * (*b);
- }
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ __m128 x, y, z;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ for (; number < halfPoints; number++) {
+ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+ z = _mm_complexmul_ps(x, y);
+ _mm_storeu_ps((float*)c, z); // Store the results back into the C container
+
+ a += 2;
+ b += 2;
+ c += 2;
+ }
+
+ if ((num_points % 2) != 0) {
+ *c = (*a) * (*b);
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
{
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
- const lv_32fc_t* bPtr= bVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *cPtr++ = (*aPtr++) * (*bPtr++);
- }
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
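To round off the unaligned multiply kernels, a usage sketch of the volk_32fc_x2_multiply_32fc dispatcher declared in the prototype above. It is not part of the patch; multiply_example is a hypothetical function name, and the buffer setup with volk_malloc/volk_get_alignment follows the usual VOLK convention.

#include <volk/volk.h>

void multiply_example(unsigned int num_points)
{
    size_t alignment = volk_get_alignment();
    lv_32fc_t* a = (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t), alignment);
    lv_32fc_t* b = (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t), alignment);
    lv_32fc_t* c = (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t), alignment);
    for (unsigned int i = 0; i < num_points; i++) {
        a[i] = lv_cmake((float)i, 0.5f);
        b[i] = lv_cmake(0.0f, 1.0f);
    }
    volk_32fc_x2_multiply_32fc(c, a, b, num_points); /* c[i] = a[i] * b[i] */
    volk_free(a);
    volk_free(b);
    volk_free(c);
}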
#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H
#define INCLUDED_volk_32fc_x2_multiply_32fc_a_H
+#include <float.h>
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>
-#include <float.h>
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
/*!
- \brief Multiplies the two input complex vectors and stores their results in the third vector
- \param cVector The vector where the results will be stored
- \param aVector One of the vectors to be multiplied
- \param bVector One of the vectors to be multiplied
- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ \brief Multiplies the two input complex vectors and stores their results in the third
+ vector \param cVector The vector where the results will be stored \param aVector One of
+ the vectors to be multiplied \param bVector One of the vectors to be multiplied \param
+ num_points The number of complex values in aVector and bVector to be multiplied together
+ and stored into cVector
*/
-static inline void volk_32fc_x2_multiply_32fc_a_avx2_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+static inline void volk_32fc_x2_multiply_32fc_a_avx2_fma(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- lv_32fc_t* c = cVector;
- const lv_32fc_t* a = aVector;
- const lv_32fc_t* b = bVector;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
- for(;number < quarterPoints; number++){
+ for (; number < quarterPoints; number++) {
- const __m256 x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
- const __m256 y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+ const __m256 x =
+ _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ const __m256 y =
+ _mm256_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
- const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
- const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
+ const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
- const __m256 tmp2x = _mm256_permute_ps(x,0xB1); // Re-arrange x to be ai,ar,bi,br
+ const __m256 tmp2x = _mm256_permute_ps(x, 0xB1); // Re-arrange x to be ai,ar,bi,br
- const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+ const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
- const __m256 z = _mm256_fmaddsub_ps(x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ const __m256 z = _mm256_fmaddsub_ps(
+ x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
- _mm256_store_ps((float*)c,z); // Store the results back into the C container
+ _mm256_store_ps((float*)c, z); // Store the results back into the C container
- a += 4;
- b += 4;
- c += 4;
- }
+ a += 4;
+ b += 4;
+ c += 4;
+ }
- _mm256_zeroupper();
+ _mm256_zeroupper();
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *c++ = (*a++) * (*b++);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *c++ = (*a++) * (*b++);
+ }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>
-static inline void
-volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- __m256 x, y, z;
- lv_32fc_t* c = cVector;
- const lv_32fc_t* a = aVector;
- const lv_32fc_t* b = bVector;
-
- for(; number < quarterPoints; number++){
- x = _mm256_load_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
- y = _mm256_load_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
- z = _mm256_complexmul_ps(x, y);
- _mm256_store_ps((float*) c, z); // Store the results back into the C container
-
- a += 4;
- b += 4;
- c += 4;
- }
-
- number = quarterPoints * 4;
-
- for(; number < num_points; number++){
- *c++ = (*a++) * (*b++);
- }
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 x, y, z;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ for (; number < quarterPoints; number++) {
+ x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+ y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+ z = _mm256_complexmul_ps(x, y);
+ _mm256_store_ps((float*)c, z); // Store the results back into the C container
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+ number = quarterPoints * 4;
+
+ for (; number < num_points; number++) {
+ *c++ = (*a++) * (*b++);
+ }
}
#endif /* LV_HAVE_AVX */
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>
-static inline void
-volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int halfPoints = num_points / 2;
-
- __m128 x, y, z;
- lv_32fc_t* c = cVector;
- const lv_32fc_t* a = aVector;
- const lv_32fc_t* b = bVector;
-
- for(; number < halfPoints; number++){
- x = _mm_load_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi
- y = _mm_load_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di
- z = _mm_complexmul_ps(x, y);
- _mm_store_ps((float*) c, z); // Store the results back into the C container
-
- a += 2;
- b += 2;
- c += 2;
- }
-
- if((num_points % 2) != 0){
- *c = (*a) * (*b);
- }
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ __m128 x, y, z;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ for (; number < halfPoints; number++) {
+ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+ z = _mm_complexmul_ps(x, y);
+ _mm_store_ps((float*)c, z); // Store the results back into the C container
+
+ a += 2;
+ b += 2;
+ c += 2;
+ }
+
+ if ((num_points % 2) != 0) {
+ *c = (*a) * (*b);
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
{
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
- const lv_32fc_t* bPtr= bVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *cPtr++ = (*aPtr++) * (*bPtr++);
- }
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
{
- lv_32fc_t *a_ptr = (lv_32fc_t*) aVector;
- lv_32fc_t *b_ptr = (lv_32fc_t*) bVector;
- unsigned int quarter_points = num_points / 4;
- float32x4x2_t a_val, b_val, c_val;
- float32x4x2_t tmp_real, tmp_imag;
- unsigned int number = 0;
-
- for(number = 0; number < quarter_points; ++number) {
- a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
- b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __VOLK_PREFETCH(a_ptr+4);
- __VOLK_PREFETCH(b_ptr+4);
-
- // multiply the real*real and imag*imag to get real result
- // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
- tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
- // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
- tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
-
- // Multiply cross terms to get the imaginary result
- // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
- tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
- // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
- tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
-
- // store the results
- c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
- c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
- vst2q_f32((float*)cVector, c_val);
-
- a_ptr += 4;
- b_ptr += 4;
- cVector += 4;
- }
-
- for(number = quarter_points*4; number < num_points; number++){
- *cVector++ = (*a_ptr++) * (*b_ptr++);
- }
+ lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
+ lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
+ unsigned int quarter_points = num_points / 4;
+ float32x4x2_t a_val, b_val, c_val;
+ float32x4x2_t tmp_real, tmp_imag;
+ unsigned int number = 0;
+
+ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ __VOLK_PREFETCH(a_ptr + 4);
+ __VOLK_PREFETCH(b_ptr + 4);
+
+ // multiply the real*real and imag*imag to get real result
+ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
+ tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+ // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
+ tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
+
+ // Multiply cross terms to get the imaginary result
+ // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
+ tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
+ // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
+ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+
+ // store the results
+ c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
+ c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
+ vst2q_f32((float*)cVector, c_val);
+
+ a_ptr += 4;
+ b_ptr += 4;
+ cVector += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ *cVector++ = (*a_ptr++) * (*b_ptr++);
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_NEON
-static inline void
-volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
{
- lv_32fc_t *a_ptr = (lv_32fc_t*) aVector;
- lv_32fc_t *b_ptr = (lv_32fc_t*) bVector;
- unsigned int quarter_points = num_points / 4;
- float32x4x2_t a_val, b_val;
- float32x4x2_t tmp_imag;
- unsigned int number = 0;
-
- for(number = 0; number < quarter_points; ++number) {
- a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
- b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __VOLK_PREFETCH(a_ptr+4);
- __VOLK_PREFETCH(b_ptr+4);
-
- // do the first multiply
- tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
- tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
-
- // use multiply accumulate/subtract to get result
- tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
- tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);
-
- // store
- vst2q_f32((float*)cVector, tmp_imag);
- // increment pointers
- a_ptr += 4;
- b_ptr += 4;
- cVector += 4;
- }
-
- for(number = quarter_points*4; number < num_points; number++){
- *cVector++ = (*a_ptr++) * (*b_ptr++);
- }
+ lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
+ lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
+ unsigned int quarter_points = num_points / 4;
+ float32x4x2_t a_val, b_val;
+ float32x4x2_t tmp_imag;
+ unsigned int number = 0;
+
+ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ __VOLK_PREFETCH(a_ptr + 4);
+ __VOLK_PREFETCH(b_ptr + 4);
+
+ // do the first multiply
+ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+ tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+
+ // use multiply accumulate/subtract to get result
+ tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
+ tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);
+
+ // store
+ vst2q_f32((float*)cVector, tmp_imag);
+ // increment pointers
+ a_ptr += 4;
+ b_ptr += 4;
+ cVector += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ *cVector++ = (*a_ptr++) * (*b_ptr++);
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_NEONV7
-extern void
-volk_32fc_x2_multiply_32fc_a_neonasm(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const lv_32fc_t* bVector, unsigned int num_points);
+extern void volk_32fc_x2_multiply_32fc_a_neonasm(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points);
#endif /* LV_HAVE_NEONV7 */
#ifdef LV_HAVE_ORC
-extern void
-volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const lv_32fc_t* bVector, unsigned int num_points);
+extern void volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points);
-static inline void
-volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
{
- volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
+ volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32fc_x2_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points);
- * \endcode
+ * void volk_32fc_x2_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ *                                            const lv_32fc_t* bVector, unsigned int num_points);
+ * \endcode
*
* \b Inputs
* \li aVector: The first input vector of complex floats.
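 *
 * \b Example
 * A minimal usage sketch (added for illustration only, not produced by clang-format;
 * it assumes the standard VOLK helpers volk_malloc(), volk_free(), volk_get_alignment()
 * and lv_cmake() are available via <volk/volk.h>):
 * \code
 *   #include <volk/volk.h>
 *
 *   unsigned int N = 8;
 *   size_t alignment = volk_get_alignment();
 *   lv_32fc_t* a = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
 *   lv_32fc_t* b = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
 *   lv_32fc_t* c = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
 *   for (unsigned int i = 0; i < N; ++i) {
 *       a[i] = lv_cmake((float)i, 1.0f);
 *       b[i] = lv_cmake(1.0f, (float)i);
 *   }
 *   // illustrative call: c[i] = a[i] * conj(b[i]) for every element
 *   volk_32fc_x2_multiply_conjugate_32fc(c, a, b, N);
 *   volk_free(a);
 *   volk_free(b);
 *   volk_free(c);
 * \endcode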
#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
+#include <float.h>
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>
-#include <float.h>
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>
-static inline void
-volk_32fc_x2_multiply_conjugate_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_multiply_conjugate_32fc_u_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- __m256 x, y, z;
- lv_32fc_t* c = cVector;
- const lv_32fc_t* a = aVector;
- const lv_32fc_t* b = bVector;
-
- for(; number < quarterPoints; number++){
- x = _mm256_loadu_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
- y = _mm256_loadu_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
- z = _mm256_complexconjugatemul_ps(x, y);
- _mm256_storeu_ps((float*) c, z); // Store the results back into the C container
-
- a += 4;
- b += 4;
- c += 4;
- }
-
- number = quarterPoints * 4;
-
- for(; number < num_points; number++){
- *c++ = (*a++) * lv_conj(*b++);
- }
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 x, y, z;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ for (; number < quarterPoints; number++) {
+ x = _mm256_loadu_ps(
+ (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+ y = _mm256_loadu_ps(
+ (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+ z = _mm256_complexconjugatemul_ps(x, y);
+ _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+ number = quarterPoints * 4;
+
+ for (; number < num_points; number++) {
+ *c++ = (*a++) * lv_conj(*b++);
+ }
}
#endif /* LV_HAVE_AVX */
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>
-static inline void
-volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int halfPoints = num_points / 2;
-
- __m128 x, y, z;
- lv_32fc_t* c = cVector;
- const lv_32fc_t* a = aVector;
- const lv_32fc_t* b = bVector;
-
- for(; number < halfPoints; number++){
- x = _mm_loadu_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi
- y = _mm_loadu_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di
- z = _mm_complexconjugatemul_ps(x, y);
- _mm_storeu_ps((float*) c, z); // Store the results back into the C container
-
- a += 2;
- b += 2;
- c += 2;
- }
-
- if((num_points % 2) != 0){
- *c = (*a) * lv_conj(*b);
- }
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ __m128 x, y, z;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ for (; number < halfPoints; number++) {
+ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+ z = _mm_complexconjugatemul_ps(x, y);
+ _mm_storeu_ps((float*)c, z); // Store the results back into the C container
+
+ a += 2;
+ b += 2;
+ c += 2;
+ }
+
+ if ((num_points % 2) != 0) {
+ *c = (*a) * lv_conj(*b);
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32fc_x2_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_multiply_conjugate_32fc_generic(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
{
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
- const lv_32fc_t* bPtr= bVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
- }
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
-
#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H */
#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
+#include <float.h>
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>
-#include <float.h>
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>
-static inline void
-volk_32fc_x2_multiply_conjugate_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_multiply_conjugate_32fc_a_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- __m256 x, y, z;
- lv_32fc_t* c = cVector;
- const lv_32fc_t* a = aVector;
- const lv_32fc_t* b = bVector;
-
- for(; number < quarterPoints; number++){
- x = _mm256_load_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
- y = _mm256_load_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
- z = _mm256_complexconjugatemul_ps(x, y);
- _mm256_store_ps((float*) c, z); // Store the results back into the C container
-
- a += 4;
- b += 4;
- c += 4;
- }
-
- number = quarterPoints * 4;
-
- for(; number < num_points; number++){
- *c++ = (*a++) * lv_conj(*b++);
- }
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 x, y, z;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ for (; number < quarterPoints; number++) {
+ x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+ y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+ z = _mm256_complexconjugatemul_ps(x, y);
+ _mm256_store_ps((float*)c, z); // Store the results back into the C container
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+ number = quarterPoints * 4;
+
+ for (; number < num_points; number++) {
+ *c++ = (*a++) * lv_conj(*b++);
+ }
}
#endif /* LV_HAVE_AVX */
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>
-static inline void
-volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int halfPoints = num_points / 2;
-
- __m128 x, y, z;
- lv_32fc_t* c = cVector;
- const lv_32fc_t* a = aVector;
- const lv_32fc_t* b = bVector;
-
- for(; number < halfPoints; number++){
- x = _mm_load_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi
- y = _mm_load_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di
- z = _mm_complexconjugatemul_ps(x, y);
- _mm_store_ps((float*) c, z); // Store the results back into the C container
-
- a += 2;
- b += 2;
- c += 2;
- }
-
- if((num_points % 2) != 0){
- *c = (*a) * lv_conj(*b);
- }
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ __m128 x, y, z;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ for (; number < halfPoints; number++) {
+ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+ z = _mm_complexconjugatemul_ps(x, y);
+ _mm_store_ps((float*)c, z); // Store the results back into the C container
+
+ a += 2;
+ b += 2;
+ c += 2;
+ }
+
+ if ((num_points % 2) != 0) {
+ *c = (*a) * lv_conj(*b);
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const lv_32fc_t* bVector, unsigned int num_points)
+static inline void volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
{
- lv_32fc_t *a_ptr = (lv_32fc_t*) aVector;
- lv_32fc_t *b_ptr = (lv_32fc_t*) bVector;
- unsigned int quarter_points = num_points / 4;
- float32x4x2_t a_val, b_val, c_val;
- float32x4x2_t tmp_real, tmp_imag;
- unsigned int number = 0;
-
- for(number = 0; number < quarter_points; ++number) {
- a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
- b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- b_val.val[1] = vnegq_f32(b_val.val[1]);
- __VOLK_PREFETCH(a_ptr+4);
- __VOLK_PREFETCH(b_ptr+4);
-
- // multiply the real*real and imag*imag to get real result
- // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
- tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
- // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
- tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
-
- // Multiply cross terms to get the imaginary result
+ lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
+ lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
+ unsigned int quarter_points = num_points / 4;
+ float32x4x2_t a_val, b_val, c_val;
+ float32x4x2_t tmp_real, tmp_imag;
+ unsigned int number = 0;
+
+ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ b_val.val[1] = vnegq_f32(b_val.val[1]);
+ __VOLK_PREFETCH(a_ptr + 4);
+ __VOLK_PREFETCH(b_ptr + 4);
+
+ // multiply the real*real and imag*imag to get real result
+ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
+ tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+ // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
+ tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
+
+ // Multiply cross terms to get the imaginary result
// a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
- tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
- // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
- tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
-
- // store the results
- c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
- c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
- vst2q_f32((float*)cVector, c_val);
-
- a_ptr += 4;
- b_ptr += 4;
- cVector += 4;
+ tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
+ // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
+ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+
+ // store the results
+ c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
+ c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
+ vst2q_f32((float*)cVector, c_val);
+
+ a_ptr += 4;
+ b_ptr += 4;
+ cVector += 4;
}
- for(number = quarter_points*4; number < num_points; number++){
- *cVector++ = (*a_ptr++) * conj(*b_ptr++);
- }
+ for (number = quarter_points * 4; number < num_points; number++) {
+        *cVector++ = (*a_ptr++) * lv_conj(*b_ptr++);
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_GENERIC
static inline void
-volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
- const lv_32fc_t* bVector, unsigned int num_points)
+volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
{
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
- const lv_32fc_t* bPtr= bVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
- }
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32fc_x2_s32f_square_dist_scalar_mult_32f(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points)
- * \endcode
+ * void volk_32fc_x2_s32f_square_dist_scalar_mult_32f(float* target, lv_32fc_t* src0,
+ *                                                     lv_32fc_t* points, float scalar,
+ *                                                     unsigned int num_points);
+ * \endcode
*
* \b Inputs
* \li src0: The complex input. Only the first point is used.
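 *
 * \b Example
 * A minimal sketch of computing SNR-scaled squared distances from one symbol to a
 * small QPSK-like constellation (added for illustration; the constellation values and
 * the lv_cmake() helper are assumptions, the signature follows the prototype above):
 * \code
 *   lv_32fc_t constellation[4] = { lv_cmake(-1.0f, -1.0f), lv_cmake(-1.0f, 1.0f),
 *                                  lv_cmake(1.0f, -1.0f),  lv_cmake(1.0f, 1.0f) };
 *   lv_32fc_t symbol = lv_cmake(0.5f, -0.5f);
 *   float distances[4];
 *   float snr_lin = 10.0f;
 *   // illustrative call: distances[i] = snr_lin * |symbol - constellation[i]|^2
 *   volk_32fc_x2_s32f_square_dist_scalar_mult_32f(
 *       distances, &symbol, constellation, snr_lin, 4);
 * \endcode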
#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
-#include<volk/volk_complex.h>
+#include <volk/volk_complex.h>
-static inline void
-calculate_scaled_distances(float* target, const lv_32fc_t symbol, const lv_32fc_t* points,
- const float scalar, const unsigned int num_points)
+static inline void calculate_scaled_distances(float* target,
+ const lv_32fc_t symbol,
+ const lv_32fc_t* points,
+ const float scalar,
+ const unsigned int num_points)
{
- lv_32fc_t diff;
- for(unsigned int i = 0; i < num_points; ++i) {
- /*
- * Calculate: |y - x|^2 * SNR_lin
- * Compare C++: *target++ = scalar * std::norm(symbol - *constellation++);
- */
- diff = symbol - *points++;
- *target++ = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
- }
+ lv_32fc_t diff;
+ for (unsigned int i = 0; i < num_points; ++i) {
+ /*
+ * Calculate: |y - x|^2 * SNR_lin
+ * Compare C++: *target++ = scalar * std::norm(symbol - *constellation++);
+ */
+ diff = symbol - *points++;
+ *target++ =
+ scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
+ }
}
#ifdef LV_HAVE_AVX2
-#include<immintrin.h>
-#include<volk/volk_avx2_intrinsics.h>
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
static inline void
-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target, lv_32fc_t* src0,
- lv_32fc_t* points, float scalar,
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ float scalar,
unsigned int num_points)
{
- const unsigned int num_bytes = num_points*8;
- __m128 xmm9, xmm10;
- __m256 xmm4, xmm6;
- __m256 xmm_points0, xmm_points1, xmm_result;
+ const unsigned int num_bytes = num_points * 8;
+ __m128 xmm9, xmm10;
+ __m256 xmm4, xmm6;
+ __m256 xmm_points0, xmm_points1, xmm_result;
- const unsigned int bound = num_bytes >> 6;
-
- // load complex value into all parts of the register.
- const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
- const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);
-
- // Load scalar into all 8 parts of the register
- const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
- const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);
+ const unsigned int bound = num_bytes >> 6;
- // Set permutation constant
- const __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
-
- for(unsigned int i = 0; i < bound; ++i) {
- xmm_points0 = _mm256_load_ps((float*)points);
- xmm_points1 = _mm256_load_ps((float*)(points + 4));
- points += 8;
- __VOLK_PREFETCH(points);
+ // load complex value into all parts of the register.
+ const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
+ const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);
- xmm_result = _mm256_scaled_norm_dist_ps_avx2(xmm_symbol, xmm_symbol,
- xmm_points0, xmm_points1,
- xmm_scalar);
-
- _mm256_store_ps(target, xmm_result);
- target += 8;
- }
+ // Load scalar into all 8 parts of the register
+ const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
+ const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);
- if (num_bytes >> 5 & 1) {
- xmm_points0 = _mm256_load_ps((float*)points);
+ // Set permutation constant
+ const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
- xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
+ for (unsigned int i = 0; i < bound; ++i) {
+ xmm_points0 = _mm256_load_ps((float*)points);
+ xmm_points1 = _mm256_load_ps((float*)(points + 4));
+ points += 8;
+ __VOLK_PREFETCH(points);
- points += 4;
+ xmm_result = _mm256_scaled_norm_dist_ps_avx2(
+ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
- xmm6 = _mm256_mul_ps(xmm4, xmm4);
+ _mm256_store_ps(target, xmm_result);
+ target += 8;
+ }
- xmm4 = _mm256_hadd_ps(xmm6, xmm6);
- xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
+ if (num_bytes >> 5 & 1) {
+ xmm_points0 = _mm256_load_ps((float*)points);
- xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);
+ xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
- xmm9 = _mm256_extractf128_ps(xmm_result, 1);
- _mm_store_ps(target,xmm9);
- target += 4;
- }
+ points += 4;
- if (num_bytes >> 4 & 1) {
- xmm9 = _mm_load_ps((float*)points);
+ xmm6 = _mm256_mul_ps(xmm4, xmm4);
- xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);
+ xmm4 = _mm256_hadd_ps(xmm6, xmm6);
+ xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
- points += 2;
+ xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);
- xmm9 = _mm_mul_ps(xmm10, xmm10);
+ xmm9 = _mm256_extractf128_ps(xmm_result, 1);
+ _mm_store_ps(target, xmm9);
+ target += 4;
+ }
- xmm10 = _mm_hadd_ps(xmm9, xmm9);
+ if (num_bytes >> 4 & 1) {
+ xmm9 = _mm_load_ps((float*)points);
- xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);
+ xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);
- _mm_storeh_pi((__m64*)target, xmm10);
- target += 2;
- }
+ points += 2;
- calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
+ xmm9 = _mm_mul_ps(xmm10, xmm10);
+
+ xmm10 = _mm_hadd_ps(xmm9, xmm9);
+
+ xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);
+
+ _mm_storeh_pi((__m64*)target, xmm10);
+ target += 2;
+ }
+
+ calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
}
#endif /*LV_HAVE_AVX2*/
#include <volk/volk_avx_intrinsics.h>
static inline void
-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(float *target, lv_32fc_t *src0,
- lv_32fc_t *points, float scalar,
- unsigned int num_points) {
- const int eightsPoints = num_points / 8;
- const int remainder = num_points - 8 * eightsPoints;
-
- __m256 xmm_points0, xmm_points1, xmm_result;
-
- // load complex value into all parts of the register.
- const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
-
- // Load scalar into all 8 parts of the register
- const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
-
- for(int i = 0; i < eightsPoints; ++i){
- xmm_points0 = _mm256_load_ps((float*)points);
- xmm_points1 = _mm256_load_ps((float*)(points + 4));
- points += 8;
-
- xmm_result = _mm256_scaled_norm_dist_ps(xmm_symbol, xmm_symbol, xmm_points0,
- xmm_points1, xmm_scalar);
-
- _mm256_store_ps(target, xmm_result);
- target += 8;
- }
-
- const lv_32fc_t symbol = *src0;
- calculate_scaled_distances(target, symbol, points, scalar, remainder);
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ float scalar,
+ unsigned int num_points)
+{
+ const int eightsPoints = num_points / 8;
+ const int remainder = num_points - 8 * eightsPoints;
+
+ __m256 xmm_points0, xmm_points1, xmm_result;
+
+ // load complex value into all parts of the register.
+ const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
+
+ // Load scalar into all 8 parts of the register
+ const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
+
+ for (int i = 0; i < eightsPoints; ++i) {
+ xmm_points0 = _mm256_load_ps((float*)points);
+ xmm_points1 = _mm256_load_ps((float*)(points + 4));
+ points += 8;
+
+ xmm_result = _mm256_scaled_norm_dist_ps(
+ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
+
+ _mm256_store_ps(target, xmm_result);
+ target += 8;
+ }
+
+ const lv_32fc_t symbol = *src0;
+ calculate_scaled_distances(target, symbol, points, scalar, remainder);
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
-#include<pmmintrin.h>
-#include<volk/volk_sse3_intrinsics.h>
+#include <pmmintrin.h>
+#include <volk/volk_sse3_intrinsics.h>
static inline void
-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, lv_32fc_t* src0,
- lv_32fc_t* points, float scalar,
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ float scalar,
unsigned int num_points)
{
- __m128 xmm_points0, xmm_points1, xmm_result;
-
- /*
- * First do 4 values in every loop iteration.
- * There may be up to 3 values left.
- * leftovers0 indicates if at least 2 more are available for SSE execution.
- * leftovers1 indicates if there is a single element left.
- */
- const int quarterPoints = num_points / 4;
- const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
- const int leftovers1 = num_points % 2;
-
- // load complex value into both parts of the register.
- const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
-
- // Load scalar into all 4 parts of the register
- const __m128 xmm_scalar = _mm_load1_ps(&scalar);
-
- for(int i = 0; i < quarterPoints; ++i) {
- xmm_points0 = _mm_load_ps((float*)points);
- xmm_points1 = _mm_load_ps((float*)(points + 2));
- points += 4;
- __VOLK_PREFETCH(points);
- // calculate distances
- xmm_result = _mm_scaled_norm_dist_ps_sse3(xmm_symbol, xmm_symbol, xmm_points0,
- xmm_points1, xmm_scalar);
-
- _mm_store_ps(target, xmm_result);
- target += 4;
- }
-
- for(int i = 0; i < leftovers0; ++i) {
- xmm_points0 = _mm_load_ps((float*)points);
- points += 2;
-
- xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
- xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
- xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
- xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);
-
- _mm_storeh_pi((__m64*)target, xmm_result);
- target += 2;
- }
-
- calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
+ __m128 xmm_points0, xmm_points1, xmm_result;
+
+ /*
+ * First do 4 values in every loop iteration.
+ * There may be up to 3 values left.
+ * leftovers0 indicates if at least 2 more are available for SSE execution.
+ * leftovers1 indicates if there is a single element left.
+ */
+ const int quarterPoints = num_points / 4;
+ const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
+ const int leftovers1 = num_points % 2;
+
+ // load complex value into both parts of the register.
+ const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
+
+ // Load scalar into all 4 parts of the register
+ const __m128 xmm_scalar = _mm_load1_ps(&scalar);
+
+ for (int i = 0; i < quarterPoints; ++i) {
+ xmm_points0 = _mm_load_ps((float*)points);
+ xmm_points1 = _mm_load_ps((float*)(points + 2));
+ points += 4;
+ __VOLK_PREFETCH(points);
+ // calculate distances
+ xmm_result = _mm_scaled_norm_dist_ps_sse3(
+ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
+
+ _mm_store_ps(target, xmm_result);
+ target += 4;
+ }
+
+ for (int i = 0; i < leftovers0; ++i) {
+ xmm_points0 = _mm_load_ps((float*)points);
+ points += 2;
+
+ xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
+ xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
+ xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
+ xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);
+
+ _mm_storeh_pi((__m64*)target, xmm_result);
+ target += 2;
+ }
+
+ calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
}
#endif /*LV_HAVE_SSE3*/
#ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
#include <volk/volk_sse_intrinsics.h>
+#include <xmmintrin.h>
static inline void
-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse(float* target, lv_32fc_t* src0,
- lv_32fc_t* points, float scalar,
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ float scalar,
unsigned int num_points)
{
- const __m128 xmm_scalar = _mm_set1_ps(scalar);
- const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
-
- for (unsigned i = 0; i < num_points / 4; ++i) {
- __m128 xmm_points0 = _mm_load_ps((float *) points);
- __m128 xmm_points1 = _mm_load_ps((float *) (points + 2));
- points += 4;
- __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(xmm_symbol, xmm_symbol,
- xmm_points0, xmm_points1,
- xmm_scalar);
- _mm_store_ps((float *) target, xmm_result);
- target += 4;
- }
-
- calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
+ const __m128 xmm_scalar = _mm_set1_ps(scalar);
+ const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
+
+ for (unsigned i = 0; i < num_points / 4; ++i) {
+ __m128 xmm_points0 = _mm_load_ps((float*)points);
+ __m128 xmm_points1 = _mm_load_ps((float*)(points + 2));
+ points += 4;
+ __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(
+ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
+ _mm_store_ps((float*)target, xmm_result);
+ target += 4;
+ }
+
+ calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
}
#endif // LV_HAVE_SSE
#ifdef LV_HAVE_GENERIC
static inline void
-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target, lv_32fc_t* src0,
- lv_32fc_t* points, float scalar,
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ float scalar,
unsigned int num_points)
{
- const lv_32fc_t symbol = *src0;
- calculate_scaled_distances(target, symbol, points, scalar, num_points);
+ const lv_32fc_t symbol = *src0;
+ calculate_scaled_distances(target, symbol, points, scalar, num_points);
}
#endif /*LV_HAVE_GENERIC*/
#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
-#include<volk/volk_complex.h>
+#include <volk/volk_complex.h>
#ifdef LV_HAVE_AVX2
-#include<immintrin.h>
+#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>
static inline void
-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target, lv_32fc_t* src0,
- lv_32fc_t* points, float scalar,
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ float scalar,
unsigned int num_points)
{
- const unsigned int num_bytes = num_points*8;
- __m128 xmm9, xmm10;
- __m256 xmm4, xmm6;
- __m256 xmm_points0, xmm_points1, xmm_result;
+ const unsigned int num_bytes = num_points * 8;
+ __m128 xmm9, xmm10;
+ __m256 xmm4, xmm6;
+ __m256 xmm_points0, xmm_points1, xmm_result;
+
+ const unsigned int bound = num_bytes >> 6;
+
+ // load complex value into all parts of the register.
+ const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
+ const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);
+
+ // Load scalar into all 8 parts of the register
+ const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
+ const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);
- const unsigned int bound = num_bytes >> 6;
-
- // load complex value into all parts of the register.
- const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
- const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);
-
- // Load scalar into all 8 parts of the register
- const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
- const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);
+ // Set permutation constant
+ const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
- // Set permutation constant
- const __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
-
- for(unsigned int i = 0; i < bound; ++i) {
- xmm_points0 = _mm256_loadu_ps((float*)points);
- xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
- points += 8;
- __VOLK_PREFETCH(points);
+ for (unsigned int i = 0; i < bound; ++i) {
+ xmm_points0 = _mm256_loadu_ps((float*)points);
+ xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
+ points += 8;
+ __VOLK_PREFETCH(points);
- xmm_result = _mm256_scaled_norm_dist_ps_avx2(xmm_symbol, xmm_symbol,
- xmm_points0, xmm_points1,
- xmm_scalar);
-
- _mm256_storeu_ps(target, xmm_result);
- target += 8;
- }
+ xmm_result = _mm256_scaled_norm_dist_ps_avx2(
+ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
- if (num_bytes >> 5 & 1) {
- xmm_points0 = _mm256_loadu_ps((float*)points);
+ _mm256_storeu_ps(target, xmm_result);
+ target += 8;
+ }
- xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
+ if (num_bytes >> 5 & 1) {
+ xmm_points0 = _mm256_loadu_ps((float*)points);
- points += 4;
+ xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
- xmm6 = _mm256_mul_ps(xmm4, xmm4);
+ points += 4;
- xmm4 = _mm256_hadd_ps(xmm6, xmm6);
- xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
+ xmm6 = _mm256_mul_ps(xmm4, xmm4);
- xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);
+ xmm4 = _mm256_hadd_ps(xmm6, xmm6);
+ xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
- xmm9 = _mm256_extractf128_ps(xmm_result, 1);
- _mm_storeu_ps(target,xmm9);
- target += 4;
- }
+ xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);
- if (num_bytes >> 4 & 1) {
- xmm9 = _mm_loadu_ps((float*)points);
+ xmm9 = _mm256_extractf128_ps(xmm_result, 1);
+ _mm_storeu_ps(target, xmm9);
+ target += 4;
+ }
- xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);
+ if (num_bytes >> 4 & 1) {
+ xmm9 = _mm_loadu_ps((float*)points);
- points += 2;
+ xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);
- xmm9 = _mm_mul_ps(xmm10, xmm10);
+ points += 2;
- xmm10 = _mm_hadd_ps(xmm9, xmm9);
+ xmm9 = _mm_mul_ps(xmm10, xmm10);
- xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);
+ xmm10 = _mm_hadd_ps(xmm9, xmm9);
- _mm_storeh_pi((__m64*)target, xmm10);
- target += 2;
- }
+ xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);
- calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
+ _mm_storeh_pi((__m64*)target, xmm10);
+ target += 2;
+ }
+
+ calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
}
#endif /*LV_HAVE_AVX2*/
#include <volk/volk_avx_intrinsics.h>
static inline void
-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx(float *target, lv_32fc_t *src0,
- lv_32fc_t *points, float scalar,
- unsigned int num_points) {
- const int eightsPoints = num_points / 8;
- const int remainder = num_points - 8 * eightsPoints;
-
- __m256 xmm_points0, xmm_points1, xmm_result;
-
- // load complex value into all parts of the register.
- const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
-
- // Load scalar into all 8 parts of the register
- const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
-
- for(int i = 0; i < eightsPoints; ++i){
- xmm_points0 = _mm256_loadu_ps((float*)points);
- xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
- points += 8;
-
- xmm_result = _mm256_scaled_norm_dist_ps(xmm_symbol, xmm_symbol, xmm_points0,
- xmm_points1, xmm_scalar);
-
- _mm256_storeu_ps(target, xmm_result);
- target += 8;
- }
-
- const lv_32fc_t symbol = *src0;
- calculate_scaled_distances(target, symbol, points, scalar, remainder);
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ float scalar,
+ unsigned int num_points)
+{
+ const int eightsPoints = num_points / 8;
+ const int remainder = num_points - 8 * eightsPoints;
+
+ __m256 xmm_points0, xmm_points1, xmm_result;
+
+ // load complex value into all parts of the register.
+ const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
+
+ // Load scalar into all 8 parts of the register
+ const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
+
+ for (int i = 0; i < eightsPoints; ++i) {
+ xmm_points0 = _mm256_loadu_ps((float*)points);
+ xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
+ points += 8;
+
+ xmm_result = _mm256_scaled_norm_dist_ps(
+ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
+
+ _mm256_storeu_ps(target, xmm_result);
+ target += 8;
+ }
+
+ const lv_32fc_t symbol = *src0;
+ calculate_scaled_distances(target, symbol, points, scalar, remainder);
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
-#include<pmmintrin.h>
-#include<volk/volk_sse3_intrinsics.h>
+#include <pmmintrin.h>
+#include <volk/volk_sse3_intrinsics.h>
static inline void
-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse3(float* target, lv_32fc_t* src0,
- lv_32fc_t* points, float scalar,
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse3(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ float scalar,
unsigned int num_points)
{
- __m128 xmm_points0, xmm_points1, xmm_result;
-
- /*
- * First do 4 values in every loop iteration.
- * There may be up to 3 values left.
- * leftovers0 indicates if at least 2 more are available for SSE execution.
- * leftovers1 indicates if there is a single element left.
- */
- const int quarterPoints = num_points / 4;
- const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
- const int leftovers1 = num_points % 2;
-
- // load complex value into both parts of the register.
- const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
-
- // Load scalar into all 4 parts of the register
- const __m128 xmm_scalar = _mm_load1_ps(&scalar);
-
- for(int i = 0; i < quarterPoints; ++i) {
- xmm_points0 = _mm_loadu_ps((float*)points);
- xmm_points1 = _mm_loadu_ps((float*)(points + 2));
- points += 4;
- __VOLK_PREFETCH(points);
- // calculate distances
- xmm_result = _mm_scaled_norm_dist_ps_sse3(xmm_symbol, xmm_symbol, xmm_points0,
- xmm_points1, xmm_scalar);
-
- _mm_storeu_ps(target, xmm_result);
- target += 4;
- }
-
- for(int i = 0; i < leftovers0; ++i) {
- xmm_points0 = _mm_loadu_ps((float*)points);
- points += 2;
-
- xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
- xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
- xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
- xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);
-
- _mm_storeh_pi((__m64*)target, xmm_result);
- target += 2;
- }
-
- calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
+ __m128 xmm_points0, xmm_points1, xmm_result;
+
+ /*
+ * First do 4 values in every loop iteration.
+ * There may be up to 3 values left.
+ * leftovers0 indicates if at least 2 more are available for SSE execution.
+ * leftovers1 indicates if there is a single element left.
+ */
+ const int quarterPoints = num_points / 4;
+ const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
+ const int leftovers1 = num_points % 2;
+
+ // load complex value into both parts of the register.
+ const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
+
+ // Load scalar into all 4 parts of the register
+ const __m128 xmm_scalar = _mm_load1_ps(&scalar);
+
+ for (int i = 0; i < quarterPoints; ++i) {
+ xmm_points0 = _mm_loadu_ps((float*)points);
+ xmm_points1 = _mm_loadu_ps((float*)(points + 2));
+ points += 4;
+ __VOLK_PREFETCH(points);
+ // calculate distances
+ xmm_result = _mm_scaled_norm_dist_ps_sse3(
+ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
+
+ _mm_storeu_ps(target, xmm_result);
+ target += 4;
+ }
+
+ for (int i = 0; i < leftovers0; ++i) {
+ xmm_points0 = _mm_loadu_ps((float*)points);
+ points += 2;
+
+ xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
+ xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
+ xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
+ xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);
+
+ _mm_storeh_pi((__m64*)target, xmm_result);
+ target += 2;
+ }
+
+ calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
}
#endif /*LV_HAVE_SSE3*/
#ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
#include <volk/volk_sse_intrinsics.h>
+#include <xmmintrin.h>
static inline void
-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(float* target, lv_32fc_t* src0,
- lv_32fc_t* points, float scalar,
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ float scalar,
unsigned int num_points)
{
- const __m128 xmm_scalar = _mm_set1_ps(scalar);
- const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
-
- for (unsigned i = 0; i < num_points / 4; ++i) {
- __m128 xmm_points0 = _mm_loadu_ps((float *) points);
- __m128 xmm_points1 = _mm_loadu_ps((float *) (points + 2));
- points += 4;
- __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(xmm_symbol, xmm_symbol,
- xmm_points0, xmm_points1,
- xmm_scalar);
- _mm_storeu_ps((float *) target, xmm_result);
- target += 4;
- }
-
- calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
+ const __m128 xmm_scalar = _mm_set1_ps(scalar);
+ const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
+
+ for (unsigned i = 0; i < num_points / 4; ++i) {
+ __m128 xmm_points0 = _mm_loadu_ps((float*)points);
+ __m128 xmm_points1 = _mm_loadu_ps((float*)(points + 2));
+ points += 4;
+ __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(
+ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
+ _mm_storeu_ps((float*)target, xmm_result);
+ target += 4;
+ }
+
+ calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
}
#endif // LV_HAVE_SSE
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points);
- * \endcode
+ * void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc(lv_32fc_t* cVector,
+ *                                                      const lv_32fc_t* aVector,
+ *                                                      const lv_32fc_t* bVector,
+ *                                                      const lv_32fc_t scalar,
+ *                                                      unsigned int num_points);
+ * \endcode
*
* \b Inputs
* \li aVector: The input vector to be added.
 * \li bVector: The input vector to be conjugated and multiplied.
* \li scalar: The complex scalar to multiply against conjugated bVector.
- * \li num_points: The number of complex values in aVector and bVector to be conjugate, multiplied and stored into cVector.
+ * \li num_points: The number of complex values in aVector and bVector to be conjugated,
+ * multiplied and stored into cVector.
*
* \b Outputs
* \li cVector: The vector where the results will be stored.
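 *
 * \b Example
 * A minimal sketch of c[i] = a[i] + scalar * conj(b[i]) (added for illustration; the
 * buffer setup via volk_malloc()/lv_cmake() is an assumption, the dispatcher signature
 * follows the prototype above):
 * \code
 *   unsigned int N = 4;
 *   size_t alignment = volk_get_alignment();
 *   lv_32fc_t* a = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
 *   lv_32fc_t* b = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
 *   lv_32fc_t* c = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
 *   // illustrative setup with assumed helpers
 *   for (unsigned int i = 0; i < N; ++i) {
 *       a[i] = lv_cmake(1.0f, 0.0f);
 *       b[i] = lv_cmake(0.0f, (float)i);
 *   }
 *   lv_32fc_t scalar = lv_cmake(0.0f, 1.0f); // multiply conj(b) by j before adding
 *   volk_32fc_x2_s32fc_multiply_conjugate_add_32fc(c, a, b, scalar, N);
 *   volk_free(a);
 *   volk_free(b);
 *   volk_free(c);
 * \endcode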
#ifndef INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H
#define INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H
+#include <float.h>
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>
-#include <float.h>
#ifdef LV_HAVE_GENERIC
-static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points){
+static inline void
+volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ const lv_32fc_t scalar,
+ unsigned int num_points)
+{
const lv_32fc_t* aPtr = aVector;
const lv_32fc_t* bPtr = bVector;
lv_32fc_t* cPtr = cVector;
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>
-static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points) {
+static inline void
+volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ const lv_32fc_t scalar,
+ unsigned int num_points)
+{
unsigned int number = 0;
unsigned int i = 0;
const unsigned int quarterPoints = num_points / 4;
unsigned int isodd = num_points & 3;
__m256 x, y, s, z;
- lv_32fc_t v_scalar[4] = {scalar, scalar, scalar, scalar};
+ lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar };
const lv_32fc_t* a = aVector;
const lv_32fc_t* b = bVector;
// Set up constant scalar vector
s = _mm256_loadu_ps((float*)v_scalar);
- for(;number < quarterPoints; number++) {
+ for (; number < quarterPoints; number++) {
x = _mm256_loadu_ps((float*)b);
y = _mm256_loadu_ps((float*)a);
z = _mm256_complexconjugatemul_ps(s, x);
z = _mm256_add_ps(y, z);
- _mm256_storeu_ps((float*)c,z);
+ _mm256_storeu_ps((float*)c, z);
a += 4;
b += 4;
c += 4;
}
- for(i = num_points-isodd; i < num_points; i++) {
+ for (i = num_points - isodd; i < num_points; i++) {
*c++ = (*a++) + lv_conj(*b++) * scalar;
}
}
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>
-static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points) {
+static inline void
+volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ const lv_32fc_t scalar,
+ unsigned int num_points)
+{
unsigned int number = 0;
const unsigned int halfPoints = num_points / 2;
__m128 x, y, s, z;
- lv_32fc_t v_scalar[2] = {scalar, scalar};
+ lv_32fc_t v_scalar[2] = { scalar, scalar };
const lv_32fc_t* a = aVector;
const lv_32fc_t* b = bVector;
// Set up constant scalar vector
s = _mm_loadu_ps((float*)v_scalar);
- for(;number < halfPoints; number++){
+ for (; number < halfPoints; number++) {
x = _mm_loadu_ps((float*)b);
y = _mm_loadu_ps((float*)a);
z = _mm_complexconjugatemul_ps(s, x);
z = _mm_add_ps(y, z);
- _mm_storeu_ps((float*)c,z);
+ _mm_storeu_ps((float*)c, z);
a += 2;
b += 2;
c += 2;
}
- if((num_points % 2) != 0) {
+ if ((num_points % 2) != 0) {
*c = *a + lv_conj(*b) * scalar;
}
}
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>
-static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points) {
+static inline void
+volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ const lv_32fc_t scalar,
+ unsigned int num_points)
+{
unsigned int number = 0;
unsigned int i = 0;
const unsigned int quarterPoints = num_points / 4;
unsigned int isodd = num_points & 3;
__m256 x, y, s, z;
- lv_32fc_t v_scalar[4] = {scalar, scalar, scalar, scalar};
+ lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar };
const lv_32fc_t* a = aVector;
const lv_32fc_t* b = bVector;
// Set up constant scalar vector
s = _mm256_load_ps((float*)v_scalar);
- for(;number < quarterPoints; number++) {
+ for (; number < quarterPoints; number++) {
x = _mm256_load_ps((float*)b);
y = _mm256_load_ps((float*)a);
z = _mm256_complexconjugatemul_ps(s, x);
z = _mm256_add_ps(y, z);
- _mm256_store_ps((float*)c,z);
+ _mm256_store_ps((float*)c, z);
a += 4;
b += 4;
c += 4;
}
- for(i = num_points-isodd; i < num_points; i++) {
+ for (i = num_points - isodd; i < num_points; i++) {
*c++ = (*a++) + lv_conj(*b++) * scalar;
}
}
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>
-static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points) {
+static inline void
+volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ const lv_32fc_t scalar,
+ unsigned int num_points)
+{
unsigned int number = 0;
const unsigned int halfPoints = num_points / 2;
__m128 x, y, s, z;
- lv_32fc_t v_scalar[2] = {scalar, scalar};
+ lv_32fc_t v_scalar[2] = { scalar, scalar };
const lv_32fc_t* a = aVector;
const lv_32fc_t* b = bVector;
// Set up constant scalar vector
s = _mm_load_ps((float*)v_scalar);
- for(;number < halfPoints; number++){
+ for (; number < halfPoints; number++) {
x = _mm_load_ps((float*)b);
y = _mm_load_ps((float*)a);
z = _mm_complexconjugatemul_ps(s, x);
z = _mm_add_ps(y, z);
- _mm_store_ps((float*)c,z);
+ _mm_store_ps((float*)c, z);
a += 2;
b += 2;
c += 2;
}
- if((num_points % 2) != 0) {
+ if ((num_points % 2) != 0) {
*c = *a + lv_conj(*b) * scalar;
}
}
#ifdef LV_HAVE_NEON
-#include <arm_neon.h>
-
-static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points){
+#include <arm_neon.h>
+
+static inline void
+volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ const lv_32fc_t scalar,
+ unsigned int num_points)
+{
const lv_32fc_t* bPtr = bVector;
const lv_32fc_t* aPtr = aVector;
lv_32fc_t* cPtr = cVector;
scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar);
scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1);
- for(number = 0; number < quarter_points; ++number) {
+ for (number = 0; number < quarter_points; ++number) {
a_val = vld2q_f32((float*)aPtr);
b_val = vld2q_f32((float*)bPtr);
b_val.val[1] = vnegq_f32(b_val.val[1]);
cPtr += 4;
}
- for(number = quarter_points*4; number < num_points; number++){
+ for (number = quarter_points * 4; number < num_points; number++) {
*cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
}
}
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32fc_x2_square_dist_32f(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) {
- * \endcode
+ * void volk_32fc_x2_square_dist_32f(float* target, lv_32fc_t* src0, lv_32fc_t* points,
+ *                                    unsigned int num_points);
+ * \endcode
*
* \b Inputs
* \li src0: The complex input. Only the first point is used.
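 *
 * \b Example
 * A minimal sketch of the unscaled squared-distance kernel (added for illustration;
 * the constellation values and lv_cmake() are assumptions, the signature follows the
 * prototype above):
 * \code
 *   lv_32fc_t constellation[4] = { lv_cmake(-1.0f, -1.0f), lv_cmake(-1.0f, 1.0f),
 *                                  lv_cmake(1.0f, -1.0f),  lv_cmake(1.0f, 1.0f) };
 *   lv_32fc_t symbol = lv_cmake(0.5f, -0.5f);
 *   float distances[4];
 *   // illustrative call: distances[i] = |symbol - constellation[i]|^2
 *   volk_32fc_x2_square_dist_32f(distances, &symbol, constellation, 4);
 * \endcode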
#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H
#define INCLUDED_volk_32fc_x2_square_dist_32f_a_H
-#include<inttypes.h>
-#include<stdio.h>
-#include<volk/volk_complex.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
#ifdef LV_HAVE_AVX2
-#include<immintrin.h>
+#include <immintrin.h>
-static inline void
-volk_32fc_x2_square_dist_32f_a_avx2(float* target, lv_32fc_t* src0, lv_32fc_t* points,
- unsigned int num_points)
+static inline void volk_32fc_x2_square_dist_32f_a_avx2(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ unsigned int num_points)
{
- const unsigned int num_bytes = num_points*8;
- __m128 xmm0, xmm9, xmm10;
- __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-
- lv_32fc_t diff;
- float sq_dist;
- int bound = num_bytes >> 6;
- int leftovers0 = (num_bytes >> 5) & 1;
- int leftovers1 = (num_bytes >> 4) & 1;
- int leftovers2 = (num_bytes >> 3) & 1;
- int i = 0;
-
- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
- xmm1 = _mm256_setzero_ps();
- xmm2 = _mm256_load_ps((float*)&points[0]);
- xmm0 = _mm_load_ps((float*)src0);
- xmm0 = _mm_permute_ps(xmm0, 0b01000100);
- xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
- xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
- xmm3 = _mm256_load_ps((float*)&points[4]);
-
- for(; i < bound; ++i) {
- xmm4 = _mm256_sub_ps(xmm1, xmm2);
- xmm5 = _mm256_sub_ps(xmm1, xmm3);
- points += 8;
- xmm6 = _mm256_mul_ps(xmm4, xmm4);
- xmm7 = _mm256_mul_ps(xmm5, xmm5);
-
+ const unsigned int num_bytes = num_points * 8;
+ __m128 xmm0, xmm9, xmm10;
+ __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+
+ lv_32fc_t diff;
+ float sq_dist;
+ int bound = num_bytes >> 6;
+ int leftovers0 = (num_bytes >> 5) & 1;
+ int leftovers1 = (num_bytes >> 4) & 1;
+ int leftovers2 = (num_bytes >> 3) & 1;
+ int i = 0;
+
+ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+ xmm1 = _mm256_setzero_ps();
xmm2 = _mm256_load_ps((float*)&points[0]);
+ xmm0 = _mm_load_ps((float*)src0);
+ xmm0 = _mm_permute_ps(xmm0, 0b01000100);
+ xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
+ xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
+ xmm3 = _mm256_load_ps((float*)&points[4]);
- xmm4 = _mm256_hadd_ps(xmm6, xmm7);
- xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
+ for (; i < bound; ++i) {
+ xmm4 = _mm256_sub_ps(xmm1, xmm2);
+ xmm5 = _mm256_sub_ps(xmm1, xmm3);
+ points += 8;
+ xmm6 = _mm256_mul_ps(xmm4, xmm4);
+ xmm7 = _mm256_mul_ps(xmm5, xmm5);
- xmm3 = _mm256_load_ps((float*)&points[4]);
+ xmm2 = _mm256_load_ps((float*)&points[0]);
- _mm256_store_ps(target, xmm4);
+ xmm4 = _mm256_hadd_ps(xmm6, xmm7);
+ xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
- target += 8;
- }
+ xmm3 = _mm256_load_ps((float*)&points[4]);
- for(i = 0; i < leftovers0; ++i) {
+ _mm256_store_ps(target, xmm4);
- xmm2 = _mm256_load_ps((float*)&points[0]);
+ target += 8;
+ }
- xmm4 = _mm256_sub_ps(xmm1, xmm2);
+ for (i = 0; i < leftovers0; ++i) {
- points += 4;
+ xmm2 = _mm256_load_ps((float*)&points[0]);
- xmm6 = _mm256_mul_ps(xmm4, xmm4);
+ xmm4 = _mm256_sub_ps(xmm1, xmm2);
- xmm4 = _mm256_hadd_ps(xmm6, xmm6);
- xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
+ points += 4;
- xmm9 = _mm256_extractf128_ps(xmm4, 1);
- _mm_store_ps(target,xmm9);
+ xmm6 = _mm256_mul_ps(xmm4, xmm4);
- target += 4;
- }
+ xmm4 = _mm256_hadd_ps(xmm6, xmm6);
+ xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
+
+ xmm9 = _mm256_extractf128_ps(xmm4, 1);
+ _mm_store_ps(target, xmm9);
- for(i = 0; i < leftovers1; ++i) {
- xmm9 = _mm_load_ps((float*)&points[0]);
+ target += 4;
+ }
- xmm10 = _mm_sub_ps(xmm0, xmm9);
+ for (i = 0; i < leftovers1; ++i) {
+ xmm9 = _mm_load_ps((float*)&points[0]);
- points += 2;
+ xmm10 = _mm_sub_ps(xmm0, xmm9);
- xmm9 = _mm_mul_ps(xmm10, xmm10);
+ points += 2;
- xmm10 = _mm_hadd_ps(xmm9, xmm9);
+ xmm9 = _mm_mul_ps(xmm10, xmm10);
- _mm_storeh_pi((__m64*)target, xmm10);
+ xmm10 = _mm_hadd_ps(xmm9, xmm9);
- target += 2;
- }
+ _mm_storeh_pi((__m64*)target, xmm10);
- for(i = 0; i < leftovers2; ++i) {
+ target += 2;
+ }
- diff = src0[0] - points[0];
+ for (i = 0; i < leftovers2; ++i) {
- sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+ diff = src0[0] - points[0];
- target[0] = sq_dist;
- }
+ sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+
+ target[0] = sq_dist;
+ }
}
#endif /*LV_HAVE_AVX2*/
#ifdef LV_HAVE_SSE3
-#include<xmmintrin.h>
-#include<pmmintrin.h>
+#include <pmmintrin.h>
+#include <xmmintrin.h>
-static inline void
-volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points,
- unsigned int num_points)
+static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ unsigned int num_points)
{
- const unsigned int num_bytes = num_points*8;
+ const unsigned int num_bytes = num_points * 8;
- __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+ __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
- lv_32fc_t diff;
- float sq_dist;
- int bound = num_bytes >> 5;
- int i = 0;
+ lv_32fc_t diff;
+ float sq_dist;
+ int bound = num_bytes >> 5;
+ int i = 0;
- xmm1 = _mm_setzero_ps();
- xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
- xmm2 = _mm_load_ps((float*)&points[0]);
- xmm1 = _mm_movelh_ps(xmm1, xmm1);
- xmm3 = _mm_load_ps((float*)&points[2]);
+ xmm1 = _mm_setzero_ps();
+ xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
+ xmm2 = _mm_load_ps((float*)&points[0]);
+ xmm1 = _mm_movelh_ps(xmm1, xmm1);
+ xmm3 = _mm_load_ps((float*)&points[2]);
+
+ for (; i < bound - 1; ++i) {
+ xmm4 = _mm_sub_ps(xmm1, xmm2);
+ xmm5 = _mm_sub_ps(xmm1, xmm3);
+ points += 4;
+ xmm6 = _mm_mul_ps(xmm4, xmm4);
+ xmm7 = _mm_mul_ps(xmm5, xmm5);
+
+ xmm2 = _mm_load_ps((float*)&points[0]);
+
+ xmm4 = _mm_hadd_ps(xmm6, xmm7);
+
+ xmm3 = _mm_load_ps((float*)&points[2]);
+
+ _mm_store_ps(target, xmm4);
+
+ target += 4;
+ }
- for(; i < bound - 1; ++i) {
xmm4 = _mm_sub_ps(xmm1, xmm2);
xmm5 = _mm_sub_ps(xmm1, xmm3);
+
points += 4;
xmm6 = _mm_mul_ps(xmm4, xmm4);
xmm7 = _mm_mul_ps(xmm5, xmm5);
- xmm2 = _mm_load_ps((float*)&points[0]);
-
xmm4 = _mm_hadd_ps(xmm6, xmm7);
- xmm3 = _mm_load_ps((float*)&points[2]);
-
_mm_store_ps(target, xmm4);
target += 4;
- }
-
- xmm4 = _mm_sub_ps(xmm1, xmm2);
- xmm5 = _mm_sub_ps(xmm1, xmm3);
-
- points += 4;
- xmm6 = _mm_mul_ps(xmm4, xmm4);
- xmm7 = _mm_mul_ps(xmm5, xmm5);
- xmm4 = _mm_hadd_ps(xmm6, xmm7);
+ if (num_bytes >> 4 & 1) {
- _mm_store_ps(target, xmm4);
+ xmm2 = _mm_load_ps((float*)&points[0]);
- target += 4;
+ xmm4 = _mm_sub_ps(xmm1, xmm2);
- if (num_bytes >> 4 & 1) {
+ points += 2;
- xmm2 = _mm_load_ps((float*)&points[0]);
-
- xmm4 = _mm_sub_ps(xmm1, xmm2);
+ xmm6 = _mm_mul_ps(xmm4, xmm4);
- points += 2;
-
- xmm6 = _mm_mul_ps(xmm4, xmm4);
+ xmm4 = _mm_hadd_ps(xmm6, xmm6);
- xmm4 = _mm_hadd_ps(xmm6, xmm6);
+ _mm_storeh_pi((__m64*)target, xmm4);
- _mm_storeh_pi((__m64*)target, xmm4);
+ target += 2;
+ }
- target += 2;
- }
+ if (num_bytes >> 3 & 1) {
- if (num_bytes >> 3 & 1) {
+ diff = src0[0] - points[0];
- diff = src0[0] - points[0];
+ sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
- sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
-
- target[0] = sq_dist;
- }
+ target[0] = sq_dist;
+ }
}
#endif /*LV_HAVE_SSE3*/
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_32fc_x2_square_dist_32f_neon(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points)
+static inline void volk_32fc_x2_square_dist_32f_neon(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ unsigned int num_points)
{
- const unsigned int quarter_points = num_points / 4;
- unsigned int number;
-
- float32x4x2_t a_vec, b_vec;
- float32x4x2_t diff_vec;
- float32x4_t tmp, tmp1, dist_sq;
- a_vec.val[0] = vdupq_n_f32( lv_creal(src0[0]) );
- a_vec.val[1] = vdupq_n_f32( lv_cimag(src0[0]) );
- for(number=0; number < quarter_points; ++number) {
- b_vec = vld2q_f32((float*)points);
- diff_vec.val[0] = vsubq_f32(a_vec.val[0], b_vec.val[0]);
- diff_vec.val[1] = vsubq_f32(a_vec.val[1], b_vec.val[1]);
- tmp = vmulq_f32(diff_vec.val[0], diff_vec.val[0]);
- tmp1 = vmulq_f32(diff_vec.val[1], diff_vec.val[1]);
-
- dist_sq = vaddq_f32(tmp, tmp1);
- vst1q_f32(target, dist_sq);
- points += 4;
- target += 4;
- }
- for(number=quarter_points*4; number < num_points; ++number) {
- lv_32fc_t diff = src0[0] - *points++;
- *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
- }
+ const unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+
+ float32x4x2_t a_vec, b_vec;
+ float32x4x2_t diff_vec;
+ float32x4_t tmp, tmp1, dist_sq;
+ a_vec.val[0] = vdupq_n_f32(lv_creal(src0[0]));
+ a_vec.val[1] = vdupq_n_f32(lv_cimag(src0[0]));
+ for (number = 0; number < quarter_points; ++number) {
+ b_vec = vld2q_f32((float*)points);
+ diff_vec.val[0] = vsubq_f32(a_vec.val[0], b_vec.val[0]);
+ diff_vec.val[1] = vsubq_f32(a_vec.val[1], b_vec.val[1]);
+ tmp = vmulq_f32(diff_vec.val[0], diff_vec.val[0]);
+ tmp1 = vmulq_f32(diff_vec.val[1], diff_vec.val[1]);
+
+ dist_sq = vaddq_f32(tmp, tmp1);
+ vst1q_f32(target, dist_sq);
+ points += 4;
+ target += 4;
+ }
+ for (number = quarter_points * 4; number < num_points; ++number) {
+ lv_32fc_t diff = src0[0] - *points++;
+ *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32fc_x2_square_dist_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points,
- unsigned int num_points)
+static inline void volk_32fc_x2_square_dist_32f_generic(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ unsigned int num_points)
{
- const unsigned int num_bytes = num_points*8;
+ const unsigned int num_bytes = num_points * 8;
- lv_32fc_t diff;
- float sq_dist;
- unsigned int i = 0;
+ lv_32fc_t diff;
+ float sq_dist;
+ unsigned int i = 0;
- for(; i < num_bytes >> 3; ++i) {
- diff = src0[0] - points[i];
+    for (; i < num_bytes >> 3; ++i) {
+ diff = src0[0] - points[i];
- sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+ sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
- target[i] = sq_dist;
- }
+ target[i] = sq_dist;
+ }
}
#endif /*LV_HAVE_GENERIC*/
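For orientation, a minimal sketch of what this kernel computes, written against the generic variant above. The wrapper function name and the sample values are purely illustrative; lv_cmake comes from volk_complex.h, which these headers already include.

void square_dist_example(void)
{
    // target[i] receives |src0 - points[i]|^2 for each point.
    lv_32fc_t src0 = lv_cmake(1.0f, 1.0f);
    lv_32fc_t points[4] = { lv_cmake(1.0f, 1.0f),
                            lv_cmake(-1.0f, 1.0f),
                            lv_cmake(-1.0f, -1.0f),
                            lv_cmake(1.0f, -1.0f) };
    float target[4];
    volk_32fc_x2_square_dist_32f_generic(target, &src0, points, 4);
    // target now holds { 0.0f, 4.0f, 8.0f, 4.0f }.
}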
#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_u_H
#define INCLUDED_volk_32fc_x2_square_dist_32f_u_H
-#include<inttypes.h>
-#include<stdio.h>
-#include<volk/volk_complex.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
#ifdef LV_HAVE_AVX2
-#include<immintrin.h>
+#include <immintrin.h>
-static inline void
-volk_32fc_x2_square_dist_32f_u_avx2(float* target, lv_32fc_t* src0, lv_32fc_t* points,
- unsigned int num_points)
+static inline void volk_32fc_x2_square_dist_32f_u_avx2(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ unsigned int num_points)
{
- const unsigned int num_bytes = num_points*8;
- __m128 xmm0, xmm9;
- __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-
- lv_32fc_t diff;
- float sq_dist;
- int bound = num_bytes >> 6;
- int leftovers1 = (num_bytes >> 3) & 0b11;
- int i = 0;
-
- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
- xmm1 = _mm256_setzero_ps();
- xmm0 = _mm_loadu_ps((float*)src0);
- xmm0 = _mm_permute_ps(xmm0, 0b01000100);
- xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
- xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
-
- for(; i < bound; ++i) {
+ const unsigned int num_bytes = num_points * 8;
+ __m128 xmm0, xmm9;
+ __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+
+ lv_32fc_t diff;
+ float sq_dist;
+ int bound = num_bytes >> 6;
+ int leftovers1 = (num_bytes >> 3) & 0b11;
+ int i = 0;
+
+ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+ xmm1 = _mm256_setzero_ps();
xmm2 = _mm256_loadu_ps((float*)&points[0]);
+ xmm0 = _mm_loadu_ps((float*)src0);
+ xmm0 = _mm_permute_ps(xmm0, 0b01000100);
+ xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
+ xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
xmm3 = _mm256_loadu_ps((float*)&points[4]);
- xmm4 = _mm256_sub_ps(xmm1, xmm2);
- xmm5 = _mm256_sub_ps(xmm1, xmm3);
- points += 8;
- xmm6 = _mm256_mul_ps(xmm4, xmm4);
- xmm7 = _mm256_mul_ps(xmm5, xmm5);
- xmm4 = _mm256_hadd_ps(xmm6, xmm7);
- xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
+ for (; i < bound; ++i) {
+ xmm4 = _mm256_sub_ps(xmm1, xmm2);
+ xmm5 = _mm256_sub_ps(xmm1, xmm3);
+ points += 8;
+ xmm6 = _mm256_mul_ps(xmm4, xmm4);
+ xmm7 = _mm256_mul_ps(xmm5, xmm5);
- _mm256_storeu_ps(target, xmm4);
+ xmm2 = _mm256_loadu_ps((float*)&points[0]);
- target += 8;
- }
+ xmm4 = _mm256_hadd_ps(xmm6, xmm7);
+ xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
- if (num_bytes >> 5 & 1) {
+ xmm3 = _mm256_loadu_ps((float*)&points[4]);
- xmm2 = _mm256_loadu_ps((float*)&points[0]);
+ _mm256_storeu_ps(target, xmm4);
- xmm4 = _mm256_sub_ps(xmm1, xmm2);
+ target += 8;
+ }
- points += 4;
+ if (num_bytes >> 5 & 1) {
- xmm6 = _mm256_mul_ps(xmm4, xmm4);
+ xmm2 = _mm256_loadu_ps((float*)&points[0]);
- xmm4 = _mm256_hadd_ps(xmm6, xmm6);
- xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
+ xmm4 = _mm256_sub_ps(xmm1, xmm2);
- xmm9 = _mm256_extractf128_ps(xmm4, 1);
- _mm_storeu_ps(target,xmm9);
+ points += 4;
- target += 4;
- }
+ xmm6 = _mm256_mul_ps(xmm4, xmm4);
+
+ xmm4 = _mm256_hadd_ps(xmm6, xmm6);
+ xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
+
+ xmm9 = _mm256_extractf128_ps(xmm4, 1);
+ _mm_storeu_ps(target, xmm9);
+
+ target += 4;
+ }
- for(i = 0; i < leftovers1; ++i) {
+ for (i = 0; i < leftovers1; ++i) {
- diff = src0[0] - points[0];
- points += 1;
+ diff = src0[0] - points[0];
+ points += 1;
- sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+ sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
- target[0] = sq_dist;
- target += 1;
- }
+ target[0] = sq_dist;
+ target += 1;
+ }
}
#endif /*LV_HAVE_AVX2*/
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32i_s32f_convert_32f(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points)
- * \endcode
+ * void volk_32i_s32f_convert_32f(float* outputVector, const int32_t* inputVector,
+ *                                const float scalar, unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li inputVector: The vector of 32-bit integers.
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
-static inline void
-volk_32i_s32f_convert_32f_u_avx512f(float* outputVector, const int32_t* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32i_s32f_convert_32f_u_avx512f(float* outputVector,
+ const int32_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int onesixteenthPoints = num_points / 16;
+ unsigned int number = 0;
+ const unsigned int onesixteenthPoints = num_points / 16;
- float* outputVectorPtr = outputVector;
- const float iScalar = 1.0 / scalar;
- __m512 invScalar = _mm512_set1_ps(iScalar);
- int32_t* inputPtr = (int32_t*)inputVector;
- __m512i inputVal;
- __m512 ret;
+ float* outputVectorPtr = outputVector;
+ const float iScalar = 1.0 / scalar;
+ __m512 invScalar = _mm512_set1_ps(iScalar);
+ int32_t* inputPtr = (int32_t*)inputVector;
+ __m512i inputVal;
+ __m512 ret;
- for(;number < onesixteenthPoints; number++){
- // Load the values
- inputVal = _mm512_loadu_si512((__m512i*)inputPtr);
+ for (; number < onesixteenthPoints; number++) {
+ // Load the values
+ inputVal = _mm512_loadu_si512((__m512i*)inputPtr);
- ret = _mm512_cvtepi32_ps(inputVal);
- ret = _mm512_mul_ps(ret, invScalar);
+ ret = _mm512_cvtepi32_ps(inputVal);
+ ret = _mm512_mul_ps(ret, invScalar);
- _mm512_storeu_ps(outputVectorPtr, ret);
+ _mm512_storeu_ps(outputVectorPtr, ret);
- outputVectorPtr += 16;
- inputPtr += 16;
- }
+ outputVectorPtr += 16;
+ inputPtr += 16;
+ }
- number = onesixteenthPoints * 16;
- for(; number < num_points; number++){
- outputVector[number] =((float)(inputVector[number])) * iScalar;
- }
+ number = onesixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ outputVector[number] = ((float)(inputVector[number])) * iScalar;
+ }
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_32i_s32f_convert_32f_u_avx2(float* outputVector, const int32_t* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32i_s32f_convert_32f_u_avx2(float* outputVector,
+ const int32_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int oneEightPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int oneEightPoints = num_points / 8;
- float* outputVectorPtr = outputVector;
- const float iScalar = 1.0 / scalar;
- __m256 invScalar = _mm256_set1_ps(iScalar);
- int32_t* inputPtr = (int32_t*)inputVector;
- __m256i inputVal;
- __m256 ret;
+ float* outputVectorPtr = outputVector;
+ const float iScalar = 1.0 / scalar;
+ __m256 invScalar = _mm256_set1_ps(iScalar);
+ int32_t* inputPtr = (int32_t*)inputVector;
+ __m256i inputVal;
+ __m256 ret;
- for(;number < oneEightPoints; number++){
- // Load the 4 values
- inputVal = _mm256_loadu_si256((__m256i*)inputPtr);
+ for (; number < oneEightPoints; number++) {
+        // Load the 8 values
+ inputVal = _mm256_loadu_si256((__m256i*)inputPtr);
- ret = _mm256_cvtepi32_ps(inputVal);
- ret = _mm256_mul_ps(ret, invScalar);
+ ret = _mm256_cvtepi32_ps(inputVal);
+ ret = _mm256_mul_ps(ret, invScalar);
- _mm256_storeu_ps(outputVectorPtr, ret);
+ _mm256_storeu_ps(outputVectorPtr, ret);
- outputVectorPtr += 8;
- inputPtr += 8;
- }
+ outputVectorPtr += 8;
+ inputPtr += 8;
+ }
- number = oneEightPoints * 8;
- for(; number < num_points; number++){
- outputVector[number] =((float)(inputVector[number])) * iScalar;
- }
+ number = oneEightPoints * 8;
+ for (; number < num_points; number++) {
+ outputVector[number] = ((float)(inputVector[number])) * iScalar;
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void
-volk_32i_s32f_convert_32f_u_sse2(float* outputVector, const int32_t* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32i_s32f_convert_32f_u_sse2(float* outputVector,
+ const int32_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- float* outputVectorPtr = outputVector;
- const float iScalar = 1.0 / scalar;
- __m128 invScalar = _mm_set_ps1(iScalar);
- int32_t* inputPtr = (int32_t*)inputVector;
- __m128i inputVal;
- __m128 ret;
+ float* outputVectorPtr = outputVector;
+ const float iScalar = 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ int32_t* inputPtr = (int32_t*)inputVector;
+ __m128i inputVal;
+ __m128 ret;
- for(;number < quarterPoints; number++){
- // Load the 4 values
- inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+ for (; number < quarterPoints; number++) {
+ // Load the 4 values
+ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
- ret = _mm_cvtepi32_ps(inputVal);
- ret = _mm_mul_ps(ret, invScalar);
+ ret = _mm_cvtepi32_ps(inputVal);
+ ret = _mm_mul_ps(ret, invScalar);
- _mm_storeu_ps(outputVectorPtr, ret);
+ _mm_storeu_ps(outputVectorPtr, ret);
- outputVectorPtr += 4;
- inputPtr += 4;
- }
+ outputVectorPtr += 4;
+ inputPtr += 4;
+ }
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- outputVector[number] =((float)(inputVector[number])) * iScalar;
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ outputVector[number] = ((float)(inputVector[number])) * iScalar;
+ }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32i_s32f_convert_32f_generic(float* outputVector, const int32_t* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32i_s32f_convert_32f_generic(float* outputVector,
+ const int32_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- float* outputVectorPtr = outputVector;
- const int32_t* inputVectorPtr = inputVector;
- unsigned int number = 0;
- const float iScalar = 1.0 / scalar;
-
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
- }
+ float* outputVectorPtr = outputVector;
+ const int32_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ const float iScalar = 1.0 / scalar;
+
+ for (number = 0; number < num_points; number++) {
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+ }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_32i_s32f_convert_32f_u_H */
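As a rough usage sketch of the converter documented above (illustrative values; the wrapper name is hypothetical), scaling by 32768 maps full-scale 16-bit-style integers onto roughly [-1.0, 1.0]:

void convert_example(void)
{
    int32_t in[4] = { 32768, -32768, 16384, 0 };
    float out[4];
    // Each output is inputVector[i] * (1.0 / scalar).
    volk_32i_s32f_convert_32f_generic(out, in, 32768.f, 4);
    // out now holds { 1.0f, -1.0f, 0.5f, 0.0f }.
}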
-
#ifndef INCLUDED_volk_32i_s32f_convert_32f_a_H
#define INCLUDED_volk_32i_s32f_convert_32f_a_H
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
-static inline void
-volk_32i_s32f_convert_32f_a_avx512f(float* outputVector, const int32_t* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32i_s32f_convert_32f_a_avx512f(float* outputVector,
+ const int32_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int onesixteenthPoints = num_points / 16;
+ unsigned int number = 0;
+ const unsigned int onesixteenthPoints = num_points / 16;
- float* outputVectorPtr = outputVector;
- const float iScalar = 1.0 / scalar;
- __m512 invScalar = _mm512_set1_ps(iScalar);
- int32_t* inputPtr = (int32_t*)inputVector;
- __m512i inputVal;
- __m512 ret;
+ float* outputVectorPtr = outputVector;
+ const float iScalar = 1.0 / scalar;
+ __m512 invScalar = _mm512_set1_ps(iScalar);
+ int32_t* inputPtr = (int32_t*)inputVector;
+ __m512i inputVal;
+ __m512 ret;
- for(;number < onesixteenthPoints; number++){
- // Load the values
- inputVal = _mm512_load_si512((__m512i*)inputPtr);
+ for (; number < onesixteenthPoints; number++) {
+ // Load the values
+ inputVal = _mm512_load_si512((__m512i*)inputPtr);
- ret = _mm512_cvtepi32_ps(inputVal);
- ret = _mm512_mul_ps(ret, invScalar);
+ ret = _mm512_cvtepi32_ps(inputVal);
+ ret = _mm512_mul_ps(ret, invScalar);
- _mm512_store_ps(outputVectorPtr, ret);
+ _mm512_store_ps(outputVectorPtr, ret);
- outputVectorPtr += 16;
- inputPtr += 16;
- }
+ outputVectorPtr += 16;
+ inputPtr += 16;
+ }
- number = onesixteenthPoints * 16;
- for(; number < num_points; number++){
- outputVector[number] =((float)(inputVector[number])) * iScalar;
- }
+ number = onesixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ outputVector[number] = ((float)(inputVector[number])) * iScalar;
+ }
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_32i_s32f_convert_32f_a_avx2(float* outputVector, const int32_t* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32i_s32f_convert_32f_a_avx2(float* outputVector,
+ const int32_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int oneEightPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int oneEightPoints = num_points / 8;
- float* outputVectorPtr = outputVector;
- const float iScalar = 1.0 / scalar;
- __m256 invScalar = _mm256_set1_ps(iScalar);
- int32_t* inputPtr = (int32_t*)inputVector;
- __m256i inputVal;
- __m256 ret;
+ float* outputVectorPtr = outputVector;
+ const float iScalar = 1.0 / scalar;
+ __m256 invScalar = _mm256_set1_ps(iScalar);
+ int32_t* inputPtr = (int32_t*)inputVector;
+ __m256i inputVal;
+ __m256 ret;
- for(;number < oneEightPoints; number++){
- // Load the 4 values
- inputVal = _mm256_load_si256((__m256i*)inputPtr);
+ for (; number < oneEightPoints; number++) {
+        // Load the 8 values
+ inputVal = _mm256_load_si256((__m256i*)inputPtr);
- ret = _mm256_cvtepi32_ps(inputVal);
- ret = _mm256_mul_ps(ret, invScalar);
+ ret = _mm256_cvtepi32_ps(inputVal);
+ ret = _mm256_mul_ps(ret, invScalar);
- _mm256_store_ps(outputVectorPtr, ret);
+ _mm256_store_ps(outputVectorPtr, ret);
- outputVectorPtr += 8;
- inputPtr += 8;
- }
+ outputVectorPtr += 8;
+ inputPtr += 8;
+ }
- number = oneEightPoints * 8;
- for(; number < num_points; number++){
- outputVector[number] =((float)(inputVector[number])) * iScalar;
- }
+ number = oneEightPoints * 8;
+ for (; number < num_points; number++) {
+ outputVector[number] = ((float)(inputVector[number])) * iScalar;
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void
-volk_32i_s32f_convert_32f_a_sse2(float* outputVector, const int32_t* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32i_s32f_convert_32f_a_sse2(float* outputVector,
+ const int32_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- float* outputVectorPtr = outputVector;
- const float iScalar = 1.0 / scalar;
- __m128 invScalar = _mm_set_ps1(iScalar);
- int32_t* inputPtr = (int32_t*)inputVector;
- __m128i inputVal;
- __m128 ret;
+ float* outputVectorPtr = outputVector;
+ const float iScalar = 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ int32_t* inputPtr = (int32_t*)inputVector;
+ __m128i inputVal;
+ __m128 ret;
- for(;number < quarterPoints; number++){
- // Load the 4 values
- inputVal = _mm_load_si128((__m128i*)inputPtr);
+ for (; number < quarterPoints; number++) {
+ // Load the 4 values
+ inputVal = _mm_load_si128((__m128i*)inputPtr);
- ret = _mm_cvtepi32_ps(inputVal);
- ret = _mm_mul_ps(ret, invScalar);
+ ret = _mm_cvtepi32_ps(inputVal);
+ ret = _mm_mul_ps(ret, invScalar);
- _mm_store_ps(outputVectorPtr, ret);
+ _mm_store_ps(outputVectorPtr, ret);
- outputVectorPtr += 4;
- inputPtr += 4;
- }
+ outputVectorPtr += 4;
+ inputPtr += 4;
+ }
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- outputVector[number] =((float)(inputVector[number])) * iScalar;
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ outputVector[number] = ((float)(inputVector[number])) * iScalar;
+ }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32i_s32f_convert_32f_a_generic(float* outputVector, const int32_t* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_32i_s32f_convert_32f_a_generic(float* outputVector,
+ const int32_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- float* outputVectorPtr = outputVector;
- const int32_t* inputVectorPtr = inputVector;
- unsigned int number = 0;
- const float iScalar = 1.0 / scalar;
-
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
- }
+ float* outputVectorPtr = outputVector;
+ const int32_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ const float iScalar = 1.0 / scalar;
+
+ for (number = 0; number < num_points; number++) {
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+ }
}
#endif /* LV_HAVE_GENERIC */
-
-
#endif /* INCLUDED_volk_32i_s32f_convert_32f_a_H */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32i_x2_and_32i(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points)
- * \endcode
+ * void volk_32i_x2_and_32i(int32_t* cVector, const int32_t* aVector,
+ *                          const int32_t* bVector, unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li aVector: Input vector of samples.
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
-static inline void
-volk_32i_x2_and_32i_a_avx512f(int32_t* cVector, const int32_t* aVector,
- const int32_t* bVector, unsigned int num_points)
+static inline void volk_32i_x2_and_32i_a_avx512f(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
- int32_t* cPtr = (int32_t*)cVector;
- const int32_t* aPtr = (int32_t*)aVector;
- const int32_t* bPtr = (int32_t*)bVector;
+ int32_t* cPtr = (int32_t*)cVector;
+ const int32_t* aPtr = (int32_t*)aVector;
+ const int32_t* bPtr = (int32_t*)bVector;
- __m512i aVal, bVal, cVal;
- for(;number < sixteenthPoints; number++){
+ __m512i aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
- aVal = _mm512_load_si512(aPtr);
- bVal = _mm512_load_si512(bPtr);
+ aVal = _mm512_load_si512(aPtr);
+ bVal = _mm512_load_si512(bPtr);
- cVal = _mm512_and_si512(aVal, bVal);
+ cVal = _mm512_and_si512(aVal, bVal);
- _mm512_store_si512(cPtr,cVal); // Store the results back into the C container
+ _mm512_store_si512(cPtr, cVal); // Store the results back into the C container
- aPtr += 16;
- bPtr += 16;
- cPtr += 16;
- }
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
- number = sixteenthPoints * 16;
- for(;number < num_points; number++){
- cVector[number] = aVector[number] & bVector[number];
- }
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ cVector[number] = aVector[number] & bVector[number];
+ }
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_32i_x2_and_32i_a_avx2(int32_t* cVector, const int32_t* aVector,
- const int32_t* bVector, unsigned int num_points)
+static inline void volk_32i_x2_and_32i_a_avx2(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int oneEightPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int oneEightPoints = num_points / 8;
- int32_t* cPtr = cVector;
- const int32_t* aPtr = aVector;
- const int32_t* bPtr = bVector;
+ int32_t* cPtr = cVector;
+ const int32_t* aPtr = aVector;
+ const int32_t* bPtr = bVector;
- __m256i aVal, bVal, cVal;
- for(;number < oneEightPoints; number++){
+ __m256i aVal, bVal, cVal;
+ for (; number < oneEightPoints; number++) {
- aVal = _mm256_load_si256((__m256i*)aPtr);
- bVal = _mm256_load_si256((__m256i*)bPtr);
+ aVal = _mm256_load_si256((__m256i*)aPtr);
+ bVal = _mm256_load_si256((__m256i*)bPtr);
- cVal = _mm256_and_si256(aVal, bVal);
+ cVal = _mm256_and_si256(aVal, bVal);
- _mm256_store_si256((__m256i*)cPtr,cVal); // Store the results back into the C container
+ _mm256_store_si256((__m256i*)cPtr,
+ cVal); // Store the results back into the C container
- aPtr += 8;
- bPtr += 8;
- cPtr += 8;
- }
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
- number = oneEightPoints * 8;
- for(;number < num_points; number++){
- cVector[number] = aVector[number] & bVector[number];
- }
+ number = oneEightPoints * 8;
+ for (; number < num_points; number++) {
+ cVector[number] = aVector[number] & bVector[number];
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_32i_x2_and_32i_a_sse(int32_t* cVector, const int32_t* aVector,
- const int32_t* bVector, unsigned int num_points)
+static inline void volk_32i_x2_and_32i_a_sse(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- float* cPtr = (float*)cVector;
- const float* aPtr = (float*)aVector;
- const float* bPtr = (float*)bVector;
+ float* cPtr = (float*)cVector;
+ const float* aPtr = (float*)aVector;
+ const float* bPtr = (float*)bVector;
- __m128 aVal, bVal, cVal;
- for(;number < quarterPoints; number++){
+ __m128 aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
- aVal = _mm_load_ps(aPtr);
- bVal = _mm_load_ps(bPtr);
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
- cVal = _mm_and_ps(aVal, bVal);
+ cVal = _mm_and_ps(aVal, bVal);
- _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- cVector[number] = aVector[number] & bVector[number];
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ cVector[number] = aVector[number] & bVector[number];
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_32i_x2_and_32i_neon(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points)
+static inline void volk_32i_x2_and_32i_neon(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
{
- int32_t* cPtr = cVector;
- const int32_t* aPtr = aVector;
- const int32_t* bPtr= bVector;
- unsigned int number = 0;
- unsigned int quarter_points = num_points / 4;
-
- int32x4_t a_val, b_val, c_val;
-
- for(number = 0; number < quarter_points; number++){
- a_val = vld1q_s32(aPtr);
- b_val = vld1q_s32(bPtr);
- c_val = vandq_s32(a_val, b_val);
- vst1q_s32(cPtr, c_val);
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
-
- for(number = quarter_points * 4; number < num_points; number++){
- *cPtr++ = (*aPtr++) & (*bPtr++);
- }
+ int32_t* cPtr = cVector;
+ const int32_t* aPtr = aVector;
+ const int32_t* bPtr = bVector;
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+
+ int32x4_t a_val, b_val, c_val;
+
+ for (number = 0; number < quarter_points; number++) {
+ a_val = vld1q_s32(aPtr);
+ b_val = vld1q_s32(bPtr);
+ c_val = vandq_s32(a_val, b_val);
+ vst1q_s32(cPtr, c_val);
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) & (*bPtr++);
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32i_x2_and_32i_generic(int32_t* cVector, const int32_t* aVector,
- const int32_t* bVector, unsigned int num_points)
+static inline void volk_32i_x2_and_32i_generic(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
{
- int32_t* cPtr = cVector;
- const int32_t* aPtr = aVector;
- const int32_t* bPtr= bVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *cPtr++ = (*aPtr++) & (*bPtr++);
- }
+ int32_t* cPtr = cVector;
+ const int32_t* aPtr = aVector;
+ const int32_t* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) & (*bPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
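A small sketch of the element-wise AND performed by the kernels above (the mask values and the wrapper name are illustrative only):

void and_example(void)
{
    int32_t a[4] = { 0x00FF00FF, 0x0F0F0F0F, -1, 0x12345678 };
    int32_t mask[4] = { 0x0000FFFF, 0x00FF00FF, 0x00000000, 0x0000FFFF };
    int32_t c[4];
    // c[i] = a[i] & mask[i]
    volk_32i_x2_and_32i_generic(c, a, mask, 4);
    // c now holds { 0x000000FF, 0x000F000F, 0x00000000, 0x00005678 }.
}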
#ifdef LV_HAVE_ORC
-extern void
-volk_32i_x2_and_32i_a_orc_impl(int32_t* cVector, const int32_t* aVector,
- const int32_t* bVector, unsigned int num_points);
-
-static inline void
-volk_32i_x2_and_32i_u_orc(int32_t* cVector, const int32_t* aVector,
- const int32_t* bVector, unsigned int num_points)
+extern void volk_32i_x2_and_32i_a_orc_impl(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points);
+
+static inline void volk_32i_x2_and_32i_u_orc(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
{
- volk_32i_x2_and_32i_a_orc_impl(cVector, aVector, bVector, num_points);
+ volk_32i_x2_and_32i_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
-static inline void
-volk_32i_x2_and_32i_u_avx512f(int32_t* cVector, const int32_t* aVector,
- const int32_t* bVector, unsigned int num_points)
+static inline void volk_32i_x2_and_32i_u_avx512f(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
- int32_t* cPtr = (int32_t*)cVector;
- const int32_t* aPtr = (int32_t*)aVector;
- const int32_t* bPtr = (int32_t*)bVector;
+ int32_t* cPtr = (int32_t*)cVector;
+ const int32_t* aPtr = (int32_t*)aVector;
+ const int32_t* bPtr = (int32_t*)bVector;
- __m512i aVal, bVal, cVal;
- for(;number < sixteenthPoints; number++){
+ __m512i aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
- aVal = _mm512_loadu_si512(aPtr);
- bVal = _mm512_loadu_si512(bPtr);
+ aVal = _mm512_loadu_si512(aPtr);
+ bVal = _mm512_loadu_si512(bPtr);
- cVal = _mm512_and_si512(aVal, bVal);
+ cVal = _mm512_and_si512(aVal, bVal);
- _mm512_storeu_si512(cPtr,cVal); // Store the results back into the C container
+ _mm512_storeu_si512(cPtr, cVal); // Store the results back into the C container
- aPtr += 16;
- bPtr += 16;
- cPtr += 16;
- }
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
- number = sixteenthPoints * 16;
- for(;number < num_points; number++){
- cVector[number] = aVector[number] & bVector[number];
- }
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ cVector[number] = aVector[number] & bVector[number];
+ }
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_32i_x2_and_32i_u_avx2(int32_t* cVector, const int32_t* aVector,
- const int32_t* bVector, unsigned int num_points)
+static inline void volk_32i_x2_and_32i_u_avx2(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int oneEightPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int oneEightPoints = num_points / 8;
- int32_t* cPtr = cVector;
- const int32_t* aPtr = aVector;
- const int32_t* bPtr = bVector;
+ int32_t* cPtr = cVector;
+ const int32_t* aPtr = aVector;
+ const int32_t* bPtr = bVector;
- __m256i aVal, bVal, cVal;
- for(;number < oneEightPoints; number++){
+ __m256i aVal, bVal, cVal;
+ for (; number < oneEightPoints; number++) {
- aVal = _mm256_loadu_si256((__m256i*)aPtr);
- bVal = _mm256_loadu_si256((__m256i*)bPtr);
+ aVal = _mm256_loadu_si256((__m256i*)aPtr);
+ bVal = _mm256_loadu_si256((__m256i*)bPtr);
- cVal = _mm256_and_si256(aVal, bVal);
+ cVal = _mm256_and_si256(aVal, bVal);
- _mm256_storeu_si256((__m256i*)cPtr,cVal); // Store the results back into the C container
+ _mm256_storeu_si256((__m256i*)cPtr,
+ cVal); // Store the results back into the C container
- aPtr += 8;
- bPtr += 8;
- cPtr += 8;
- }
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
- number = oneEightPoints * 8;
- for(;number < num_points; number++){
- cVector[number] = aVector[number] & bVector[number];
- }
+ number = oneEightPoints * 8;
+ for (; number < num_points; number++) {
+ cVector[number] = aVector[number] & bVector[number];
+ }
}
#endif /* LV_HAVE_AVX2 */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32i_x2_or_32i(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points)
- * \endcode
+ * void volk_32i_x2_or_32i(int32_t* cVector, const int32_t* aVector,
+ *                         const int32_t* bVector, unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li aVector: Input vector of samples.
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
-static inline void
-volk_32i_x2_or_32i_a_avx512f(int32_t* cVector, const int32_t* aVector,
- const int32_t* bVector, unsigned int num_points)
+static inline void volk_32i_x2_or_32i_a_avx512f(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
- int32_t* cPtr = (int32_t*)cVector;
- const int32_t* aPtr = (int32_t*)aVector;
- const int32_t* bPtr = (int32_t*)bVector;
+ int32_t* cPtr = (int32_t*)cVector;
+ const int32_t* aPtr = (int32_t*)aVector;
+ const int32_t* bPtr = (int32_t*)bVector;
- __m512i aVal, bVal, cVal;
- for(;number < sixteenthPoints; number++){
+ __m512i aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
- aVal = _mm512_load_si512(aPtr);
- bVal = _mm512_load_si512(bPtr);
+ aVal = _mm512_load_si512(aPtr);
+ bVal = _mm512_load_si512(bPtr);
- cVal = _mm512_or_si512(aVal, bVal);
+ cVal = _mm512_or_si512(aVal, bVal);
- _mm512_store_si512(cPtr,cVal); // Store the results back into the C container
+ _mm512_store_si512(cPtr, cVal); // Store the results back into the C container
- aPtr += 16;
- bPtr += 16;
- cPtr += 16;
- }
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
- number = sixteenthPoints * 16;
- for(;number < num_points; number++){
- cVector[number] = aVector[number] | bVector[number];
- }
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ cVector[number] = aVector[number] | bVector[number];
+ }
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_32i_x2_or_32i_a_avx2(int32_t* cVector, const int32_t* aVector,
- const int32_t* bVector, unsigned int num_points)
+static inline void volk_32i_x2_or_32i_a_avx2(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int oneEightPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int oneEightPoints = num_points / 8;
- int32_t* cPtr = cVector;
- const int32_t* aPtr = aVector;
- const int32_t* bPtr = bVector;
+ int32_t* cPtr = cVector;
+ const int32_t* aPtr = aVector;
+ const int32_t* bPtr = bVector;
- __m256i aVal, bVal, cVal;
- for(;number < oneEightPoints; number++){
+ __m256i aVal, bVal, cVal;
+ for (; number < oneEightPoints; number++) {
- aVal = _mm256_load_si256((__m256i*)aPtr);
- bVal = _mm256_load_si256((__m256i*)bPtr);
+ aVal = _mm256_load_si256((__m256i*)aPtr);
+ bVal = _mm256_load_si256((__m256i*)bPtr);
- cVal = _mm256_or_si256(aVal, bVal);
+ cVal = _mm256_or_si256(aVal, bVal);
- _mm256_store_si256((__m256i*)cPtr,cVal); // Store the results back into the C container
+ _mm256_store_si256((__m256i*)cPtr,
+ cVal); // Store the results back into the C container
- aPtr += 8;
- bPtr += 8;
- cPtr += 8;
- }
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
- number = oneEightPoints * 8;
- for(;number < num_points; number++){
- cVector[number] = aVector[number] | bVector[number];
- }
+ number = oneEightPoints * 8;
+ for (; number < num_points; number++) {
+ cVector[number] = aVector[number] | bVector[number];
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_32i_x2_or_32i_a_sse(int32_t* cVector, const int32_t* aVector,
- const int32_t* bVector, unsigned int num_points)
+static inline void volk_32i_x2_or_32i_a_sse(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- float* cPtr = (float*)cVector;
- const float* aPtr = (float*)aVector;
- const float* bPtr = (float*)bVector;
+ float* cPtr = (float*)cVector;
+ const float* aPtr = (float*)aVector;
+ const float* bPtr = (float*)bVector;
- __m128 aVal, bVal, cVal;
- for(;number < quarterPoints; number++){
- aVal = _mm_load_ps(aPtr);
- bVal = _mm_load_ps(bPtr);
+ __m128 aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
- cVal = _mm_or_ps(aVal, bVal);
+ cVal = _mm_or_ps(aVal, bVal);
- _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- cVector[number] = aVector[number] | bVector[number];
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ cVector[number] = aVector[number] | bVector[number];
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_32i_x2_or_32i_neon(int32_t* cVector, const int32_t* aVector,
- const int32_t* bVector, unsigned int num_points)
+static inline void volk_32i_x2_or_32i_neon(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
{
- int32_t* cPtr = cVector;
- const int32_t* aPtr = aVector;
- const int32_t* bPtr= bVector;
- unsigned int number = 0;
- unsigned int quarter_points = num_points / 4;
-
- int32x4_t a_val, b_val, c_val;
-
- for(number = 0; number < quarter_points; number++){
- a_val = vld1q_s32(aPtr);
- b_val = vld1q_s32(bPtr);
- c_val = vorrq_s32(a_val, b_val);
- vst1q_s32(cPtr, c_val);
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
-
- for(number = quarter_points * 4; number < num_points; number++){
- *cPtr++ = (*aPtr++) | (*bPtr++);
- }
+ int32_t* cPtr = cVector;
+ const int32_t* aPtr = aVector;
+ const int32_t* bPtr = bVector;
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+
+ int32x4_t a_val, b_val, c_val;
+
+ for (number = 0; number < quarter_points; number++) {
+ a_val = vld1q_s32(aPtr);
+ b_val = vld1q_s32(bPtr);
+ c_val = vorrq_s32(a_val, b_val);
+ vst1q_s32(cPtr, c_val);
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) | (*bPtr++);
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32i_x2_or_32i_generic(int32_t* cVector, const int32_t* aVector,
- const int32_t* bVector, unsigned int num_points)
+static inline void volk_32i_x2_or_32i_generic(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
{
- int32_t* cPtr = cVector;
- const int32_t* aPtr = aVector;
- const int32_t* bPtr= bVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *cPtr++ = (*aPtr++) | (*bPtr++);
- }
+ int32_t* cPtr = cVector;
+ const int32_t* aPtr = aVector;
+ const int32_t* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) | (*bPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
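The OR kernels follow the same pattern; a brief illustrative sketch (values and wrapper name are hypothetical):

void or_example(void)
{
    int32_t a[4] = { 0x000000FF, 0x0F000000, 0x00000000, 0x12340000 };
    int32_t b[4] = { 0x0000FF00, 0x000000F0, 0x00000000, 0x00005678 };
    int32_t c[4];
    // c[i] = a[i] | b[i]
    volk_32i_x2_or_32i_generic(c, a, b, 4);
    // c now holds { 0x0000FFFF, 0x0F0000F0, 0x00000000, 0x12345678 }.
}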
#ifdef LV_HAVE_ORC
-extern void
-volk_32i_x2_or_32i_a_orc_impl(int32_t* cVector, const int32_t* aVector,
- const int32_t* bVector, unsigned int num_points);
-
-static inline void
-volk_32i_x2_or_32i_u_orc(int32_t* cVector, const int32_t* aVector,
- const int32_t* bVector, unsigned int num_points)
+extern void volk_32i_x2_or_32i_a_orc_impl(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points);
+
+static inline void volk_32i_x2_or_32i_u_orc(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
{
- volk_32i_x2_or_32i_a_orc_impl(cVector, aVector, bVector, num_points);
+ volk_32i_x2_or_32i_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
-static inline void
-volk_32i_x2_or_32i_u_avx512f(int32_t* cVector, const int32_t* aVector,
- const int32_t* bVector, unsigned int num_points)
+static inline void volk_32i_x2_or_32i_u_avx512f(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
- int32_t* cPtr = (int32_t*)cVector;
- const int32_t* aPtr = (int32_t*)aVector;
- const int32_t* bPtr = (int32_t*)bVector;
+ int32_t* cPtr = (int32_t*)cVector;
+ const int32_t* aPtr = (int32_t*)aVector;
+ const int32_t* bPtr = (int32_t*)bVector;
- __m512i aVal, bVal, cVal;
- for(;number < sixteenthPoints; number++){
+ __m512i aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
- aVal = _mm512_loadu_si512(aPtr);
- bVal = _mm512_loadu_si512(bPtr);
+ aVal = _mm512_loadu_si512(aPtr);
+ bVal = _mm512_loadu_si512(bPtr);
- cVal = _mm512_or_si512(aVal, bVal);
+ cVal = _mm512_or_si512(aVal, bVal);
- _mm512_storeu_si512(cPtr,cVal); // Store the results back into the C container
+ _mm512_storeu_si512(cPtr, cVal); // Store the results back into the C container
- aPtr += 16;
- bPtr += 16;
- cPtr += 16;
- }
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
- number = sixteenthPoints * 16;
- for(;number < num_points; number++){
- cVector[number] = aVector[number] | bVector[number];
- }
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ cVector[number] = aVector[number] | bVector[number];
+ }
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_32i_x2_or_32i_u_avx2(int32_t* cVector, const int32_t* aVector,
- const int32_t* bVector, unsigned int num_points)
+static inline void volk_32i_x2_or_32i_u_avx2(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int oneEightPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int oneEightPoints = num_points / 8;
- int32_t* cPtr = cVector;
- const int32_t* aPtr = aVector;
- const int32_t* bPtr = bVector;
+ int32_t* cPtr = cVector;
+ const int32_t* aPtr = aVector;
+ const int32_t* bPtr = bVector;
- __m256i aVal, bVal, cVal;
- for(;number < oneEightPoints; number++){
+ __m256i aVal, bVal, cVal;
+ for (; number < oneEightPoints; number++) {
- aVal = _mm256_loadu_si256((__m256i*)aPtr);
- bVal = _mm256_loadu_si256((__m256i*)bPtr);
+ aVal = _mm256_loadu_si256((__m256i*)aPtr);
+ bVal = _mm256_loadu_si256((__m256i*)bPtr);
- cVal = _mm256_or_si256(aVal, bVal);
+ cVal = _mm256_or_si256(aVal, bVal);
- _mm256_storeu_si256((__m256i*)cPtr,cVal); // Store the results back into the C container
+ _mm256_storeu_si256((__m256i*)cPtr,
+ cVal); // Store the results back into the C container
- aPtr += 8;
- bPtr += 8;
- cPtr += 8;
- }
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
- number = oneEightPoints * 8;
- for(;number < num_points; number++){
- cVector[number] = aVector[number] | bVector[number];
- }
+ number = oneEightPoints * 8;
+ for (; number < num_points; number++) {
+ cVector[number] = aVector[number] | bVector[number];
+ }
}
#endif /* LV_HAVE_AVX2 */
#if LV_HAVE_AVX2
#include <immintrin.h>
-static inline void volk_32u_byteswap_u_avx2(uint32_t* intsToSwap, unsigned int num_points){
+static inline void volk_32u_byteswap_u_avx2(uint32_t* intsToSwap, unsigned int num_points)
+{
- unsigned int number;
+ unsigned int number;
- const unsigned int nPerSet = 8;
- const uint64_t nSets = num_points / nPerSet;
+ const unsigned int nPerSet = 8;
+ const uint64_t nSets = num_points / nPerSet;
- uint32_t* inputPtr = intsToSwap;
+ uint32_t* inputPtr = intsToSwap;
- const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 };
+ const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9,
+ 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22,
+ 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 };
- const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector);
+ const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector);
- for (number = 0 ;number < nSets; number++) {
+ for (number = 0; number < nSets; number++) {
- // Load the 32t values, increment inputPtr later since we're doing it in-place.
- const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
- const __m256i output = _mm256_shuffle_epi8(input,myShuffle);
+        // Load 32-bit values; increment inputPtr later since we're doing it in-place.
+ const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
+ const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
- // Store the results
- _mm256_storeu_si256((__m256i*)inputPtr, output);
- inputPtr += nPerSet;
- }
- _mm256_zeroupper();
-
- // Byteswap any remaining points:
- for(number = nSets * nPerSet; number < num_points; number++){
- uint32_t outputVal = *inputPtr;
- outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
- *inputPtr = outputVal;
- inputPtr++;
- }
+ // Store the results
+ _mm256_storeu_si256((__m256i*)inputPtr, output);
+ inputPtr += nPerSet;
+ }
+ _mm256_zeroupper();
+
+ // Byteswap any remaining points:
+ for (number = nSets * nPerSet; number < num_points; number++) {
+ uint32_t outputVal = *inputPtr;
+ outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
+ ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
+ *inputPtr = outputVal;
+ inputPtr++;
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points){
- unsigned int number = 0;
-
- uint32_t* inputPtr = intsToSwap;
- __m128i input, byte1, byte2, byte3, byte4, output;
- __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
- __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
-
- const uint64_t quarterPoints = num_points / 4;
- for(;number < quarterPoints; number++){
- // Load the 32t values, increment inputPtr later since we're doing it in-place.
- input = _mm_loadu_si128((__m128i*)inputPtr);
- // Do the four shifts
- byte1 = _mm_slli_epi32(input, 24);
- byte2 = _mm_slli_epi32(input, 8);
- byte3 = _mm_srli_epi32(input, 8);
- byte4 = _mm_srli_epi32(input, 24);
- // Or bytes together
- output = _mm_or_si128(byte1, byte4);
- byte2 = _mm_and_si128(byte2, byte2mask);
- output = _mm_or_si128(output, byte2);
- byte3 = _mm_and_si128(byte3, byte3mask);
- output = _mm_or_si128(output, byte3);
- // Store the results
- _mm_storeu_si128((__m128i*)inputPtr, output);
- inputPtr += 4;
- }
-
- // Byteswap any remaining points:
- number = quarterPoints*4;
- for(; number < num_points; number++){
- uint32_t outputVal = *inputPtr;
- outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
- *inputPtr = outputVal;
- inputPtr++;
- }
+static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ uint32_t* inputPtr = intsToSwap;
+ __m128i input, byte1, byte2, byte3, byte4, output;
+ __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
+ __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
+
+ const uint64_t quarterPoints = num_points / 4;
+ for (; number < quarterPoints; number++) {
+        // Load 32-bit values; increment inputPtr later since we're doing it in-place.
+ input = _mm_loadu_si128((__m128i*)inputPtr);
+ // Do the four shifts
+ byte1 = _mm_slli_epi32(input, 24);
+ byte2 = _mm_slli_epi32(input, 8);
+ byte3 = _mm_srli_epi32(input, 8);
+ byte4 = _mm_srli_epi32(input, 24);
+ // Or bytes together
+ output = _mm_or_si128(byte1, byte4);
+ byte2 = _mm_and_si128(byte2, byte2mask);
+ output = _mm_or_si128(output, byte2);
+ byte3 = _mm_and_si128(byte3, byte3mask);
+ output = _mm_or_si128(output, byte3);
+ // Store the results
+ _mm_storeu_si128((__m128i*)inputPtr, output);
+ inputPtr += 4;
+ }
+
+ // Byteswap any remaining points:
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ uint32_t outputVal = *inputPtr;
+ outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
+ ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
+ *inputPtr = outputVal;
+ inputPtr++;
+ }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void volk_32u_byteswap_neon(uint32_t* intsToSwap, unsigned int num_points){
- uint32_t* inputPtr = intsToSwap;
- unsigned int number = 0;
- unsigned int n8points = num_points / 8;
-
- uint8x8x4_t input_table;
- uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
- uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
-
- /* these magic numbers are used as byte-indices in the LUT.
- they are pre-computed to save time. A simple C program
- can calculate them; for example for lookup01:
- uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1};
- for(ii=0; ii < 8; ++ii) {
- index += ((uint64_t)(*(chars+ii))) << (ii*8);
+static inline void volk_32u_byteswap_neon(uint32_t* intsToSwap, unsigned int num_points)
+{
+ uint32_t* inputPtr = intsToSwap;
+ unsigned int number = 0;
+ unsigned int n8points = num_points / 8;
+
+ uint8x8x4_t input_table;
+ uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
+ uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
+
+ /* these magic numbers are used as byte-indices in the LUT.
+ they are pre-computed to save time. A simple C program
+ can calculate them; for example for lookup01:
+ uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1};
+ for(ii=0; ii < 8; ++ii) {
+ index += ((uint64_t)(*(chars+ii))) << (ii*8);
+ }
+ */
+ int_lookup01 = vcreate_u8(74609667900706840);
+ int_lookup23 = vcreate_u8(219290013576860186);
+ int_lookup45 = vcreate_u8(363970359253013532);
+ int_lookup67 = vcreate_u8(508650704929166878);
+
+ for (number = 0; number < n8points; ++number) {
+ input_table = vld4_u8((uint8_t*)inputPtr);
+ swapped_int01 = vtbl4_u8(input_table, int_lookup01);
+ swapped_int23 = vtbl4_u8(input_table, int_lookup23);
+ swapped_int45 = vtbl4_u8(input_table, int_lookup45);
+ swapped_int67 = vtbl4_u8(input_table, int_lookup67);
+ vst1_u8((uint8_t*)inputPtr, swapped_int01);
+ vst1_u8((uint8_t*)(inputPtr + 2), swapped_int23);
+ vst1_u8((uint8_t*)(inputPtr + 4), swapped_int45);
+ vst1_u8((uint8_t*)(inputPtr + 6), swapped_int67);
+
+ inputPtr += 8;
+ }
+
+ for (number = n8points * 8; number < num_points; ++number) {
+ uint32_t output = *inputPtr;
+ output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
+ ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
+
+ *inputPtr = output;
+ inputPtr++;
}
- */
- int_lookup01 = vcreate_u8(74609667900706840);
- int_lookup23 = vcreate_u8(219290013576860186);
- int_lookup45 = vcreate_u8(363970359253013532);
- int_lookup67 = vcreate_u8(508650704929166878);
-
- for(number = 0; number < n8points; ++number){
- input_table = vld4_u8((uint8_t*) inputPtr);
- swapped_int01 = vtbl4_u8(input_table, int_lookup01);
- swapped_int23 = vtbl4_u8(input_table, int_lookup23);
- swapped_int45 = vtbl4_u8(input_table, int_lookup45);
- swapped_int67 = vtbl4_u8(input_table, int_lookup67);
- vst1_u8((uint8_t*) inputPtr, swapped_int01);
- vst1_u8((uint8_t*) (inputPtr+2), swapped_int23);
- vst1_u8((uint8_t*) (inputPtr+4), swapped_int45);
- vst1_u8((uint8_t*) (inputPtr+6), swapped_int67);
-
- inputPtr += 8;
- }
-
- for(number = n8points * 8; number < num_points; ++number){
- uint32_t output = *inputPtr;
- output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
-
- *inputPtr = output;
- inputPtr++;
- }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>
-static inline void volk_32u_byteswap_neonv8(uint32_t* intsToSwap, unsigned int num_points){
- uint32_t* inputPtr = (uint32_t*)intsToSwap;
- const unsigned int n8points = num_points / 8;
- uint8x16_t input;
- uint8x16_t idx = { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 };
-
- unsigned int number = 0;
- for(number = 0; number < n8points; ++number){
- __VOLK_PREFETCH(inputPtr+8);
- input = vld1q_u8((uint8_t*) inputPtr);
- input = vqtbl1q_u8(input, idx);
- vst1q_u8((uint8_t*) inputPtr, input);
- inputPtr += 4;
-
- input = vld1q_u8((uint8_t*) inputPtr);
- input = vqtbl1q_u8(input, idx);
- vst1q_u8((uint8_t*) inputPtr, input);
- inputPtr += 4;
- }
-
- for(number = n8points * 8; number < num_points; ++number){
- uint32_t output = *inputPtr;
+static inline void volk_32u_byteswap_neonv8(uint32_t* intsToSwap, unsigned int num_points)
+{
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+ const unsigned int n8points = num_points / 8;
+ uint8x16_t input;
+ uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+
+ unsigned int number = 0;
+ for (number = 0; number < n8points; ++number) {
+ __VOLK_PREFETCH(inputPtr + 8);
+ input = vld1q_u8((uint8_t*)inputPtr);
+ input = vqtbl1q_u8(input, idx);
+ vst1q_u8((uint8_t*)inputPtr, input);
+ inputPtr += 4;
+
+ input = vld1q_u8((uint8_t*)inputPtr);
+ input = vqtbl1q_u8(input, idx);
+ vst1q_u8((uint8_t*)inputPtr, input);
+ inputPtr += 4;
+ }
- output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
+ for (number = n8points * 8; number < num_points; ++number) {
+ uint32_t output = *inputPtr;
- *inputPtr++ = output;
- }
+ output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
+ ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
+ *inputPtr++ = output;
+ }
}
#endif /* LV_HAVE_NEONV8 */
#ifdef LV_HAVE_GENERIC
-static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap, unsigned int num_points){
- uint32_t* inputPtr = intsToSwap;
+static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap,
+ unsigned int num_points)
+{
+ uint32_t* inputPtr = intsToSwap;
- unsigned int point;
- for(point = 0; point < num_points; point++){
- uint32_t output = *inputPtr;
- output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
+ unsigned int point;
+ for (point = 0; point < num_points; point++) {
+ uint32_t output = *inputPtr;
+ output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
+ ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
- *inputPtr = output;
- inputPtr++;
- }
+ *inputPtr = output;
+ inputPtr++;
+ }
}
#endif /* LV_HAVE_GENERIC */
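A minimal sketch of the in-place 32-bit byteswap performed by these kernels (values and wrapper name are illustrative):

void byteswap_example(void)
{
    uint32_t vals[2] = { 0x01020304u, 0xAABBCCDDu };
    // Each word's byte order is reversed in place.
    volk_32u_byteswap_generic(vals, 2);
    // vals now holds { 0x04030201u, 0xDDCCBBAAu }.
}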
#if LV_HAVE_AVX2
#include <immintrin.h>
-static inline void volk_32u_byteswap_a_avx2(uint32_t* intsToSwap, unsigned int num_points){
+static inline void volk_32u_byteswap_a_avx2(uint32_t* intsToSwap, unsigned int num_points)
+{
- unsigned int number;
+ unsigned int number;
- const unsigned int nPerSet = 8;
- const uint64_t nSets = num_points / nPerSet;
+ const unsigned int nPerSet = 8;
+ const uint64_t nSets = num_points / nPerSet;
- uint32_t* inputPtr = intsToSwap;
+ uint32_t* inputPtr = intsToSwap;
- const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 };
+ const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9,
+ 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22,
+ 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 };
- const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector);
+ const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector);
- for (number = 0 ;number < nSets; number++) {
+ for (number = 0; number < nSets; number++) {
- // Load the 32t values, increment inputPtr later since we're doing it in-place.
- const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
- const __m256i output = _mm256_shuffle_epi8(input,myShuffle);
+        // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
+ const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
+ const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
- // Store the results
- _mm256_store_si256((__m256i*)inputPtr, output);
- inputPtr += nPerSet;
- }
- _mm256_zeroupper();
-
- // Byteswap any remaining points:
- for(number = nSets * nPerSet; number < num_points; number++){
- uint32_t outputVal = *inputPtr;
- outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
- *inputPtr = outputVal;
- inputPtr++;
- }
+ // Store the results
+ _mm256_store_si256((__m256i*)inputPtr, output);
+ inputPtr += nPerSet;
+ }
+ _mm256_zeroupper();
+
+ // Byteswap any remaining points:
+ for (number = nSets * nPerSet; number < num_points; number++) {
+ uint32_t outputVal = *inputPtr;
+ outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
+ ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
+ *inputPtr = outputVal;
+ inputPtr++;
+ }
}
#endif /* LV_HAVE_AVX2 */
#include <emmintrin.h>
-static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int num_points){
- unsigned int number = 0;
-
- uint32_t* inputPtr = intsToSwap;
- __m128i input, byte1, byte2, byte3, byte4, output;
- __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
- __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
-
- const uint64_t quarterPoints = num_points / 4;
- for(;number < quarterPoints; number++){
- // Load the 32t values, increment inputPtr later since we're doing it in-place.
- input = _mm_load_si128((__m128i*)inputPtr);
- // Do the four shifts
- byte1 = _mm_slli_epi32(input, 24);
- byte2 = _mm_slli_epi32(input, 8);
- byte3 = _mm_srli_epi32(input, 8);
- byte4 = _mm_srli_epi32(input, 24);
- // Or bytes together
- output = _mm_or_si128(byte1, byte4);
- byte2 = _mm_and_si128(byte2, byte2mask);
- output = _mm_or_si128(output, byte2);
- byte3 = _mm_and_si128(byte3, byte3mask);
- output = _mm_or_si128(output, byte3);
- // Store the results
- _mm_store_si128((__m128i*)inputPtr, output);
- inputPtr += 4;
- }
-
- // Byteswap any remaining points:
- number = quarterPoints*4;
- for(; number < num_points; number++){
- uint32_t outputVal = *inputPtr;
- outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
- *inputPtr = outputVal;
- inputPtr++;
- }
+static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ uint32_t* inputPtr = intsToSwap;
+ __m128i input, byte1, byte2, byte3, byte4, output;
+ __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
+ __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
+
+ const uint64_t quarterPoints = num_points / 4;
+ for (; number < quarterPoints; number++) {
+        // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
+ input = _mm_load_si128((__m128i*)inputPtr);
+ // Do the four shifts
+ byte1 = _mm_slli_epi32(input, 24);
+ byte2 = _mm_slli_epi32(input, 8);
+ byte3 = _mm_srli_epi32(input, 8);
+ byte4 = _mm_srli_epi32(input, 24);
+ // Or bytes together
+ output = _mm_or_si128(byte1, byte4);
+ byte2 = _mm_and_si128(byte2, byte2mask);
+ output = _mm_or_si128(output, byte2);
+ byte3 = _mm_and_si128(byte3, byte3mask);
+ output = _mm_or_si128(output, byte3);
+ // Store the results
+ _mm_store_si128((__m128i*)inputPtr, output);
+ inputPtr += 4;
+ }
+
+ // Byteswap any remaining points:
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ uint32_t outputVal = *inputPtr;
+ outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
+ ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
+ *inputPtr = outputVal;
+ inputPtr++;
+ }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
-static inline void volk_32u_byteswap_a_generic(uint32_t* intsToSwap, unsigned int num_points){
- uint32_t* inputPtr = intsToSwap;
+static inline void volk_32u_byteswap_a_generic(uint32_t* intsToSwap,
+ unsigned int num_points)
+{
+ uint32_t* inputPtr = intsToSwap;
- unsigned int point;
- for(point = 0; point < num_points; point++){
- uint32_t output = *inputPtr;
- output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
+ unsigned int point;
+ for (point = 0; point < num_points; point++) {
+ uint32_t output = *inputPtr;
+ output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
+ ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
- *inputPtr = output;
- inputPtr++;
- }
+ *inputPtr = output;
+ inputPtr++;
+ }
}
#endif /* LV_HAVE_GENERIC */
-
-
#endif /* INCLUDED_volk_32u_byteswap_a_H */
#ifndef INCLUDED_volk_32u_byteswappuppet_32u_H
#define INCLUDED_volk_32u_byteswappuppet_32u_H
-#include <volk/volk_32u_byteswap.h>
#include <stdint.h>
#include <string.h>
+#include <volk/volk_32u_byteswap.h>
#ifdef LV_HAVE_GENERIC
-static inline void volk_32u_byteswappuppet_32u_generic(uint32_t*output, uint32_t* intsToSwap, unsigned int num_points){
+static inline void volk_32u_byteswappuppet_32u_generic(uint32_t* output,
+ uint32_t* intsToSwap,
+ unsigned int num_points)
+{
volk_32u_byteswap_generic((uint32_t*)intsToSwap, num_points);
memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
-
}
#endif
#ifdef LV_HAVE_NEON
-static inline void volk_32u_byteswappuppet_32u_neon(uint32_t*output, uint32_t* intsToSwap, unsigned int num_points){
+static inline void volk_32u_byteswappuppet_32u_neon(uint32_t* output,
+ uint32_t* intsToSwap,
+ unsigned int num_points)
+{
volk_32u_byteswap_neon((uint32_t*)intsToSwap, num_points);
memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
-
}
#endif
#ifdef LV_HAVE_NEONV8
-static inline void volk_32u_byteswappuppet_32u_neonv8(uint32_t*output, uint32_t* intsToSwap, unsigned int num_points){
+static inline void volk_32u_byteswappuppet_32u_neonv8(uint32_t* output,
+ uint32_t* intsToSwap,
+ unsigned int num_points)
+{
volk_32u_byteswap_neonv8((uint32_t*)intsToSwap, num_points);
memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
-
}
#endif
#ifdef LV_HAVE_SSE2
-static inline void volk_32u_byteswappuppet_32u_u_sse2(uint32_t *output, uint32_t* intsToSwap, unsigned int num_points){
+static inline void volk_32u_byteswappuppet_32u_u_sse2(uint32_t* output,
+ uint32_t* intsToSwap,
+ unsigned int num_points)
+{
volk_32u_byteswap_u_sse2((uint32_t*)intsToSwap, num_points);
memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
-
}
#endif
#ifdef LV_HAVE_SSE2
-static inline void volk_32u_byteswappuppet_32u_a_sse2(uint32_t* output, uint32_t* intsToSwap, unsigned int num_points){
+static inline void volk_32u_byteswappuppet_32u_a_sse2(uint32_t* output,
+ uint32_t* intsToSwap,
+ unsigned int num_points)
+{
volk_32u_byteswap_a_sse2((uint32_t*)intsToSwap, num_points);
memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
-
}
#endif
#ifdef LV_HAVE_AVX2
-static inline void volk_32u_byteswappuppet_32u_u_avx2(uint32_t* output, uint32_t* intsToSwap, unsigned int num_points){
+static inline void volk_32u_byteswappuppet_32u_u_avx2(uint32_t* output,
+ uint32_t* intsToSwap,
+ unsigned int num_points)
+{
volk_32u_byteswap_u_avx2((uint32_t*)intsToSwap, num_points);
memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
-
}
#endif
#ifdef LV_HAVE_AVX2
-static inline void volk_32u_byteswappuppet_32u_a_avx2(uint32_t* output, uint32_t* intsToSwap, unsigned int num_points){
+static inline void volk_32u_byteswappuppet_32u_a_avx2(uint32_t* output,
+ uint32_t* intsToSwap,
+ unsigned int num_points)
+{
volk_32u_byteswap_a_avx2((uint32_t*)intsToSwap, num_points);
memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
-
}
#endif
#ifndef INCLUDED_VOLK_32u_POPCNT_A16_H
#define INCLUDED_VOLK_32u_POPCNT_A16_H
-#include <stdio.h>
#include <inttypes.h>
+#include <stdio.h>
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32u_popcnt_generic(uint32_t* ret, const uint32_t value)
+static inline void volk_32u_popcnt_generic(uint32_t* ret, const uint32_t value)
{
- // This is faster than a lookup table
- uint32_t retVal = value;
+ // This is faster than a lookup table
+ uint32_t retVal = value;
- retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
- retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
- retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
- retVal = (retVal + (retVal >> 8));
- retVal = (retVal + (retVal >> 16)) & 0x0000003F;
+ retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
+ retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
+ retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
+ retVal = (retVal + (retVal >> 8));
+ retVal = (retVal + (retVal >> 16)) & 0x0000003F;
- *ret = retVal;
+ *ret = retVal;
}
#endif /*LV_HAVE_GENERIC*/
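For context, the generic popcount above uses a SWAR (SIMD-within-a-register) reduction; a naive bit-clearing loop gives the same result and can serve as a sanity check. The helper below is a hypothetical sketch, not part of this patch:

#include <stdint.h>

static inline uint32_t popcnt_ref(uint32_t v)
{
    uint32_t n = 0;
    while (v) {
        v &= v - 1; /* clear the lowest set bit */
        ++n;
    }
    return n;
}
/* e.g. popcnt_ref(0xF0F00003u) == 10, matching volk_32u_popcnt_generic. */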
#include <nmmintrin.h>
-static inline void
-volk_32u_popcnt_a_sse4_2(uint32_t* ret, const uint32_t value)
+static inline void volk_32u_popcnt_a_sse4_2(uint32_t* ret, const uint32_t value)
{
- *ret = _mm_popcnt_u32(value);
+ *ret = _mm_popcnt_u32(value);
}
#endif /*LV_HAVE_SSE4_2*/
#include <volk/volk_32u_popcnt.h>
#ifdef LV_HAVE_GENERIC
-static inline void volk_32u_popcntpuppet_32u_generic(uint32_t* outVector, const uint32_t* inVector, unsigned int num_points){
+static inline void volk_32u_popcntpuppet_32u_generic(uint32_t* outVector,
+ const uint32_t* inVector,
+ unsigned int num_points)
+{
unsigned int ii;
- for(ii=0; ii < num_points; ++ii) {
- volk_32u_popcnt_generic(outVector+ii, *(inVector+ii) );
+ for (ii = 0; ii < num_points; ++ii) {
+ volk_32u_popcnt_generic(outVector + ii, *(inVector + ii));
}
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE4_2
-static inline void volk_32u_popcntpuppet_32u_a_sse4_2(uint32_t* outVector, const uint32_t* inVector, unsigned int num_points){
+static inline void volk_32u_popcntpuppet_32u_a_sse4_2(uint32_t* outVector,
+ const uint32_t* inVector,
+ unsigned int num_points)
+{
unsigned int ii;
- for(ii=0; ii < num_points; ++ii) {
- volk_32u_popcnt_a_sse4_2(outVector+ii, *(inVector+ii) );
+ for (ii = 0; ii < num_points; ++ii) {
+ volk_32u_popcnt_a_sse4_2(outVector + ii, *(inVector + ii));
}
}
#endif /* LV_HAVE_SSE4_2 */
* \b bit reversal of the input 32 bit word
* <b>Dispatcher Prototype</b>
- * \code volk_32u_reverse_32u(uint32_t *outputVector, uint32_t *inputVector; unsigned int num_points);
+ * \code volk_32u_reverse_32u(uint32_t *outputVector, uint32_t *inputVector, unsigned int
+ * num_points);
* \endcode
*
* \b Inputs
* \li num_points The number of data points.
*
* \b Outputs
- * \li outputVector: The vector where the results will be stored, which is the bit-reversed input
+ * \li outputVector: The vector where the results will be stored, which is the
+ * bit-reversed input
*
* \endcode
*/
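A minimal call sketch for the dispatcher documented above, assuming the standard VOLK allocation helpers (volk_malloc, volk_get_alignment, volk_free); the buffer size and fill values are illustrative only:

#include <stdint.h>
#include <volk/volk.h>

int main(void)
{
    const unsigned int N = 64;
    uint32_t* in = (uint32_t*)volk_malloc(N * sizeof(uint32_t), volk_get_alignment());
    uint32_t* out = (uint32_t*)volk_malloc(N * sizeof(uint32_t), volk_get_alignment());
    for (unsigned int i = 0; i < N; ++i)
        in[i] = i; /* arbitrary test data */
    volk_32u_reverse_32u(out, in, N);
    volk_free(out);
    volk_free(in);
    return 0;
}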
#ifndef INCLUDED_VOLK_32u_REVERSE_32u_U_H
struct dword_split {
- int b00: 1;
- int b01: 1;
- int b02: 1;
- int b03: 1;
- int b04: 1;
- int b05: 1;
- int b06: 1;
- int b07: 1;
- int b08: 1;
- int b09: 1;
- int b10: 1;
- int b11: 1;
- int b12: 1;
- int b13: 1;
- int b14: 1;
- int b15: 1;
- int b16: 1;
- int b17: 1;
- int b18: 1;
- int b19: 1;
- int b20: 1;
- int b21: 1;
- int b22: 1;
- int b23: 1;
- int b24: 1;
- int b25: 1;
- int b26: 1;
- int b27: 1;
- int b28: 1;
- int b29: 1;
- int b30: 1;
- int b31: 1;
+ int b00 : 1;
+ int b01 : 1;
+ int b02 : 1;
+ int b03 : 1;
+ int b04 : 1;
+ int b05 : 1;
+ int b06 : 1;
+ int b07 : 1;
+ int b08 : 1;
+ int b09 : 1;
+ int b10 : 1;
+ int b11 : 1;
+ int b12 : 1;
+ int b13 : 1;
+ int b14 : 1;
+ int b15 : 1;
+ int b16 : 1;
+ int b17 : 1;
+ int b18 : 1;
+ int b19 : 1;
+ int b20 : 1;
+ int b21 : 1;
+ int b22 : 1;
+ int b23 : 1;
+ int b24 : 1;
+ int b25 : 1;
+ int b26 : 1;
+ int b27 : 1;
+ int b28 : 1;
+ int b29 : 1;
+ int b30 : 1;
+ int b31 : 1;
};
struct char_split {
- uint8_t b00: 1;
- uint8_t b01: 1;
- uint8_t b02: 1;
- uint8_t b03: 1;
- uint8_t b04: 1;
- uint8_t b05: 1;
- uint8_t b06: 1;
- uint8_t b07: 1;
+ uint8_t b00 : 1;
+ uint8_t b01 : 1;
+ uint8_t b02 : 1;
+ uint8_t b03 : 1;
+ uint8_t b04 : 1;
+ uint8_t b05 : 1;
+ uint8_t b06 : 1;
+ uint8_t b07 : 1;
};
-//Idea from "Bit Twiddling Hacks", which dedicates this method to public domain
-//http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
+// Idea from "Bit Twiddling Hacks", which dedicates this method to public domain
+// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
static const unsigned char BitReverseTable256[] = {
- 0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30,
- 0xB0, 0x70, 0xF0, 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98,
- 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8, 0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64,
- 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, 0x0C, 0x8C, 0x4C, 0xCC,
- 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, 0x02,
- 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2,
- 0x72, 0xF2, 0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A,
- 0xDA, 0x3A, 0xBA, 0x7A, 0xFA, 0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6,
- 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6, 0x0E, 0x8E, 0x4E, 0xCE, 0x2E,
- 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE, 0x01, 0x81,
- 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71,
- 0xF1, 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9,
- 0x39, 0xB9, 0x79, 0xF9, 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15,
- 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, 0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD,
- 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, 0x03, 0x83, 0x43,
- 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3,
- 0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B,
- 0xBB, 0x7B, 0xFB, 0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97,
- 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7, 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F,
- 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF
+ 0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0,
+ 0x70, 0xF0, 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8,
+ 0x38, 0xB8, 0x78, 0xF8, 0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94,
+ 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, 0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC,
+ 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, 0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2,
+ 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2, 0x0A, 0x8A, 0x4A, 0xCA,
+ 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA, 0x06, 0x86,
+ 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6,
+ 0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE,
+ 0x7E, 0xFE, 0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1,
+ 0x31, 0xB1, 0x71, 0xF1, 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99,
+ 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9, 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5,
+ 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, 0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD,
+ 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, 0x03, 0x83, 0x43, 0xC3,
+ 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, 0x0B, 0x8B,
+ 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
+ 0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7,
+ 0x77, 0xF7, 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF,
+ 0x3F, 0xBF, 0x7F, 0xFF
};
#ifdef LV_HAVE_GENERIC
-static inline void volk_32u_reverse_32u_dword_shuffle(uint32_t* out, const uint32_t* in,
- unsigned int num_points)
+static inline void volk_32u_reverse_32u_dword_shuffle(uint32_t* out,
+ const uint32_t* in,
+ unsigned int num_points)
{
- const struct dword_split *in_ptr = (const struct dword_split*)in;
- struct dword_split * out_ptr = (struct dword_split*)out;
- unsigned int number = 0;
- for(; number < num_points; ++number){
- out_ptr->b00 = in_ptr->b31;
- out_ptr->b01 = in_ptr->b30;
- out_ptr->b02 = in_ptr->b29;
- out_ptr->b03 = in_ptr->b28;
- out_ptr->b04 = in_ptr->b27;
- out_ptr->b05 = in_ptr->b26;
- out_ptr->b06 = in_ptr->b25;
- out_ptr->b07 = in_ptr->b24;
- out_ptr->b08 = in_ptr->b23;
- out_ptr->b09 = in_ptr->b22;
- out_ptr->b10 = in_ptr->b21;
- out_ptr->b11 = in_ptr->b20;
- out_ptr->b12 = in_ptr->b19;
- out_ptr->b13 = in_ptr->b18;
- out_ptr->b14 = in_ptr->b17;
- out_ptr->b15 = in_ptr->b16;
- out_ptr->b16 = in_ptr->b15;
- out_ptr->b17 = in_ptr->b14;
- out_ptr->b18 = in_ptr->b13;
- out_ptr->b19 = in_ptr->b12;
- out_ptr->b20 = in_ptr->b11;
- out_ptr->b21 = in_ptr->b10;
- out_ptr->b22 = in_ptr->b09;
- out_ptr->b23 = in_ptr->b08;
- out_ptr->b24 = in_ptr->b07;
- out_ptr->b25 = in_ptr->b06;
- out_ptr->b26 = in_ptr->b05;
- out_ptr->b27 = in_ptr->b04;
- out_ptr->b28 = in_ptr->b03;
- out_ptr->b29 = in_ptr->b02;
- out_ptr->b30 = in_ptr->b01;
- out_ptr->b31 = in_ptr->b00;
- ++in_ptr;
- ++out_ptr;
- }
+ const struct dword_split* in_ptr = (const struct dword_split*)in;
+ struct dword_split* out_ptr = (struct dword_split*)out;
+ unsigned int number = 0;
+ for (; number < num_points; ++number) {
+ out_ptr->b00 = in_ptr->b31;
+ out_ptr->b01 = in_ptr->b30;
+ out_ptr->b02 = in_ptr->b29;
+ out_ptr->b03 = in_ptr->b28;
+ out_ptr->b04 = in_ptr->b27;
+ out_ptr->b05 = in_ptr->b26;
+ out_ptr->b06 = in_ptr->b25;
+ out_ptr->b07 = in_ptr->b24;
+ out_ptr->b08 = in_ptr->b23;
+ out_ptr->b09 = in_ptr->b22;
+ out_ptr->b10 = in_ptr->b21;
+ out_ptr->b11 = in_ptr->b20;
+ out_ptr->b12 = in_ptr->b19;
+ out_ptr->b13 = in_ptr->b18;
+ out_ptr->b14 = in_ptr->b17;
+ out_ptr->b15 = in_ptr->b16;
+ out_ptr->b16 = in_ptr->b15;
+ out_ptr->b17 = in_ptr->b14;
+ out_ptr->b18 = in_ptr->b13;
+ out_ptr->b19 = in_ptr->b12;
+ out_ptr->b20 = in_ptr->b11;
+ out_ptr->b21 = in_ptr->b10;
+ out_ptr->b22 = in_ptr->b09;
+ out_ptr->b23 = in_ptr->b08;
+ out_ptr->b24 = in_ptr->b07;
+ out_ptr->b25 = in_ptr->b06;
+ out_ptr->b26 = in_ptr->b05;
+ out_ptr->b27 = in_ptr->b04;
+ out_ptr->b28 = in_ptr->b03;
+ out_ptr->b29 = in_ptr->b02;
+ out_ptr->b30 = in_ptr->b01;
+ out_ptr->b31 = in_ptr->b00;
+ ++in_ptr;
+ ++out_ptr;
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_GENERIC
-static inline void volk_32u_reverse_32u_byte_shuffle(uint32_t* out, const uint32_t* in,
- unsigned int num_points)
+static inline void volk_32u_reverse_32u_byte_shuffle(uint32_t* out,
+ const uint32_t* in,
+ unsigned int num_points)
{
- const uint32_t *in_ptr = in;
- uint32_t *out_ptr = out;
- unsigned int number = 0;
- for(; number < num_points; ++number){
- const struct char_split *in8 = (const struct char_split*)in_ptr;
- struct char_split *out8 = (struct char_split*)out_ptr;
+ const uint32_t* in_ptr = in;
+ uint32_t* out_ptr = out;
+ unsigned int number = 0;
+ for (; number < num_points; ++number) {
+ const struct char_split* in8 = (const struct char_split*)in_ptr;
+ struct char_split* out8 = (struct char_split*)out_ptr;
- out8[3].b00 = in8[0].b07;
- out8[3].b01 = in8[0].b06;
- out8[3].b02 = in8[0].b05;
- out8[3].b03 = in8[0].b04;
- out8[3].b04 = in8[0].b03;
- out8[3].b05 = in8[0].b02;
- out8[3].b06 = in8[0].b01;
- out8[3].b07 = in8[0].b00;
+ out8[3].b00 = in8[0].b07;
+ out8[3].b01 = in8[0].b06;
+ out8[3].b02 = in8[0].b05;
+ out8[3].b03 = in8[0].b04;
+ out8[3].b04 = in8[0].b03;
+ out8[3].b05 = in8[0].b02;
+ out8[3].b06 = in8[0].b01;
+ out8[3].b07 = in8[0].b00;
- out8[2].b00 = in8[1].b07;
- out8[2].b01 = in8[1].b06;
- out8[2].b02 = in8[1].b05;
- out8[2].b03 = in8[1].b04;
- out8[2].b04 = in8[1].b03;
- out8[2].b05 = in8[1].b02;
- out8[2].b06 = in8[1].b01;
- out8[2].b07 = in8[1].b00;
+ out8[2].b00 = in8[1].b07;
+ out8[2].b01 = in8[1].b06;
+ out8[2].b02 = in8[1].b05;
+ out8[2].b03 = in8[1].b04;
+ out8[2].b04 = in8[1].b03;
+ out8[2].b05 = in8[1].b02;
+ out8[2].b06 = in8[1].b01;
+ out8[2].b07 = in8[1].b00;
- out8[1].b00 = in8[2].b07;
- out8[1].b01 = in8[2].b06;
- out8[1].b02 = in8[2].b05;
- out8[1].b03 = in8[2].b04;
- out8[1].b04 = in8[2].b03;
- out8[1].b05 = in8[2].b02;
- out8[1].b06 = in8[2].b01;
- out8[1].b07 = in8[2].b00;
+ out8[1].b00 = in8[2].b07;
+ out8[1].b01 = in8[2].b06;
+ out8[1].b02 = in8[2].b05;
+ out8[1].b03 = in8[2].b04;
+ out8[1].b04 = in8[2].b03;
+ out8[1].b05 = in8[2].b02;
+ out8[1].b06 = in8[2].b01;
+ out8[1].b07 = in8[2].b00;
- out8[0].b00 = in8[3].b07;
- out8[0].b01 = in8[3].b06;
- out8[0].b02 = in8[3].b05;
- out8[0].b03 = in8[3].b04;
- out8[0].b04 = in8[3].b03;
- out8[0].b05 = in8[3].b02;
- out8[0].b06 = in8[3].b01;
- out8[0].b07 = in8[3].b00;
- ++in_ptr;
- ++out_ptr;
- }
+ out8[0].b00 = in8[3].b07;
+ out8[0].b01 = in8[3].b06;
+ out8[0].b02 = in8[3].b05;
+ out8[0].b03 = in8[3].b04;
+ out8[0].b04 = in8[3].b03;
+ out8[0].b05 = in8[3].b02;
+ out8[0].b06 = in8[3].b01;
+ out8[0].b07 = in8[3].b00;
+ ++in_ptr;
+ ++out_ptr;
+ }
}
#endif /* LV_HAVE_GENERIC */
-//Idea from "Bit Twiddling Hacks", which dedicates this method to public domain
-//http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
+// Idea from "Bit Twiddling Hacks", which dedicates this method to public domain
+// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
#ifdef LV_HAVE_GENERIC
-static inline void volk_32u_reverse_32u_lut(uint32_t* out, const uint32_t* in,
- unsigned int num_points)
+static inline void
+volk_32u_reverse_32u_lut(uint32_t* out, const uint32_t* in, unsigned int num_points)
{
- const uint32_t *in_ptr = in;
- uint32_t *out_ptr = out;
- unsigned int number = 0;
- for(; number < num_points; ++number){
- *out_ptr =
- (BitReverseTable256[*in_ptr & 0xff] << 24) |
- (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) |
- (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) |
- (BitReverseTable256[(*in_ptr >> 24) & 0xff]);
- ++in_ptr;
- ++out_ptr;
- }
+ const uint32_t* in_ptr = in;
+ uint32_t* out_ptr = out;
+ unsigned int number = 0;
+ for (; number < num_points; ++number) {
+ *out_ptr = (BitReverseTable256[*in_ptr & 0xff] << 24) |
+ (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) |
+ (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) |
+ (BitReverseTable256[(*in_ptr >> 24) & 0xff]);
+ ++in_ptr;
+ ++out_ptr;
+ }
}
#endif /* LV_HAVE_GENERIC */
-//Single-Byte code from "Bit Twiddling Hacks", which dedicates this method to public domain
-//http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits
+// Single-Byte code from "Bit Twiddling Hacks", which dedicates this method to public
+// domain http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits
#ifdef LV_HAVE_GENERIC
-static inline void volk_32u_reverse_32u_2001magic(uint32_t* out, const uint32_t* in,
- unsigned int num_points)
+static inline void
+volk_32u_reverse_32u_2001magic(uint32_t* out, const uint32_t* in, unsigned int num_points)
{
- const uint32_t *in_ptr = in;
- uint32_t *out_ptr = out;
- const uint8_t *in8;
- uint8_t *out8;
- unsigned int number = 0;
- for(; number < num_points; ++number){
- in8 = (const uint8_t*)in_ptr;
- out8 = (uint8_t*)out_ptr;
- out8[3] = ((in8[0] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
- out8[2] = ((in8[1] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
- out8[1] = ((in8[2] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
- out8[0] = ((in8[3] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
- ++in_ptr;
- ++out_ptr;
- }
+ const uint32_t* in_ptr = in;
+ uint32_t* out_ptr = out;
+ const uint8_t* in8;
+ uint8_t* out8;
+ unsigned int number = 0;
+ for (; number < num_points; ++number) {
+ in8 = (const uint8_t*)in_ptr;
+ out8 = (uint8_t*)out_ptr;
+ out8[3] = ((in8[0] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
+ out8[2] = ((in8[1] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
+ out8[1] = ((in8[2] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
+ out8[0] = ((in8[3] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
+ ++in_ptr;
+ ++out_ptr;
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_GENERIC
// Current gr-pager implementation
-static inline void volk_32u_reverse_32u_1972magic(uint32_t* out, const uint32_t* in,
- unsigned int num_points)
+static inline void
+volk_32u_reverse_32u_1972magic(uint32_t* out, const uint32_t* in, unsigned int num_points)
{
- const uint32_t *in_ptr = in;
- uint32_t *out_ptr = out;
- const uint8_t *in8;
- uint8_t *out8;
- unsigned int number = 0;
- for(; number < num_points; ++number){
- in8 = (const uint8_t*)in_ptr;
- out8 = (uint8_t*)out_ptr;
- out8[3] = (in8[0] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
- out8[2] = (in8[1] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
- out8[1] = (in8[2] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
- out8[0] = (in8[3] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
- ++in_ptr;
- ++out_ptr;
- }
+ const uint32_t* in_ptr = in;
+ uint32_t* out_ptr = out;
+ const uint8_t* in8;
+ uint8_t* out8;
+ unsigned int number = 0;
+ for (; number < num_points; ++number) {
+ in8 = (const uint8_t*)in_ptr;
+ out8 = (uint8_t*)out_ptr;
+ out8[3] = (in8[0] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
+ out8[2] = (in8[1] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
+ out8[1] = (in8[2] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
+ out8[0] = (in8[3] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
+ ++in_ptr;
+ ++out_ptr;
+ }
}
#endif /* LV_HAVE_GENERIC */
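Both "magic" kernels above reverse the word one byte at a time using 64-bit multiply/mask tricks; a plain loop computes the same per-byte result. The helper below is a hypothetical reference for illustration only:

#include <stdint.h>

static inline uint8_t reverse8_ref(uint8_t b)
{
    uint8_t r = 0;
    for (int i = 0; i < 8; ++i) {
        r = (uint8_t)((r << 1) | ((b >> i) & 1u));
    }
    return r;
}
/* e.g. reverse8_ref(0x01) == 0x80; the low byte of
 * ((0x01 * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32
 * is the same value. */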
-//After lengthy thought and quite a bit of whiteboarding:
+// After lengthy thought and quite a bit of whiteboarding:
#ifdef LV_HAVE_GENERIC
-static inline void volk_32u_reverse_32u_bintree_permute_top_down(uint32_t* out, const uint32_t* in,
- unsigned int num_points)
+static inline void volk_32u_reverse_32u_bintree_permute_top_down(uint32_t* out,
+ const uint32_t* in,
+ unsigned int num_points)
{
- const uint32_t *in_ptr = in;
- uint32_t *out_ptr = out;
- unsigned int number = 0;
- for(; number < num_points; ++number){
- uint32_t tmp = *in_ptr;
- /* permute uint16:
- The idea is to simply shift the lower 16 bit up, and the upper 16 bit down.
- */
- tmp = ( tmp << 16 ) | ( tmp >> 16 );
- /* permute bytes:
- shift up by 1 B first, then only consider even bytes, and OR with the unshifted even bytes
- */
- tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16));
- /* permute 4bit tuples:
- Same idea, but the "consideration" mask expression becomes unwieldy
- */
- tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) | ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24));
- /* permute 2bit tuples:
- Here, we collapsed the "consideration" mask to a simple hexmask: 0b0011 =
- 3; we need those every 4b, which coincides with a hex digit!
- */
- tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333));
- /* permute odd/even:
- 0x01 = 0x1; we need these every 2b, which works out: 0x01 | (0x01 << 2) = 0x05!
- */
- tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555));
+ const uint32_t* in_ptr = in;
+ uint32_t* out_ptr = out;
+ unsigned int number = 0;
+ for (; number < num_points; ++number) {
+ uint32_t tmp = *in_ptr;
+ /* permute uint16:
+ The idea is to simply shift the lower 16 bit up, and the upper 16 bit down.
+ */
+ tmp = (tmp << 16) | (tmp >> 16);
+ /* permute bytes:
+ shift up by 1 B first, then only consider even bytes, and OR with the unshifted
+ even bytes
+ */
+ tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16));
+ /* permute 4bit tuples:
+ Same idea, but the "consideration" mask expression becomes unwieldy
+ */
+ tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) |
+ ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24));
+ /* permute 2bit tuples:
+ Here, we collapsed the "consideration" mask to a simple hexmask: 0b0011 =
+ 3; we need those every 4b, which coincides with a hex digit!
+ */
+ tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333));
+ /* permute odd/even:
+ 0x01 = 0x1; we need these every 2b, which works out: 0x01 | (0x01 << 2) =
+ 0x05!
+ */
+ tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555));
- *out_ptr = tmp;
- ++in_ptr;
- ++out_ptr;
- }
+ *out_ptr = tmp;
+ ++in_ptr;
+ ++out_ptr;
+ }
}
#endif /* LV_HAVE_GENERIC */
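For readability, the "consideration" masks written as shifted literals in the kernel above evaluate to the usual bit-reversal constants; a short sketch spelling out the values (the names are illustrative, not part of the kernel):

#include <stdint.h>

static const uint32_t mask_bytes   = 0xFF | 0xFF << 16;                      /* 0x00FF00FF */
static const uint32_t mask_nibbles = 0xF | 0xF << 8 | 0xF << 16 | 0xF << 24; /* 0x0F0F0F0F */
static const uint32_t mask_pairs   = 0x33333333;                             /* 2-bit tuples */
static const uint32_t mask_bits    = 0x55555555;                             /* odd/even bits */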
#ifdef LV_HAVE_GENERIC
-static inline void volk_32u_reverse_32u_bintree_permute_bottom_up(uint32_t* out, const uint32_t* in,
- unsigned int num_points)
+static inline void volk_32u_reverse_32u_bintree_permute_bottom_up(uint32_t* out,
+ const uint32_t* in,
+ unsigned int num_points)
{
- //same stuff as top_down, inverted order (permutation matrices don't care, you know!)
- const uint32_t *in_ptr = in;
- uint32_t *out_ptr = out;
- unsigned int number = 0;
- for(; number < num_points; ++number){
- uint32_t tmp = *in_ptr;
- tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555));
- tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333));
- tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) | ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24));
- tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16));
- tmp = ( tmp << 16 ) | ( tmp >> 16 );
+ // same stuff as top_down, inverted order (permutation matrices don't care, you know!)
+ const uint32_t* in_ptr = in;
+ uint32_t* out_ptr = out;
+ unsigned int number = 0;
+ for (; number < num_points; ++number) {
+ uint32_t tmp = *in_ptr;
+ tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555));
+ tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333));
+ tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) |
+ ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24));
+ tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16));
+ tmp = (tmp << 16) | (tmp >> 16);
- *out_ptr = tmp;
- ++in_ptr;
- ++out_ptr;
- }
+ *out_ptr = tmp;
+ ++in_ptr;
+ ++out_ptr;
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>
-static inline void volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in,
- unsigned int num_points)
-{
- const uint32_t *in_ptr = in;
- uint32_t *out_ptr = out;
+static inline void
+volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in, unsigned int num_points)
+{
+ const uint32_t* in_ptr = in;
+ uint32_t* out_ptr = out;
- const uint8x16_t idx = { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 };
+ const uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
- const unsigned int quarterPoints = num_points/4;
+ const unsigned int quarterPoints = num_points / 4;
unsigned int number = 0;
- for(; number < quarterPoints; ++number){
- __VOLK_PREFETCH(in_ptr+4);
- uint32x4_t x = vld1q_u32(in_ptr);
- uint32x4_t z = vreinterpretq_u32_u8(vqtbl1q_u8(vrbitq_u8(vreinterpretq_u8_u32 (x)),
- idx));
- vst1q_u32 (out_ptr, z);
- in_ptr += 4;
- out_ptr += 4;
+ for (; number < quarterPoints; ++number) {
+ __VOLK_PREFETCH(in_ptr + 4);
+ uint32x4_t x = vld1q_u32(in_ptr);
+ uint32x4_t z =
+ vreinterpretq_u32_u8(vqtbl1q_u8(vrbitq_u8(vreinterpretq_u8_u32(x)), idx));
+ vst1q_u32(out_ptr, z);
+ in_ptr += 4;
+ out_ptr += 4;
}
- number = quarterPoints*4;
- for(; number < num_points; ++number){
- *out_ptr =
- (BitReverseTable256[*in_ptr & 0xff] << 24) |
- (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) |
- (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) |
- (BitReverseTable256[(*in_ptr >> 24) & 0xff]);
- ++in_ptr;
- ++out_ptr;
+ number = quarterPoints * 4;
+ for (; number < num_points; ++number) {
+ *out_ptr = (BitReverseTable256[*in_ptr & 0xff] << 24) |
+ (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) |
+ (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) |
+ (BitReverseTable256[(*in_ptr >> 24) & 0xff]);
+ ++in_ptr;
+ ++out_ptr;
}
}
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-#define DO_RBIT \
- __VOLK_ASM("rbit %[result], %[value]" \
- : [result]"=r" (*out_ptr) \
- : [value] "r" (*in_ptr) \
- : ); \
- in_ptr++; \
- out_ptr++;
+#define DO_RBIT \
+ __VOLK_ASM("rbit %[result], %[value]" \
+ : [result] "=r"(*out_ptr) \
+ : [value] "r"(*in_ptr) \
+ :); \
+ in_ptr++; \
+ out_ptr++;
-static inline void volk_32u_reverse_32u_arm(uint32_t* out, const uint32_t* in,
- unsigned int num_points)
+static inline void
+volk_32u_reverse_32u_arm(uint32_t* out, const uint32_t* in, unsigned int num_points)
{
- const uint32_t *in_ptr = in;
- uint32_t *out_ptr = out;
- const unsigned int eighthPoints = num_points/8;
+ const uint32_t* in_ptr = in;
+ uint32_t* out_ptr = out;
+ const unsigned int eighthPoints = num_points / 8;
unsigned int number = 0;
- for(; number < eighthPoints; ++number){
- __VOLK_PREFETCH(in_ptr+8);
- DO_RBIT; DO_RBIT; DO_RBIT; DO_RBIT;
- DO_RBIT; DO_RBIT; DO_RBIT; DO_RBIT;
+ for (; number < eighthPoints; ++number) {
+ __VOLK_PREFETCH(in_ptr + 8);
+ DO_RBIT;
+ DO_RBIT;
+ DO_RBIT;
+ DO_RBIT;
+ DO_RBIT;
+ DO_RBIT;
+ DO_RBIT;
+ DO_RBIT;
}
- number = eighthPoints*8;
- for(; number < num_points; ++number){
+ number = eighthPoints * 8;
+ for (; number < num_points; ++number) {
DO_RBIT;
}
}
#endif /* INCLUDED_volk_32u_reverse_32u_u_H */
-
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_64f_convert_32f(float* outputVector, const double* inputVector, unsigned int num_points)
- * \endcode
+ * void volk_64f_convert_32f(float* outputVector, const double* inputVector, unsigned int
+ * num_points)
+ * \endcode
*
* \b Inputs
* \li inputVector: The vector of doubles to convert to floats.
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
-static inline void volk_64f_convert_32f_u_avx512f(float* outputVector, const double* inputVector, unsigned int num_points){
- unsigned int number = 0;
+static inline void volk_64f_convert_32f_u_avx512f(float* outputVector,
+ const double* inputVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
- const unsigned int oneSixteenthPoints = num_points / 16;
+ const unsigned int oneSixteenthPoints = num_points / 16;
- const double* inputVectorPtr = (const double*)inputVector;
- float* outputVectorPtr = outputVector;
- __m256 ret1, ret2;
- __m512d inputVal1, inputVal2;
+ const double* inputVectorPtr = (const double*)inputVector;
+ float* outputVectorPtr = outputVector;
+ __m256 ret1, ret2;
+ __m512d inputVal1, inputVal2;
- for(;number < oneSixteenthPoints; number++){
- inputVal1 = _mm512_loadu_pd(inputVectorPtr); inputVectorPtr += 8;
- inputVal2 = _mm512_loadu_pd(inputVectorPtr); inputVectorPtr += 8;
+ for (; number < oneSixteenthPoints; number++) {
+ inputVal1 = _mm512_loadu_pd(inputVectorPtr);
+ inputVectorPtr += 8;
+ inputVal2 = _mm512_loadu_pd(inputVectorPtr);
+ inputVectorPtr += 8;
- ret1 = _mm512_cvtpd_ps(inputVal1);
- ret2 = _mm512_cvtpd_ps(inputVal2);
+ ret1 = _mm512_cvtpd_ps(inputVal1);
+ ret2 = _mm512_cvtpd_ps(inputVal2);
- _mm256_storeu_ps(outputVectorPtr, ret1);
- outputVectorPtr += 8;
+ _mm256_storeu_ps(outputVectorPtr, ret1);
+ outputVectorPtr += 8;
- _mm256_storeu_ps(outputVectorPtr, ret2);
- outputVectorPtr += 8;
- }
+ _mm256_storeu_ps(outputVectorPtr, ret2);
+ outputVectorPtr += 8;
+ }
- number = oneSixteenthPoints * 16;
- for(; number < num_points; number++){
- outputVector[number] = (float)(inputVector[number]);
- }
+ number = oneSixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ outputVector[number] = (float)(inputVector[number]);
+ }
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void volk_64f_convert_32f_u_avx(float* outputVector, const double* inputVector, unsigned int num_points){
- unsigned int number = 0;
+static inline void volk_64f_convert_32f_u_avx(float* outputVector,
+ const double* inputVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
- const unsigned int oneEightPoints = num_points / 8;
+ const unsigned int oneEightPoints = num_points / 8;
- const double* inputVectorPtr = (const double*)inputVector;
- float* outputVectorPtr = outputVector;
- __m128 ret1, ret2;
- __m256d inputVal1, inputVal2;
+ const double* inputVectorPtr = (const double*)inputVector;
+ float* outputVectorPtr = outputVector;
+ __m128 ret1, ret2;
+ __m256d inputVal1, inputVal2;
- for(;number < oneEightPoints; number++){
- inputVal1 = _mm256_loadu_pd(inputVectorPtr); inputVectorPtr += 4;
- inputVal2 = _mm256_loadu_pd(inputVectorPtr); inputVectorPtr += 4;
+ for (; number < oneEightPoints; number++) {
+ inputVal1 = _mm256_loadu_pd(inputVectorPtr);
+ inputVectorPtr += 4;
+ inputVal2 = _mm256_loadu_pd(inputVectorPtr);
+ inputVectorPtr += 4;
- ret1 = _mm256_cvtpd_ps(inputVal1);
- ret2 = _mm256_cvtpd_ps(inputVal2);
+ ret1 = _mm256_cvtpd_ps(inputVal1);
+ ret2 = _mm256_cvtpd_ps(inputVal2);
- _mm_storeu_ps(outputVectorPtr, ret1);
- outputVectorPtr += 4;
+ _mm_storeu_ps(outputVectorPtr, ret1);
+ outputVectorPtr += 4;
- _mm_storeu_ps(outputVectorPtr, ret2);
- outputVectorPtr += 4;
- }
+ _mm_storeu_ps(outputVectorPtr, ret2);
+ outputVectorPtr += 4;
+ }
- number = oneEightPoints * 8;
- for(; number < num_points; number++){
- outputVector[number] = (float)(inputVector[number]);
- }
+ number = oneEightPoints * 8;
+ for (; number < num_points; number++) {
+ outputVector[number] = (float)(inputVector[number]);
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void volk_64f_convert_32f_u_sse2(float* outputVector, const double* inputVector, unsigned int num_points){
- unsigned int number = 0;
+static inline void volk_64f_convert_32f_u_sse2(float* outputVector,
+ const double* inputVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ const unsigned int quarterPoints = num_points / 4;
- const double* inputVectorPtr = (const double*)inputVector;
- float* outputVectorPtr = outputVector;
- __m128 ret, ret2;
- __m128d inputVal1, inputVal2;
+ const double* inputVectorPtr = (const double*)inputVector;
+ float* outputVectorPtr = outputVector;
+ __m128 ret, ret2;
+ __m128d inputVal1, inputVal2;
- for(;number < quarterPoints; number++){
- inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2;
- inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2;
+ for (; number < quarterPoints; number++) {
+ inputVal1 = _mm_loadu_pd(inputVectorPtr);
+ inputVectorPtr += 2;
+ inputVal2 = _mm_loadu_pd(inputVectorPtr);
+ inputVectorPtr += 2;
- ret = _mm_cvtpd_ps(inputVal1);
- ret2 = _mm_cvtpd_ps(inputVal2);
+ ret = _mm_cvtpd_ps(inputVal1);
+ ret2 = _mm_cvtpd_ps(inputVal2);
- ret = _mm_movelh_ps(ret, ret2);
+ ret = _mm_movelh_ps(ret, ret2);
- _mm_storeu_ps(outputVectorPtr, ret);
- outputVectorPtr += 4;
- }
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+ }
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- outputVector[number] = (float)(inputVector[number]);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ outputVector[number] = (float)(inputVector[number]);
+ }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
-static inline void volk_64f_convert_32f_generic(float* outputVector, const double* inputVector, unsigned int num_points){
- float* outputVectorPtr = outputVector;
- const double* inputVectorPtr = inputVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((float)(*inputVectorPtr++));
- }
+static inline void volk_64f_convert_32f_generic(float* outputVector,
+ const double* inputVector,
+ unsigned int num_points)
+{
+ float* outputVectorPtr = outputVector;
+ const double* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++));
+ }
}
#endif /* LV_HAVE_GENERIC */
-
-
#endif /* INCLUDED_volk_64f_convert_32f_u_H */
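A minimal call sketch for the volk_64f_convert_32f dispatcher; the array size and fill values are illustrative, and any num_points works since the kernels handle the remainder scalar-wise:

#include <volk/volk.h>

int main(void)
{
    double in[16];
    float out[16];
    for (int i = 0; i < 16; ++i)
        in[i] = 0.5 * i; /* arbitrary test data */
    volk_64f_convert_32f(out, in, 16);
    return 0;
}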
#ifndef INCLUDED_volk_64f_convert_32f_a_H
#define INCLUDED_volk_64f_convert_32f_a_H
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
-static inline void volk_64f_convert_32f_a_avx512f(float* outputVector, const double* inputVector, unsigned int num_points){
- unsigned int number = 0;
+static inline void volk_64f_convert_32f_a_avx512f(float* outputVector,
+ const double* inputVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
- const unsigned int oneSixteenthPoints = num_points / 16;
+ const unsigned int oneSixteenthPoints = num_points / 16;
- const double* inputVectorPtr = (const double*)inputVector;
- float* outputVectorPtr = outputVector;
- __m256 ret1, ret2;
- __m512d inputVal1, inputVal2;
+ const double* inputVectorPtr = (const double*)inputVector;
+ float* outputVectorPtr = outputVector;
+ __m256 ret1, ret2;
+ __m512d inputVal1, inputVal2;
- for(;number < oneSixteenthPoints; number++){
- inputVal1 = _mm512_load_pd(inputVectorPtr); inputVectorPtr += 8;
- inputVal2 = _mm512_load_pd(inputVectorPtr); inputVectorPtr += 8;
+ for (; number < oneSixteenthPoints; number++) {
+ inputVal1 = _mm512_load_pd(inputVectorPtr);
+ inputVectorPtr += 8;
+ inputVal2 = _mm512_load_pd(inputVectorPtr);
+ inputVectorPtr += 8;
- ret1 = _mm512_cvtpd_ps(inputVal1);
- ret2 = _mm512_cvtpd_ps(inputVal2);
+ ret1 = _mm512_cvtpd_ps(inputVal1);
+ ret2 = _mm512_cvtpd_ps(inputVal2);
- _mm256_store_ps(outputVectorPtr, ret1);
- outputVectorPtr += 8;
+ _mm256_store_ps(outputVectorPtr, ret1);
+ outputVectorPtr += 8;
- _mm256_store_ps(outputVectorPtr, ret2);
- outputVectorPtr += 8;
- }
+ _mm256_store_ps(outputVectorPtr, ret2);
+ outputVectorPtr += 8;
+ }
- number = oneSixteenthPoints * 16;
- for(; number < num_points; number++){
- outputVector[number] = (float)(inputVector[number]);
- }
+ number = oneSixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ outputVector[number] = (float)(inputVector[number]);
+ }
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void volk_64f_convert_32f_a_avx(float* outputVector, const double* inputVector, unsigned int num_points){
- unsigned int number = 0;
+static inline void volk_64f_convert_32f_a_avx(float* outputVector,
+ const double* inputVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
- const unsigned int oneEightPoints = num_points / 8;
+ const unsigned int oneEightPoints = num_points / 8;
- const double* inputVectorPtr = (const double*)inputVector;
- float* outputVectorPtr = outputVector;
- __m128 ret1, ret2;
- __m256d inputVal1, inputVal2;
+ const double* inputVectorPtr = (const double*)inputVector;
+ float* outputVectorPtr = outputVector;
+ __m128 ret1, ret2;
+ __m256d inputVal1, inputVal2;
- for(;number < oneEightPoints; number++){
- inputVal1 = _mm256_load_pd(inputVectorPtr); inputVectorPtr += 4;
- inputVal2 = _mm256_load_pd(inputVectorPtr); inputVectorPtr += 4;
+ for (; number < oneEightPoints; number++) {
+ inputVal1 = _mm256_load_pd(inputVectorPtr);
+ inputVectorPtr += 4;
+ inputVal2 = _mm256_load_pd(inputVectorPtr);
+ inputVectorPtr += 4;
- ret1 = _mm256_cvtpd_ps(inputVal1);
- ret2 = _mm256_cvtpd_ps(inputVal2);
+ ret1 = _mm256_cvtpd_ps(inputVal1);
+ ret2 = _mm256_cvtpd_ps(inputVal2);
- _mm_store_ps(outputVectorPtr, ret1);
- outputVectorPtr += 4;
+ _mm_store_ps(outputVectorPtr, ret1);
+ outputVectorPtr += 4;
- _mm_store_ps(outputVectorPtr, ret2);
- outputVectorPtr += 4;
- }
+ _mm_store_ps(outputVectorPtr, ret2);
+ outputVectorPtr += 4;
+ }
- number = oneEightPoints * 8;
- for(; number < num_points; number++){
- outputVector[number] = (float)(inputVector[number]);
- }
+ number = oneEightPoints * 8;
+ for (; number < num_points; number++) {
+ outputVector[number] = (float)(inputVector[number]);
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void volk_64f_convert_32f_a_sse2(float* outputVector, const double* inputVector, unsigned int num_points){
- unsigned int number = 0;
+static inline void volk_64f_convert_32f_a_sse2(float* outputVector,
+ const double* inputVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ const unsigned int quarterPoints = num_points / 4;
- const double* inputVectorPtr = (const double*)inputVector;
- float* outputVectorPtr = outputVector;
- __m128 ret, ret2;
- __m128d inputVal1, inputVal2;
+ const double* inputVectorPtr = (const double*)inputVector;
+ float* outputVectorPtr = outputVector;
+ __m128 ret, ret2;
+ __m128d inputVal1, inputVal2;
- for(;number < quarterPoints; number++){
- inputVal1 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2;
- inputVal2 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2;
+ for (; number < quarterPoints; number++) {
+ inputVal1 = _mm_load_pd(inputVectorPtr);
+ inputVectorPtr += 2;
+ inputVal2 = _mm_load_pd(inputVectorPtr);
+ inputVectorPtr += 2;
- ret = _mm_cvtpd_ps(inputVal1);
- ret2 = _mm_cvtpd_ps(inputVal2);
+ ret = _mm_cvtpd_ps(inputVal1);
+ ret2 = _mm_cvtpd_ps(inputVal2);
- ret = _mm_movelh_ps(ret, ret2);
+ ret = _mm_movelh_ps(ret, ret2);
- _mm_store_ps(outputVectorPtr, ret);
- outputVectorPtr += 4;
- }
+ _mm_store_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+ }
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- outputVector[number] = (float)(inputVector[number]);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ outputVector[number] = (float)(inputVector[number]);
+ }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
-static inline void volk_64f_convert_32f_a_generic(float* outputVector, const double* inputVector, unsigned int num_points){
- float* outputVectorPtr = outputVector;
- const double* inputVectorPtr = inputVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((float)(*inputVectorPtr++));
- }
+static inline void volk_64f_convert_32f_a_generic(float* outputVector,
+ const double* inputVector,
+ unsigned int num_points)
+{
+ float* outputVectorPtr = outputVector;
+ const double* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++));
+ }
}
#endif /* LV_HAVE_GENERIC */
-
-
#endif /* INCLUDED_volk_64f_convert_32f_a_H */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_64f_x2_add_64f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
- * \endcode
+ * void volk_64f_x2_add_64f(double* cVector, const double* aVector, const double* bVector,
+ * unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li aVector: First input vector.
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_64f_x2_add_64f_generic(double *cVector, const double *aVector,
- const double *bVector, unsigned int num_points)
+static inline void volk_64f_x2_add_64f_generic(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
{
- double *cPtr = cVector;
- const double *aPtr = aVector;
- const double *bPtr = bVector;
- unsigned int number = 0;
-
- for (number = 0; number < num_points; number++) {
- *cPtr++ = (*aPtr++) + (*bPtr++);
- }
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
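A minimal call sketch for the volk_64f_x2_add_64f dispatcher (note the buffers are double, matching the kernels above); sizes and fill values are illustrative:

#include <volk/volk.h>

int main(void)
{
    double a[8], b[8], c[8];
    for (int i = 0; i < 8; ++i) {
        a[i] = i;        /* arbitrary test data */
        b[i] = 10.0 * i;
    }
    volk_64f_x2_add_64f(c, a, b, 8); /* c[i] = a[i] + b[i] */
    return 0;
}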
#include <emmintrin.h>
-static inline void
-volk_64f_x2_add_64f_u_sse2(double *cVector, const double *aVector,
- const double *bVector, unsigned int num_points)
+static inline void volk_64f_x2_add_64f_u_sse2(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int half_points = num_points / 2;
+ unsigned int number = 0;
+ const unsigned int half_points = num_points / 2;
- double *cPtr = cVector;
- const double *aPtr = aVector;
- const double *bPtr = bVector;
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
- __m128d aVal, bVal, cVal;
- for (; number < half_points; number++) {
- aVal = _mm_loadu_pd(aPtr);
- bVal = _mm_loadu_pd(bPtr);
+ __m128d aVal, bVal, cVal;
+ for (; number < half_points; number++) {
+ aVal = _mm_loadu_pd(aPtr);
+ bVal = _mm_loadu_pd(bPtr);
- cVal = _mm_add_pd(aVal, bVal);
+ cVal = _mm_add_pd(aVal, bVal);
- _mm_storeu_pd(cPtr, cVal); // Store the results back into the C container
+ _mm_storeu_pd(cPtr, cVal); // Store the results back into the C container
- aPtr += 2;
- bPtr += 2;
- cPtr += 2;
- }
+ aPtr += 2;
+ bPtr += 2;
+ cPtr += 2;
+ }
- number = half_points * 2;
- for (; number < num_points; number++) {
- *cPtr++ = (*aPtr++) + (*bPtr++);
- }
+ number = half_points * 2;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
}
#endif /* LV_HAVE_SSE2 */
#include <immintrin.h>
-static inline void
-volk_64f_x2_add_64f_u_avx(double *cVector, const double *aVector,
- const double *bVector, unsigned int num_points)
+static inline void volk_64f_x2_add_64f_u_avx(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarter_points = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarter_points = num_points / 4;
- double *cPtr = cVector;
- const double *aPtr = aVector;
- const double *bPtr = bVector;
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
- __m256d aVal, bVal, cVal;
- for (; number < quarter_points; number++) {
+ __m256d aVal, bVal, cVal;
+ for (; number < quarter_points; number++) {
- aVal = _mm256_loadu_pd(aPtr);
- bVal = _mm256_loadu_pd(bPtr);
+ aVal = _mm256_loadu_pd(aPtr);
+ bVal = _mm256_loadu_pd(bPtr);
- cVal = _mm256_add_pd(aVal, bVal);
+ cVal = _mm256_add_pd(aVal, bVal);
- _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container
+ _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
- number = quarter_points * 4;
- for (; number < num_points; number++) {
- *cPtr++ = (*aPtr++) + (*bPtr++);
- }
+ number = quarter_points * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
}
#endif /* LV_HAVE_AVX */
#include <emmintrin.h>
-static inline void
-volk_64f_x2_add_64f_a_sse2(double *cVector, const double *aVector,
- const double *bVector, unsigned int num_points)
+static inline void volk_64f_x2_add_64f_a_sse2(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int half_points = num_points / 2;
+ unsigned int number = 0;
+ const unsigned int half_points = num_points / 2;
- double *cPtr = cVector;
- const double *aPtr = aVector;
- const double *bPtr = bVector;
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
- __m128d aVal, bVal, cVal;
- for (; number < half_points; number++) {
- aVal = _mm_load_pd(aPtr);
- bVal = _mm_load_pd(bPtr);
+ __m128d aVal, bVal, cVal;
+ for (; number < half_points; number++) {
+ aVal = _mm_load_pd(aPtr);
+ bVal = _mm_load_pd(bPtr);
- cVal = _mm_add_pd(aVal, bVal);
+ cVal = _mm_add_pd(aVal, bVal);
- _mm_store_pd(cPtr, cVal); // Store the results back into the C container
+ _mm_store_pd(cPtr, cVal); // Store the results back into the C container
- aPtr += 2;
- bPtr += 2;
- cPtr += 2;
- }
+ aPtr += 2;
+ bPtr += 2;
+ cPtr += 2;
+ }
- number = half_points * 2;
- for (; number < num_points; number++) {
- *cPtr++ = (*aPtr++) + (*bPtr++);
- }
+ number = half_points * 2;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
}
#endif /* LV_HAVE_SSE2 */
#include <immintrin.h>
-static inline void
-volk_64f_x2_add_64f_a_avx(double *cVector, const double *aVector,
- const double *bVector, unsigned int num_points)
+static inline void volk_64f_x2_add_64f_a_avx(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarter_points = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarter_points = num_points / 4;
- double *cPtr = cVector;
- const double *aPtr = aVector;
- const double *bPtr = bVector;
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
- __m256d aVal, bVal, cVal;
- for (; number < quarter_points; number++) {
+ __m256d aVal, bVal, cVal;
+ for (; number < quarter_points; number++) {
- aVal = _mm256_load_pd(aPtr);
- bVal = _mm256_load_pd(bPtr);
+ aVal = _mm256_load_pd(aPtr);
+ bVal = _mm256_load_pd(bPtr);
- cVal = _mm256_add_pd(aVal, bVal);
+ cVal = _mm256_add_pd(aVal, bVal);
- _mm256_store_pd(cPtr, cVal); // Store the results back into the C container
+ _mm256_store_pd(cPtr, cVal); // Store the results back into the C container
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
- number = quarter_points * 4;
- for (; number < num_points; number++) {
- *cPtr++ = (*aPtr++) + (*bPtr++);
- }
+ number = quarter_points * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
}
#endif /* LV_HAVE_AVX */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_64f_x2_max_64f(double* cVector, const double* aVector, const double* bVector, unsigned int num_points)
- * \endcode
+ * void volk_64f_x2_max_64f(double* cVector, const double* aVector, const double* bVector,
+ * unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li aVector: First input vector.
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
-static inline void
-volk_64f_x2_max_64f_a_avx512f(double* cVector, const double* aVector,
- const double* bVector, unsigned int num_points)
+static inline void volk_64f_x2_max_64f_a_avx512f(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eigthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eigthPoints = num_points / 8;
- double* cPtr = cVector;
- const double* aPtr = aVector;
- const double* bPtr= bVector;
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
- __m512d aVal, bVal, cVal;
- for(;number < eigthPoints; number++){
+ __m512d aVal, bVal, cVal;
+ for (; number < eigthPoints; number++) {
- aVal = _mm512_load_pd(aPtr);
- bVal = _mm512_load_pd(bPtr);
+ aVal = _mm512_load_pd(aPtr);
+ bVal = _mm512_load_pd(bPtr);
- cVal = _mm512_max_pd(aVal, bVal);
+ cVal = _mm512_max_pd(aVal, bVal);
- _mm512_store_pd(cPtr,cVal); // Store the results back into the C container
+ _mm512_store_pd(cPtr, cVal); // Store the results back into the C container
- aPtr += 8;
- bPtr += 8;
- cPtr += 8;
- }
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
- number = eigthPoints * 8;
- for(;number < num_points; number++){
- const double a = *aPtr++;
- const double b = *bPtr++;
- *cPtr++ = ( a > b ? a : b);
- }
+ number = eigthPoints * 8;
+ for (; number < num_points; number++) {
+ const double a = *aPtr++;
+ const double b = *bPtr++;
+ *cPtr++ = (a > b ? a : b);
+ }
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_64f_x2_max_64f_a_avx(double* cVector, const double* aVector,
- const double* bVector, unsigned int num_points)
+static inline void volk_64f_x2_max_64f_a_avx(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- double* cPtr = cVector;
- const double* aPtr = aVector;
- const double* bPtr= bVector;
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
- __m256d aVal, bVal, cVal;
- for(;number < quarterPoints; number++){
+ __m256d aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
- aVal = _mm256_load_pd(aPtr);
- bVal = _mm256_load_pd(bPtr);
+ aVal = _mm256_load_pd(aPtr);
+ bVal = _mm256_load_pd(bPtr);
- cVal = _mm256_max_pd(aVal, bVal);
+ cVal = _mm256_max_pd(aVal, bVal);
- _mm256_store_pd(cPtr,cVal); // Store the results back into the C container
+ _mm256_store_pd(cPtr, cVal); // Store the results back into the C container
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- const double a = *aPtr++;
- const double b = *bPtr++;
- *cPtr++ = ( a > b ? a : b);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ const double a = *aPtr++;
+ const double b = *bPtr++;
+ *cPtr++ = (a > b ? a : b);
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void
-volk_64f_x2_max_64f_a_sse2(double* cVector, const double* aVector,
- const double* bVector, unsigned int num_points)
+static inline void volk_64f_x2_max_64f_a_sse2(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int halfPoints = num_points / 2;
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
- double* cPtr = cVector;
- const double* aPtr = aVector;
- const double* bPtr= bVector;
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
- __m128d aVal, bVal, cVal;
- for(;number < halfPoints; number++){
+ __m128d aVal, bVal, cVal;
+ for (; number < halfPoints; number++) {
- aVal = _mm_load_pd(aPtr);
- bVal = _mm_load_pd(bPtr);
+ aVal = _mm_load_pd(aPtr);
+ bVal = _mm_load_pd(bPtr);
- cVal = _mm_max_pd(aVal, bVal);
+ cVal = _mm_max_pd(aVal, bVal);
- _mm_store_pd(cPtr,cVal); // Store the results back into the C container
+ _mm_store_pd(cPtr, cVal); // Store the results back into the C container
- aPtr += 2;
- bPtr += 2;
- cPtr += 2;
- }
+ aPtr += 2;
+ bPtr += 2;
+ cPtr += 2;
+ }
- number = halfPoints * 2;
- for(;number < num_points; number++){
- const double a = *aPtr++;
- const double b = *bPtr++;
- *cPtr++ = ( a > b ? a : b);
- }
+ number = halfPoints * 2;
+ for (; number < num_points; number++) {
+ const double a = *aPtr++;
+ const double b = *bPtr++;
+ *cPtr++ = (a > b ? a : b);
+ }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_64f_x2_max_64f_generic(double* cVector, const double* aVector,
- const double* bVector, unsigned int num_points)
+static inline void volk_64f_x2_max_64f_generic(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
{
- double* cPtr = cVector;
- const double* aPtr = aVector;
- const double* bPtr= bVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- const double a = *aPtr++;
- const double b = *bPtr++;
- *cPtr++ = ( a > b ? a : b);
- }
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ const double a = *aPtr++;
+ const double b = *bPtr++;
+ *cPtr++ = (a > b ? a : b);
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
-static inline void
-volk_64f_x2_max_64f_u_avx512f(double* cVector, const double* aVector,
- const double* bVector, unsigned int num_points)
+static inline void volk_64f_x2_max_64f_u_avx512f(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eigthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eigthPoints = num_points / 8;
- double* cPtr = cVector;
- const double* aPtr = aVector;
- const double* bPtr= bVector;
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
- __m512d aVal, bVal, cVal;
- for(;number < eigthPoints; number++){
+ __m512d aVal, bVal, cVal;
+ for (; number < eigthPoints; number++) {
- aVal = _mm512_loadu_pd(aPtr);
- bVal = _mm512_loadu_pd(bPtr);
+ aVal = _mm512_loadu_pd(aPtr);
+ bVal = _mm512_loadu_pd(bPtr);
- cVal = _mm512_max_pd(aVal, bVal);
+ cVal = _mm512_max_pd(aVal, bVal);
- _mm512_storeu_pd(cPtr,cVal); // Store the results back into the C container
+ _mm512_storeu_pd(cPtr, cVal); // Store the results back into the C container
- aPtr += 8;
- bPtr += 8;
- cPtr += 8;
- }
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
- number = eigthPoints * 8;
- for(;number < num_points; number++){
- const double a = *aPtr++;
- const double b = *bPtr++;
- *cPtr++ = ( a > b ? a : b);
- }
+ number = eigthPoints * 8;
+ for (; number < num_points; number++) {
+ const double a = *aPtr++;
+ const double b = *bPtr++;
+ *cPtr++ = (a > b ? a : b);
+ }
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_64f_x2_max_64f_u_avx(double* cVector, const double* aVector,
- const double* bVector, unsigned int num_points)
+static inline void volk_64f_x2_max_64f_u_avx(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- double* cPtr = cVector;
- const double* aPtr = aVector;
- const double* bPtr= bVector;
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
- __m256d aVal, bVal, cVal;
- for(;number < quarterPoints; number++){
+ __m256d aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
- aVal = _mm256_loadu_pd(aPtr);
- bVal = _mm256_loadu_pd(bPtr);
+ aVal = _mm256_loadu_pd(aPtr);
+ bVal = _mm256_loadu_pd(bPtr);
- cVal = _mm256_max_pd(aVal, bVal);
+ cVal = _mm256_max_pd(aVal, bVal);
- _mm256_storeu_pd(cPtr,cVal); // Store the results back into the C container
+ _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- const double a = *aPtr++;
- const double b = *bPtr++;
- *cPtr++ = ( a > b ? a : b);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ const double a = *aPtr++;
+ const double b = *bPtr++;
+ *cPtr++ = (a > b ? a : b);
+ }
}
#endif /* LV_HAVE_AVX */
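/*
 * Illustrative usage sketch (not part of the formatted kernels above): calling
 * the volk_64f_x2_max_64f dispatcher on VOLK-aligned buffers.  Assumes the
 * public VOLK allocation helpers (volk_get_alignment, volk_malloc, volk_free)
 * from <volk/volk.h>; the buffer length N is arbitrary.
 */
#include <volk/volk.h>

static void example_64f_x2_max_usage(unsigned int N)
{
    size_t alignment = volk_get_alignment();
    double* a = (double*)volk_malloc(N * sizeof(double), alignment);
    double* b = (double*)volk_malloc(N * sizeof(double), alignment);
    double* c = (double*)volk_malloc(N * sizeof(double), alignment);

    for (unsigned int i = 0; i < N; i++) {
        a[i] = (double)i;
        b[i] = (double)(N - i);
    }

    // The dispatcher selects the best protokernel for the running machine.
    volk_64f_x2_max_64f(c, a, b, N);

    volk_free(a);
    volk_free(b);
    volk_free(c);
}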
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_64f_x2_min_64f(double* cVector, const double* aVector, const double* bVector, unsigned int num_points)
+ * void volk_64f_x2_min_64f(double* cVector, const double* aVector, const double* bVector,
+ * unsigned int num_points)
* \endcode
*
* \b Inputs
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
-static inline void
-volk_64f_x2_min_64f_a_avx512f(double* cVector, const double* aVector,
- const double* bVector, unsigned int num_points)
+static inline void volk_64f_x2_min_64f_a_avx512f(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eigthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eigthPoints = num_points / 8;
- double* cPtr = cVector;
- const double* aPtr = aVector;
- const double* bPtr= bVector;
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
- __m512d aVal, bVal, cVal;
- for(;number < eigthPoints; number++){
+ __m512d aVal, bVal, cVal;
+ for (; number < eigthPoints; number++) {
- aVal = _mm512_load_pd(aPtr);
- bVal = _mm512_load_pd(bPtr);
+ aVal = _mm512_load_pd(aPtr);
+ bVal = _mm512_load_pd(bPtr);
- cVal = _mm512_min_pd(aVal, bVal);
+ cVal = _mm512_min_pd(aVal, bVal);
- _mm512_store_pd(cPtr,cVal); // Store the results back into the C container
+ _mm512_store_pd(cPtr, cVal); // Store the results back into the C container
- aPtr += 8;
- bPtr += 8;
- cPtr += 8;
- }
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
- number = eigthPoints * 8;
- for(;number < num_points; number++){
- const double a = *aPtr++;
- const double b = *bPtr++;
- *cPtr++ = ( a < b ? a : b);
- }
+ number = eigthPoints * 8;
+ for (; number < num_points; number++) {
+ const double a = *aPtr++;
+ const double b = *bPtr++;
+ *cPtr++ = (a < b ? a : b);
+ }
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_64f_x2_min_64f_a_avx(double* cVector, const double* aVector,
- const double* bVector, unsigned int num_points)
+static inline void volk_64f_x2_min_64f_a_avx(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- double* cPtr = cVector;
- const double* aPtr = aVector;
- const double* bPtr= bVector;
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
- __m256d aVal, bVal, cVal;
- for(;number < quarterPoints; number++){
+ __m256d aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
- aVal = _mm256_load_pd(aPtr);
- bVal = _mm256_load_pd(bPtr);
+ aVal = _mm256_load_pd(aPtr);
+ bVal = _mm256_load_pd(bPtr);
- cVal = _mm256_min_pd(aVal, bVal);
+ cVal = _mm256_min_pd(aVal, bVal);
- _mm256_store_pd(cPtr,cVal); // Store the results back into the C container
+ _mm256_store_pd(cPtr, cVal); // Store the results back into the C container
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- const double a = *aPtr++;
- const double b = *bPtr++;
- *cPtr++ = ( a < b ? a : b);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ const double a = *aPtr++;
+ const double b = *bPtr++;
+ *cPtr++ = (a < b ? a : b);
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void
-volk_64f_x2_min_64f_a_sse2(double* cVector, const double* aVector,
- const double* bVector, unsigned int num_points)
+static inline void volk_64f_x2_min_64f_a_sse2(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int halfPoints = num_points / 2;
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
- double* cPtr = cVector;
- const double* aPtr = aVector;
- const double* bPtr= bVector;
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
- __m128d aVal, bVal, cVal;
- for(;number < halfPoints; number++){
+ __m128d aVal, bVal, cVal;
+ for (; number < halfPoints; number++) {
- aVal = _mm_load_pd(aPtr);
- bVal = _mm_load_pd(bPtr);
+ aVal = _mm_load_pd(aPtr);
+ bVal = _mm_load_pd(bPtr);
- cVal = _mm_min_pd(aVal, bVal);
+ cVal = _mm_min_pd(aVal, bVal);
- _mm_store_pd(cPtr,cVal); // Store the results back into the C container
+ _mm_store_pd(cPtr, cVal); // Store the results back into the C container
- aPtr += 2;
- bPtr += 2;
- cPtr += 2;
- }
+ aPtr += 2;
+ bPtr += 2;
+ cPtr += 2;
+ }
- number = halfPoints * 2;
- for(;number < num_points; number++){
- const double a = *aPtr++;
- const double b = *bPtr++;
- *cPtr++ = ( a < b ? a : b);
- }
+ number = halfPoints * 2;
+ for (; number < num_points; number++) {
+ const double a = *aPtr++;
+ const double b = *bPtr++;
+ *cPtr++ = (a < b ? a : b);
+ }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_64f_x2_min_64f_generic(double* cVector, const double* aVector,
- const double* bVector, unsigned int num_points)
+static inline void volk_64f_x2_min_64f_generic(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
{
- double* cPtr = cVector;
- const double* aPtr = aVector;
- const double* bPtr= bVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- const double a = *aPtr++;
- const double b = *bPtr++;
- *cPtr++ = ( a < b ? a : b);
- }
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ const double a = *aPtr++;
+ const double b = *bPtr++;
+ *cPtr++ = (a < b ? a : b);
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
-static inline void
-volk_64f_x2_min_64f_u_avx512f(double* cVector, const double* aVector,
- const double* bVector, unsigned int num_points)
+static inline void volk_64f_x2_min_64f_u_avx512f(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int eigthPoints = num_points / 8;
+ unsigned int number = 0;
+ const unsigned int eigthPoints = num_points / 8;
- double* cPtr = cVector;
- const double* aPtr = aVector;
- const double* bPtr= bVector;
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
- __m512d aVal, bVal, cVal;
- for(;number < eigthPoints; number++){
+ __m512d aVal, bVal, cVal;
+ for (; number < eigthPoints; number++) {
- aVal = _mm512_loadu_pd(aPtr);
- bVal = _mm512_loadu_pd(bPtr);
+ aVal = _mm512_loadu_pd(aPtr);
+ bVal = _mm512_loadu_pd(bPtr);
- cVal = _mm512_min_pd(aVal, bVal);
+ cVal = _mm512_min_pd(aVal, bVal);
- _mm512_storeu_pd(cPtr,cVal); // Store the results back into the C container
+ _mm512_storeu_pd(cPtr, cVal); // Store the results back into the C container
- aPtr += 8;
- bPtr += 8;
- cPtr += 8;
- }
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
- number = eigthPoints * 8;
- for(;number < num_points; number++){
- const double a = *aPtr++;
- const double b = *bPtr++;
- *cPtr++ = ( a < b ? a : b);
- }
+ number = eigthPoints * 8;
+ for (; number < num_points; number++) {
+ const double a = *aPtr++;
+ const double b = *bPtr++;
+ *cPtr++ = (a < b ? a : b);
+ }
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_64f_x2_min_64f_u_avx(double* cVector, const double* aVector,
- const double* bVector, unsigned int num_points)
+static inline void volk_64f_x2_min_64f_u_avx(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- double* cPtr = cVector;
- const double* aPtr = aVector;
- const double* bPtr= bVector;
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
- __m256d aVal, bVal, cVal;
- for(;number < quarterPoints; number++){
+ __m256d aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
- aVal = _mm256_loadu_pd(aPtr);
- bVal = _mm256_loadu_pd(bPtr);
+ aVal = _mm256_loadu_pd(aPtr);
+ bVal = _mm256_loadu_pd(bPtr);
- cVal = _mm256_min_pd(aVal, bVal);
+ cVal = _mm256_min_pd(aVal, bVal);
- _mm256_storeu_pd(cPtr,cVal); // Store the results back into the C container
+ _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- const double a = *aPtr++;
- const double b = *bPtr++;
- *cPtr++ = ( a < b ? a : b);
- }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ const double a = *aPtr++;
+ const double b = *bPtr++;
+ *cPtr++ = (a < b ? a : b);
+ }
}
#endif /* LV_HAVE_AVX */
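/*
 * Note on the scalar tail loops above (illustrative, not part of this change):
 * _mm*_max_pd/_mm*_min_pd return the second operand when the inputs are
 * unordered, and the ternary fallback does the same, so the SIMD body and the
 * tail agree on NaN handling:
 *
 *     double a = NAN, b = 1.0;      // NAN from <math.h>
 *     double mx = (a > b ? a : b);  // comparison is false, so mx == b
 *     double mn = (a < b ? a : b);  // comparison is false, so mn == b
 */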
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_64f_x2_multiply_64f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
- * \endcode
+ * void volk_64f_x2_multiply_64f(double* cVector, const double* aVector,
+ *                               const double* bVector, unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li aVector: First input vector.
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_64f_x2_multiply_64f_generic(double *cVector, const double *aVector,
- const double *bVector, unsigned int num_points)
+static inline void volk_64f_x2_multiply_64f_generic(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
{
- double *cPtr = cVector;
- const double *aPtr = aVector;
- const double *bPtr = bVector;
- unsigned int number = 0;
-
- for (number = 0; number < num_points; number++) {
- *cPtr++ = (*aPtr++) * (*bPtr++);
- }
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
#include <emmintrin.h>
-static inline void
-volk_64f_x2_multiply_64f_u_sse2(double *cVector, const double *aVector,
- const double *bVector, unsigned int num_points)
+static inline void volk_64f_x2_multiply_64f_u_sse2(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int half_points = num_points / 2;
+ unsigned int number = 0;
+ const unsigned int half_points = num_points / 2;
- double *cPtr = cVector;
- const double *aPtr = aVector;
- const double *bPtr = bVector;
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
- __m128d aVal, bVal, cVal;
- for (; number < half_points; number++) {
- aVal = _mm_loadu_pd(aPtr);
- bVal = _mm_loadu_pd(bPtr);
+ __m128d aVal, bVal, cVal;
+ for (; number < half_points; number++) {
+ aVal = _mm_loadu_pd(aPtr);
+ bVal = _mm_loadu_pd(bPtr);
- cVal = _mm_mul_pd(aVal, bVal);
+ cVal = _mm_mul_pd(aVal, bVal);
- _mm_storeu_pd(cPtr, cVal); // Store the results back into the C container
+ _mm_storeu_pd(cPtr, cVal); // Store the results back into the C container
- aPtr += 2;
- bPtr += 2;
- cPtr += 2;
- }
+ aPtr += 2;
+ bPtr += 2;
+ cPtr += 2;
+ }
- number = half_points * 2;
- for (; number < num_points; number++) {
- *cPtr++ = (*aPtr++) * (*bPtr++);
- }
+ number = half_points * 2;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
}
#endif /* LV_HAVE_SSE2 */
#include <immintrin.h>
-static inline void
-volk_64f_x2_multiply_64f_u_avx(double *cVector, const double *aVector,
- const double *bVector, unsigned int num_points)
+static inline void volk_64f_x2_multiply_64f_u_avx(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarter_points = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarter_points = num_points / 4;
- double *cPtr = cVector;
- const double *aPtr = aVector;
- const double *bPtr = bVector;
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
- __m256d aVal, bVal, cVal;
- for (; number < quarter_points; number++) {
+ __m256d aVal, bVal, cVal;
+ for (; number < quarter_points; number++) {
- aVal = _mm256_loadu_pd(aPtr);
- bVal = _mm256_loadu_pd(bPtr);
+ aVal = _mm256_loadu_pd(aPtr);
+ bVal = _mm256_loadu_pd(bPtr);
- cVal = _mm256_mul_pd(aVal, bVal);
+ cVal = _mm256_mul_pd(aVal, bVal);
- _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container
+ _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
- number = quarter_points * 4;
- for (; number < num_points; number++) {
- *cPtr++ = (*aPtr++) * (*bPtr++);
- }
+ number = quarter_points * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
}
#endif /* LV_HAVE_AVX */
#include <emmintrin.h>
-static inline void
-volk_64f_x2_multiply_64f_a_sse2(double *cVector, const double *aVector,
- const double *bVector, unsigned int num_points)
+static inline void volk_64f_x2_multiply_64f_a_sse2(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int half_points = num_points / 2;
+ unsigned int number = 0;
+ const unsigned int half_points = num_points / 2;
- double *cPtr = cVector;
- const double *aPtr = aVector;
- const double *bPtr = bVector;
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
- __m128d aVal, bVal, cVal;
- for (; number < half_points; number++) {
- aVal = _mm_load_pd(aPtr);
- bVal = _mm_load_pd(bPtr);
+ __m128d aVal, bVal, cVal;
+ for (; number < half_points; number++) {
+ aVal = _mm_load_pd(aPtr);
+ bVal = _mm_load_pd(bPtr);
- cVal = _mm_mul_pd(aVal, bVal);
+ cVal = _mm_mul_pd(aVal, bVal);
- _mm_store_pd(cPtr, cVal); // Store the results back into the C container
+ _mm_store_pd(cPtr, cVal); // Store the results back into the C container
- aPtr += 2;
- bPtr += 2;
- cPtr += 2;
- }
+ aPtr += 2;
+ bPtr += 2;
+ cPtr += 2;
+ }
- number = half_points * 2;
- for (; number < num_points; number++) {
- *cPtr++ = (*aPtr++) * (*bPtr++);
- }
+ number = half_points * 2;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
}
#endif /* LV_HAVE_SSE2 */
#include <immintrin.h>
-static inline void
-volk_64f_x2_multiply_64f_a_avx(double *cVector, const double *aVector,
- const double *bVector, unsigned int num_points)
+static inline void volk_64f_x2_multiply_64f_a_avx(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarter_points = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarter_points = num_points / 4;
- double *cPtr = cVector;
- const double *aPtr = aVector;
- const double *bPtr = bVector;
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
- __m256d aVal, bVal, cVal;
- for (; number < quarter_points; number++) {
+ __m256d aVal, bVal, cVal;
+ for (; number < quarter_points; number++) {
- aVal = _mm256_load_pd(aPtr);
- bVal = _mm256_load_pd(bPtr);
+ aVal = _mm256_load_pd(aPtr);
+ bVal = _mm256_load_pd(bPtr);
- cVal = _mm256_mul_pd(aVal, bVal);
+ cVal = _mm256_mul_pd(aVal, bVal);
- _mm256_store_pd(cPtr, cVal); // Store the results back into the C container
+ _mm256_store_pd(cPtr, cVal); // Store the results back into the C container
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
- number = quarter_points * 4;
- for (; number < num_points; number++) {
- *cPtr++ = (*aPtr++) * (*bPtr++);
- }
+ number = quarter_points * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
}
#endif /* LV_HAVE_AVX */
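/*
 * Note on the _u / _a protokernel suffixes above (general VOLK convention,
 * illustrative only): _u variants use unaligned loads/stores (_mm*_loadu_*),
 * while _a variants assume buffers aligned to the SIMD width and use the
 * aligned forms.  A caller-side sketch of the distinction, assuming
 * volk_is_aligned() from <volk/volk.h>:
 *
 *     if (volk_is_aligned(cVector) && volk_is_aligned(aVector) &&
 *         volk_is_aligned(bVector))
 *         volk_64f_x2_multiply_64f_a_avx(cVector, aVector, bVector, num_points);
 *     else
 *         volk_64f_x2_multiply_64f_u_avx(cVector, aVector, bVector, num_points);
 */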
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points){
+static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points)
+{
uint32_t* inputPtr = (uint32_t*)intsToSwap;
__m128i input, byte1, byte2, byte3, byte4, output;
__m128i byte2mask = _mm_set1_epi32(0x00FF0000);
__m128i byte3mask = _mm_set1_epi32(0x0000FF00);
uint64_t number = 0;
const unsigned int halfPoints = num_points / 2;
- for(;number < halfPoints; number++){
- // Load the 32t values, increment inputPtr later since we're doing it in-place.
- input = _mm_loadu_si128((__m128i*)inputPtr);
-
- // Do the four shifts
- byte1 = _mm_slli_epi32(input, 24);
- byte2 = _mm_slli_epi32(input, 8);
- byte3 = _mm_srli_epi32(input, 8);
- byte4 = _mm_srli_epi32(input, 24);
- // Or bytes together
- output = _mm_or_si128(byte1, byte4);
- byte2 = _mm_and_si128(byte2, byte2mask);
- output = _mm_or_si128(output, byte2);
- byte3 = _mm_and_si128(byte3, byte3mask);
- output = _mm_or_si128(output, byte3);
-
- // Reorder the two words
- output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
-
- // Store the results
- _mm_storeu_si128((__m128i*)inputPtr, output);
- inputPtr += 4;
+ for (; number < halfPoints; number++) {
+        // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
+ input = _mm_loadu_si128((__m128i*)inputPtr);
+
+ // Do the four shifts
+ byte1 = _mm_slli_epi32(input, 24);
+ byte2 = _mm_slli_epi32(input, 8);
+ byte3 = _mm_srli_epi32(input, 8);
+ byte4 = _mm_srli_epi32(input, 24);
+ // Or bytes together
+ output = _mm_or_si128(byte1, byte4);
+ byte2 = _mm_and_si128(byte2, byte2mask);
+ output = _mm_or_si128(output, byte2);
+ byte3 = _mm_and_si128(byte3, byte3mask);
+ output = _mm_or_si128(output, byte3);
+
+ // Reorder the two words
+ output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
+
+ // Store the results
+ _mm_storeu_si128((__m128i*)inputPtr, output);
+ inputPtr += 4;
}
// Byteswap any remaining points:
- number = halfPoints*2;
- for(; number < num_points; number++){
- uint32_t output1 = *inputPtr;
- uint32_t output2 = inputPtr[1];
+ number = halfPoints * 2;
+ for (; number < num_points; number++) {
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
- output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
+ ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
- output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
+ ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
- *inputPtr++ = output2;
- *inputPtr++ = output1;
+ *inputPtr++ = output2;
+ *inputPtr++ = output1;
}
}
#endif /* LV_HAVE_SSE2 */
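/*
 * Worked trace of the shift/mask sequence above on a single 32-bit word
 * (illustrative values only):
 *
 *     input  = 0x11223344
 *     byte1  = input << 24                    = 0x44000000
 *     byte2  = (input <<  8) & 0x00FF0000     = 0x00330000
 *     byte3  = (input >>  8) & 0x0000FF00     = 0x00002200
 *     byte4  = input >> 24                    = 0x00000011
 *     output = byte1 | byte2 | byte3 | byte4  = 0x44332211
 *
 * The final _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)) then exchanges
 * the two 32-bit words of each 64-bit element, completing the 64-bit byteswap.
 */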
-
#ifdef LV_HAVE_GENERIC
-static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap, unsigned int num_points){
- uint32_t* inputPtr = (uint32_t*)intsToSwap;
- unsigned int point;
- for(point = 0; point < num_points; point++){
- uint32_t output1 = *inputPtr;
- uint32_t output2 = inputPtr[1];
+static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap,
+ unsigned int num_points)
+{
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+ unsigned int point;
+ for (point = 0; point < num_points; point++) {
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
- output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
+ ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
- output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
+ ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
- *inputPtr++ = output2;
- *inputPtr++ = output1;
- }
+ *inputPtr++ = output2;
+ *inputPtr++ = output1;
+ }
}
#endif /* LV_HAVE_GENERIC */
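/*
 * Minimal self-check sketch (not part of this change): the generic kernel's
 * two 32-bit swaps plus word exchange are equivalent to a full 64-bit
 * byteswap.  The GCC/Clang builtin serves as the reference; the test value
 * is arbitrary.
 */
#include <assert.h>
#include <stdint.h>

static inline void example_check_byteswap_generic(void)
{
    uint64_t x = UINT64_C(0x0102030405060708);
    volk_64u_byteswap_generic(&x, 1);
    assert(x == __builtin_bswap64(UINT64_C(0x0102030405060708)));
}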
#include <immintrin.h>
static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap, unsigned int num_points)
{
- unsigned int number = 0;
-
- const unsigned int nPerSet = 4;
- const uint64_t nSets = num_points / nPerSet;
+ unsigned int number = 0;
- uint32_t* inputPtr = (uint32_t*)intsToSwap;
+ const unsigned int nPerSet = 4;
+ const uint64_t nSets = num_points / nPerSet;
- const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
- const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector[0]);
+ const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13,
+ 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18,
+ 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
- for ( ;number < nSets; number++ ) {
+ const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
- // Load the 32t values, increment inputPtr later since we're doing it in-place.
- const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
- const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
+ for (; number < nSets; number++) {
- // Store the results
- _mm256_store_si256((__m256i*)inputPtr, output);
+        // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
+ const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
+ const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
- /* inputPtr is 32bit so increment twice */
- inputPtr += 2 * nPerSet;
- }
- _mm256_zeroupper();
+ // Store the results
+ _mm256_store_si256((__m256i*)inputPtr, output);
- // Byteswap any remaining points:
- for(number = nSets * nPerSet; number < num_points; ++number ) {
- uint32_t output1 = *inputPtr;
- uint32_t output2 = inputPtr[1];
- uint32_t out1 = ((((output1) >> 24) & 0x000000ff) |
- (((output1) >> 8) & 0x0000ff00) |
- (((output1) << 8) & 0x00ff0000) |
- (((output1) << 24) & 0xff000000) );
+ /* inputPtr is 32bit so increment twice */
+ inputPtr += 2 * nPerSet;
+ }
+ _mm256_zeroupper();
- uint32_t out2 = ((((output2) >> 24) & 0x000000ff) |
- (((output2) >> 8) & 0x0000ff00) |
- (((output2) << 8) & 0x00ff0000) |
- (((output2) << 24) & 0xff000000) );
- *inputPtr++ = out2;
- *inputPtr++ = out1;
- }
+ // Byteswap any remaining points:
+ for (number = nSets * nPerSet; number < num_points; ++number) {
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
+ uint32_t out1 =
+ ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
+ (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
+
+ uint32_t out2 =
+ ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
+ (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
+ *inputPtr++ = out2;
+ *inputPtr++ = out1;
+ }
}
#endif /* LV_HAVE_AVX2 */
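/*
 * Note on the 32-byte shuffleVector above (illustrative): _mm256_shuffle_epi8
 * permutes bytes independently within each 128-bit lane and uses only the low
 * four bits of each index (a set bit 7 would zero the byte), roughly
 *
 *     dst_byte = (idx & 0x80) ? 0 : lane_bytes[idx & 0x0F];
 *
 * so the entries 16..31 in the upper half behave like 0..15 within that lane,
 * and each group of eight indices {7,6,5,4,3,2,1,0,...} reverses one 64-bit
 * element in place.
 */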
#if LV_HAVE_SSSE3
#include <tmmintrin.h>
-static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap, unsigned int num_points)
+static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap,
+ unsigned int num_points)
{
- unsigned int number = 0;
+ unsigned int number = 0;
- const unsigned int nPerSet = 2;
- const uint64_t nSets = num_points / nPerSet;
+ const unsigned int nPerSet = 2;
+ const uint64_t nSets = num_points / nPerSet;
- uint32_t* inputPtr = (uint32_t*)intsToSwap;
-
- uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
- const __m128i myShuffle = _mm_loadu_si128((__m128i*) &shuffleVector);
+ uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
- for ( ;number < nSets; number++ ) {
+ const __m128i myShuffle = _mm_loadu_si128((__m128i*)&shuffleVector);
- // Load the 32t values, increment inputPtr later since we're doing it in-place.
- const __m128i input = _mm_load_si128((__m128i*)inputPtr);
- const __m128i output = _mm_shuffle_epi8(input,myShuffle);
+ for (; number < nSets; number++) {
- // Store the results
- _mm_store_si128((__m128i*)inputPtr, output);
+        // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
+ const __m128i input = _mm_load_si128((__m128i*)inputPtr);
+ const __m128i output = _mm_shuffle_epi8(input, myShuffle);
- /* inputPtr is 32bit so increment twice */
- inputPtr += 2 * nPerSet;
- }
+ // Store the results
+ _mm_store_si128((__m128i*)inputPtr, output);
- // Byteswap any remaining points:
- for(number = nSets * nPerSet; number < num_points; ++number ) {
- uint32_t output1 = *inputPtr;
- uint32_t output2 = inputPtr[1];
- uint32_t out1 = ((((output1) >> 24) & 0x000000ff) |
- (((output1) >> 8) & 0x0000ff00) |
- (((output1) << 8) & 0x00ff0000) |
- (((output1) << 24) & 0xff000000) );
+ /* inputPtr is 32bit so increment twice */
+ inputPtr += 2 * nPerSet;
+ }
- uint32_t out2 = ((((output2) >> 24) & 0x000000ff) |
- (((output2) >> 8) & 0x0000ff00) |
- (((output2) << 8) & 0x00ff0000) |
- (((output2) << 24) & 0xff000000) );
- *inputPtr++ = out2;
- *inputPtr++ = out1;
- }
+ // Byteswap any remaining points:
+ for (number = nSets * nPerSet; number < num_points; ++number) {
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
+ uint32_t out1 =
+ ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
+ (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
+
+ uint32_t out2 =
+ ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
+ (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
+ *inputPtr++ = out2;
+ *inputPtr++ = out1;
+ }
}
#endif /* LV_HAVE_SSSE3 */
#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>
-static inline void volk_64u_byteswap_neonv8(uint64_t* intsToSwap, unsigned int num_points){
- uint32_t* inputPtr = (uint32_t*)intsToSwap;
- const unsigned int n4points = num_points / 4;
- uint8x16x2_t input;
- uint8x16_t idx = { 7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8 };
-
- unsigned int number = 0;
- for(number = 0; number < n4points; ++number){
- __VOLK_PREFETCH(inputPtr+8);
- input = vld2q_u8((uint8_t*) inputPtr);
- input.val[0] = vqtbl1q_u8(input.val[0], idx);
- input.val[1] = vqtbl1q_u8(input.val[1], idx);
- vst2q_u8((uint8_t*) inputPtr, input);
-
- inputPtr += 8;
- }
-
- for(number = n4points * 4; number < num_points; ++number){
- uint32_t output1 = *inputPtr;
- uint32_t output2 = inputPtr[1];
+static inline void volk_64u_byteswap_neonv8(uint64_t* intsToSwap, unsigned int num_points)
+{
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+ const unsigned int n4points = num_points / 4;
+ uint8x16x2_t input;
+ uint8x16_t idx = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+
+ unsigned int number = 0;
+ for (number = 0; number < n4points; ++number) {
+ __VOLK_PREFETCH(inputPtr + 8);
+ input = vld2q_u8((uint8_t*)inputPtr);
+ input.val[0] = vqtbl1q_u8(input.val[0], idx);
+ input.val[1] = vqtbl1q_u8(input.val[1], idx);
+ vst2q_u8((uint8_t*)inputPtr, input);
+
+ inputPtr += 8;
+ }
- output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
- output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+ for (number = n4points * 4; number < num_points; ++number) {
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
- *inputPtr++ = output2;
- *inputPtr++ = output1;
- }
+ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
+ ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
+ ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+ *inputPtr++ = output2;
+ *inputPtr++ = output1;
+ }
}
#else
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num_points){
- uint32_t* inputPtr = (uint32_t*)intsToSwap;
- unsigned int number = 0;
- unsigned int n8points = num_points / 4;
-
- uint8x8x4_t input_table;
- uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
- uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
-
- /* these magic numbers are used as byte-indices in the LUT.
- they are pre-computed to save time. A simple C program
- can calculate them; for example for lookup01:
- uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1};
- for(ii=0; ii < 8; ++ii) {
- index += ((uint64_t)(*(chars+ii))) << (ii*8);
+static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num_points)
+{
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+ unsigned int number = 0;
+ unsigned int n8points = num_points / 4;
+
+ uint8x8x4_t input_table;
+ uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
+ uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
+
+ /* these magic numbers are used as byte-indices in the LUT.
+ they are pre-computed to save time. A simple C program
+ can calculate them; for example for lookup01:
+ uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1};
+ for(ii=0; ii < 8; ++ii) {
+ index += ((uint64_t)(*(chars+ii))) << (ii*8);
+ }
+ */
+ int_lookup01 = vcreate_u8(2269495096316185);
+ int_lookup23 = vcreate_u8(146949840772469531);
+ int_lookup45 = vcreate_u8(291630186448622877);
+ int_lookup67 = vcreate_u8(436310532124776223);
+
+ for (number = 0; number < n8points; ++number) {
+ input_table = vld4_u8((uint8_t*)inputPtr);
+ swapped_int01 = vtbl4_u8(input_table, int_lookup01);
+ swapped_int23 = vtbl4_u8(input_table, int_lookup23);
+ swapped_int45 = vtbl4_u8(input_table, int_lookup45);
+ swapped_int67 = vtbl4_u8(input_table, int_lookup67);
+ vst1_u8((uint8_t*)inputPtr, swapped_int01);
+ vst1_u8((uint8_t*)(inputPtr + 2), swapped_int23);
+ vst1_u8((uint8_t*)(inputPtr + 4), swapped_int45);
+ vst1_u8((uint8_t*)(inputPtr + 6), swapped_int67);
+
+ inputPtr += 4;
}
- */
- int_lookup01 = vcreate_u8(2269495096316185);
- int_lookup23 = vcreate_u8(146949840772469531);
- int_lookup45 = vcreate_u8(291630186448622877);
- int_lookup67 = vcreate_u8(436310532124776223);
-
- for(number = 0; number < n8points; ++number){
- input_table = vld4_u8((uint8_t*) inputPtr);
- swapped_int01 = vtbl4_u8(input_table, int_lookup01);
- swapped_int23 = vtbl4_u8(input_table, int_lookup23);
- swapped_int45 = vtbl4_u8(input_table, int_lookup45);
- swapped_int67 = vtbl4_u8(input_table, int_lookup67);
- vst1_u8((uint8_t*) inputPtr, swapped_int01);
- vst1_u8((uint8_t*) (inputPtr+2), swapped_int23);
- vst1_u8((uint8_t*) (inputPtr+4), swapped_int45);
- vst1_u8((uint8_t*) (inputPtr+6), swapped_int67);
-
- inputPtr += 4;
- }
-
- for(number = n8points * 4; number < num_points; ++number){
- uint32_t output1 = *inputPtr;
- uint32_t output2 = inputPtr[1];
-
- output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
- output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
-
- *inputPtr++ = output2;
- *inputPtr++ = output1;
- }
+ for (number = n8points * 4; number < num_points; ++number) {
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
+
+ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
+ ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
+ ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+
+ *inputPtr++ = output2;
+ *inputPtr++ = output1;
+ }
}
#endif /* LV_HAVE_NEON */
#endif
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int num_points){
+static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int num_points)
+{
uint32_t* inputPtr = (uint32_t*)intsToSwap;
__m128i input, byte1, byte2, byte3, byte4, output;
__m128i byte2mask = _mm_set1_epi32(0x00FF0000);
__m128i byte3mask = _mm_set1_epi32(0x0000FF00);
uint64_t number = 0;
const unsigned int halfPoints = num_points / 2;
- for(;number < halfPoints; number++){
- // Load the 32t values, increment inputPtr later since we're doing it in-place.
- input = _mm_load_si128((__m128i*)inputPtr);
-
- // Do the four shifts
- byte1 = _mm_slli_epi32(input, 24);
- byte2 = _mm_slli_epi32(input, 8);
- byte3 = _mm_srli_epi32(input, 8);
- byte4 = _mm_srli_epi32(input, 24);
- // Or bytes together
- output = _mm_or_si128(byte1, byte4);
- byte2 = _mm_and_si128(byte2, byte2mask);
- output = _mm_or_si128(output, byte2);
- byte3 = _mm_and_si128(byte3, byte3mask);
- output = _mm_or_si128(output, byte3);
-
- // Reorder the two words
- output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
-
- // Store the results
- _mm_store_si128((__m128i*)inputPtr, output);
- inputPtr += 4;
+ for (; number < halfPoints; number++) {
+        // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
+ input = _mm_load_si128((__m128i*)inputPtr);
+
+ // Do the four shifts
+ byte1 = _mm_slli_epi32(input, 24);
+ byte2 = _mm_slli_epi32(input, 8);
+ byte3 = _mm_srli_epi32(input, 8);
+ byte4 = _mm_srli_epi32(input, 24);
+ // Or bytes together
+ output = _mm_or_si128(byte1, byte4);
+ byte2 = _mm_and_si128(byte2, byte2mask);
+ output = _mm_or_si128(output, byte2);
+ byte3 = _mm_and_si128(byte3, byte3mask);
+ output = _mm_or_si128(output, byte3);
+
+ // Reorder the two words
+ output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
+
+ // Store the results
+ _mm_store_si128((__m128i*)inputPtr, output);
+ inputPtr += 4;
}
// Byteswap any remaining points:
- number = halfPoints*2;
- for(; number < num_points; number++){
- uint32_t output1 = *inputPtr;
- uint32_t output2 = inputPtr[1];
+ number = halfPoints * 2;
+ for (; number < num_points; number++) {
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
- output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
+ ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
- output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
+ ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
- *inputPtr++ = output2;
- *inputPtr++ = output1;
+ *inputPtr++ = output2;
+ *inputPtr++ = output1;
}
}
#endif /* LV_HAVE_SSE2 */
#include <immintrin.h>
static inline void volk_64u_byteswap_u_avx2(uint64_t* intsToSwap, unsigned int num_points)
{
- unsigned int number = 0;
-
- const unsigned int nPerSet = 4;
- const uint64_t nSets = num_points / nPerSet;
-
- uint32_t* inputPtr = (uint32_t*)intsToSwap;
-
- const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
-
- const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector[0]);
-
- for ( ;number < nSets; number++ ) {
- // Load the 32t values, increment inputPtr later since we're doing it in-place.
- const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
- const __m256i output = _mm256_shuffle_epi8(input,myShuffle);
-
- // Store the results
- _mm256_storeu_si256((__m256i*)inputPtr, output);
-
- /* inputPtr is 32bit so increment twice */
- inputPtr += 2 * nPerSet;
- }
- _mm256_zeroupper();
-
- // Byteswap any remaining points:
- for(number = nSets * nPerSet; number < num_points; ++number ) {
- uint32_t output1 = *inputPtr;
- uint32_t output2 = inputPtr[1];
- uint32_t out1 = ((((output1) >> 24) & 0x000000ff) |
- (((output1) >> 8) & 0x0000ff00) |
- (((output1) << 8) & 0x00ff0000) |
- (((output1) << 24) & 0xff000000) );
-
- uint32_t out2 = ((((output2) >> 24) & 0x000000ff) |
- (((output2) >> 8) & 0x0000ff00) |
- (((output2) << 8) & 0x00ff0000) |
- (((output2) << 24) & 0xff000000) );
- *inputPtr++ = out2;
- *inputPtr++ = out1;
- }
+ unsigned int number = 0;
+
+ const unsigned int nPerSet = 4;
+ const uint64_t nSets = num_points / nPerSet;
+
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+
+ const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13,
+ 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18,
+ 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
+
+ const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
+
+ for (; number < nSets; number++) {
+        // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
+ const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
+ const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
+
+ // Store the results
+ _mm256_storeu_si256((__m256i*)inputPtr, output);
+
+ /* inputPtr is 32bit so increment twice */
+ inputPtr += 2 * nPerSet;
+ }
+ _mm256_zeroupper();
+
+ // Byteswap any remaining points:
+ for (number = nSets * nPerSet; number < num_points; ++number) {
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
+ uint32_t out1 =
+ ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
+ (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
+
+ uint32_t out2 =
+ ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
+ (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
+ *inputPtr++ = out2;
+ *inputPtr++ = out1;
+ }
}
#endif /* LV_HAVE_AVX2 */
#if LV_HAVE_SSSE3
#include <tmmintrin.h>
-static inline void volk_64u_byteswap_u_ssse3(uint64_t* intsToSwap, unsigned int num_points)
+static inline void volk_64u_byteswap_u_ssse3(uint64_t* intsToSwap,
+ unsigned int num_points)
{
- unsigned int number = 0;
-
- const unsigned int nPerSet = 2;
- const uint64_t nSets = num_points / nPerSet;
+ unsigned int number = 0;
- uint32_t* inputPtr = (uint32_t*)intsToSwap;
+ const unsigned int nPerSet = 2;
+ const uint64_t nSets = num_points / nPerSet;
- uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
- const __m128i myShuffle = _mm_loadu_si128((__m128i*) &shuffleVector);
+ uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
- for ( ;number < nSets; number++ ) {
- // Load the 32t values, increment inputPtr later since we're doing it in-place.
- const __m128i input = _mm_loadu_si128((__m128i*)inputPtr);
- const __m128i output = _mm_shuffle_epi8(input,myShuffle);
+ const __m128i myShuffle = _mm_loadu_si128((__m128i*)&shuffleVector);
- // Store the results
- _mm_storeu_si128((__m128i*)inputPtr, output);
+ for (; number < nSets; number++) {
+        // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
+ const __m128i input = _mm_loadu_si128((__m128i*)inputPtr);
+ const __m128i output = _mm_shuffle_epi8(input, myShuffle);
- /* inputPtr is 32bit so increment twice */
- inputPtr += 2 * nPerSet;
- }
+ // Store the results
+ _mm_storeu_si128((__m128i*)inputPtr, output);
- // Byteswap any remaining points:
- for(number = nSets * nPerSet; number < num_points; ++number ) {
- uint32_t output1 = *inputPtr;
- uint32_t output2 = inputPtr[1];
- uint32_t out1 = ((((output1) >> 24) & 0x000000ff) |
- (((output1) >> 8) & 0x0000ff00) |
- (((output1) << 8) & 0x00ff0000) |
- (((output1) << 24) & 0xff000000) );
+ /* inputPtr is 32bit so increment twice */
+ inputPtr += 2 * nPerSet;
+ }
- uint32_t out2 = ((((output2) >> 24) & 0x000000ff) |
- (((output2) >> 8) & 0x0000ff00) |
- (((output2) << 8) & 0x00ff0000) |
- (((output2) << 24) & 0xff000000) );
- *inputPtr++ = out2;
- *inputPtr++ = out1;
- }
+ // Byteswap any remaining points:
+ for (number = nSets * nPerSet; number < num_points; ++number) {
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
+ uint32_t out1 =
+ ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
+ (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
+
+ uint32_t out2 =
+ ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
+ (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
+ *inputPtr++ = out2;
+ *inputPtr++ = out1;
+ }
}
#endif /* LV_HAVE_SSSE3 */
#ifdef LV_HAVE_GENERIC
-static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap, unsigned int num_points){
- uint32_t* inputPtr = (uint32_t*)intsToSwap;
- unsigned int point;
- for(point = 0; point < num_points; point++){
- uint32_t output1 = *inputPtr;
- uint32_t output2 = inputPtr[1];
+static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap,
+ unsigned int num_points)
+{
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+ unsigned int point;
+ for (point = 0; point < num_points; point++) {
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
- output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
+ ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
- output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
+ ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
- *inputPtr++ = output2;
- *inputPtr++ = output1;
- }
+ *inputPtr++ = output2;
+ *inputPtr++ = output1;
+ }
}
#endif /* LV_HAVE_GENERIC */
-
-
#endif /* INCLUDED_volk_64u_byteswap_a_H */
#include <stdint.h>
-#include <volk/volk_64u_byteswap.h>
#include <string.h>
+#include <volk/volk_64u_byteswap.h>
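/*
 * Note (general VOLK convention, illustrative): the byteswap kernels operate
 * in place on a single buffer, so these puppet wrappers copy the swapped data
 * into a separate output vector to give the QA framework the two-buffer
 * signature it expects when comparing protokernel implementations.
 */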
#ifdef LV_HAVE_GENERIC
-static inline void volk_64u_byteswappuppet_64u_generic(uint64_t*output, uint64_t* intsToSwap, unsigned int num_points){
+static inline void volk_64u_byteswappuppet_64u_generic(uint64_t* output,
+ uint64_t* intsToSwap,
+ unsigned int num_points)
+{
volk_64u_byteswap_generic((uint64_t*)intsToSwap, num_points);
memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
-
}
#endif
#ifdef LV_HAVE_NEONV8
-static inline void volk_64u_byteswappuppet_64u_neonv8(uint64_t*output, uint64_t* intsToSwap, unsigned int num_points){
+static inline void volk_64u_byteswappuppet_64u_neonv8(uint64_t* output,
+ uint64_t* intsToSwap,
+ unsigned int num_points)
+{
volk_64u_byteswap_neonv8((uint64_t*)intsToSwap, num_points);
memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
-
}
#else
#ifdef LV_HAVE_NEON
-static inline void volk_64u_byteswappuppet_64u_neon(uint64_t*output, uint64_t* intsToSwap, unsigned int num_points){
+static inline void volk_64u_byteswappuppet_64u_neon(uint64_t* output,
+ uint64_t* intsToSwap,
+ unsigned int num_points)
+{
volk_64u_byteswap_neon((uint64_t*)intsToSwap, num_points);
memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
-
}
#endif
#endif
#ifdef LV_HAVE_SSE2
-static inline void volk_64u_byteswappuppet_64u_u_sse2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){
+static inline void volk_64u_byteswappuppet_64u_u_sse2(uint64_t* output,
+ uint64_t* intsToSwap,
+ unsigned int num_points)
+{
volk_64u_byteswap_u_sse2((uint64_t*)intsToSwap, num_points);
memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
-
}
#endif
#ifdef LV_HAVE_SSE2
-static inline void volk_64u_byteswappuppet_64u_a_sse2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){
+static inline void volk_64u_byteswappuppet_64u_a_sse2(uint64_t* output,
+ uint64_t* intsToSwap,
+ unsigned int num_points)
+{
volk_64u_byteswap_a_sse2((uint64_t*)intsToSwap, num_points);
memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
-
}
#endif
#ifdef LV_HAVE_SSSE3
-static inline void volk_64u_byteswappuppet_64u_u_ssse3(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){
+static inline void volk_64u_byteswappuppet_64u_u_ssse3(uint64_t* output,
+ uint64_t* intsToSwap,
+ unsigned int num_points)
+{
volk_64u_byteswap_u_ssse3((uint64_t*)intsToSwap, num_points);
memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
-
}
#endif
#ifdef LV_HAVE_SSSE3
-static inline void volk_64u_byteswappuppet_64u_a_ssse3(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){
+static inline void volk_64u_byteswappuppet_64u_a_ssse3(uint64_t* output,
+ uint64_t* intsToSwap,
+ unsigned int num_points)
+{
volk_64u_byteswap_a_ssse3((uint64_t*)intsToSwap, num_points);
memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
-
}
#endif
#ifdef LV_HAVE_AVX2
-static inline void volk_64u_byteswappuppet_64u_u_avx2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){
+static inline void volk_64u_byteswappuppet_64u_u_avx2(uint64_t* output,
+ uint64_t* intsToSwap,
+ unsigned int num_points)
+{
volk_64u_byteswap_u_avx2((uint64_t*)intsToSwap, num_points);
memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
-
}
#endif
#ifdef LV_HAVE_AVX2
-static inline void volk_64u_byteswappuppet_64u_a_avx2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){
+static inline void volk_64u_byteswappuppet_64u_a_avx2(uint64_t* output,
+ uint64_t* intsToSwap,
+ unsigned int num_points)
+{
volk_64u_byteswap_a_avx2((uint64_t*)intsToSwap, num_points);
memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
-
}
#endif
#ifndef INCLUDED_volk_64u_popcnt_a_H
#define INCLUDED_volk_64u_popcnt_a_H
-#include <stdio.h>
#include <inttypes.h>
+#include <stdio.h>
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_64u_popcnt_generic(uint64_t* ret, const uint64_t value)
+static inline void volk_64u_popcnt_generic(uint64_t* ret, const uint64_t value)
{
- //const uint32_t* valueVector = (const uint32_t*)&value;
-
- // This is faster than a lookup table
- //uint32_t retVal = valueVector[0];
- uint32_t retVal = (uint32_t)(value & 0x00000000FFFFFFFFull);
-
- retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
- retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
- retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
- retVal = (retVal + (retVal >> 8));
- retVal = (retVal + (retVal >> 16)) & 0x0000003F;
- uint64_t retVal64 = retVal;
-
- //retVal = valueVector[1];
- retVal = (uint32_t)((value & 0xFFFFFFFF00000000ull) >> 32);
- retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
- retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
- retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
- retVal = (retVal + (retVal >> 8));
- retVal = (retVal + (retVal >> 16)) & 0x0000003F;
- retVal64 += retVal;
-
- *ret = retVal64;
+ // const uint32_t* valueVector = (const uint32_t*)&value;
+
+ // This is faster than a lookup table
+ // uint32_t retVal = valueVector[0];
+ uint32_t retVal = (uint32_t)(value & 0x00000000FFFFFFFFull);
+
+ retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
+ retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
+ retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
+ retVal = (retVal + (retVal >> 8));
+ retVal = (retVal + (retVal >> 16)) & 0x0000003F;
+ uint64_t retVal64 = retVal;
+
+ // retVal = valueVector[1];
+ retVal = (uint32_t)((value & 0xFFFFFFFF00000000ull) >> 32);
+ retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
+ retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
+ retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
+ retVal = (retVal + (retVal >> 8));
+ retVal = (retVal + (retVal >> 16)) & 0x0000003F;
+ retVal64 += retVal;
+
+ *ret = retVal64;
}
#endif /*LV_HAVE_GENERIC*/
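/*
 * Minimal self-check sketch (not part of this change): the bit-twiddling above
 * is the classic SWAR population count applied to each 32-bit half.  The
 * GCC/Clang builtin serves as the reference; the test value is arbitrary.
 */
#include <assert.h>
#include <stdint.h>

static inline void example_check_popcnt_generic(void)
{
    uint64_t count = 0;
    volk_64u_popcnt_generic(&count, UINT64_C(0xF0F0F0F00F0F0F0F));
    assert(count == (uint64_t)__builtin_popcountll(UINT64_C(0xF0F0F0F00F0F0F0F)));
}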
static inline void volk_64u_popcnt_a_sse4_2(uint64_t* ret, const uint64_t value)
{
- *ret = _mm_popcnt_u64(value);
+ *ret = _mm_popcnt_u64(value);
}
#endif /*LV_HAVE_SSE4_2*/
#include <arm_neon.h>
static inline void volk_64u_popcnt_neon(uint64_t* ret, const uint64_t value)
{
- uint8x8_t input_val, count8x8_val;
- uint16x4_t count16x4_val;
- uint32x2_t count32x2_val;
- uint64x1_t count64x1_val;
-
- input_val = vld1_u8((unsigned char *) &value);
- count8x8_val = vcnt_u8(input_val);
- count16x4_val = vpaddl_u8(count8x8_val);
- count32x2_val = vpaddl_u16(count16x4_val);
- count64x1_val = vpaddl_u32(count32x2_val);
- vst1_u64(ret, count64x1_val);
-
- //*ret = _mm_popcnt_u64(value);
+ uint8x8_t input_val, count8x8_val;
+ uint16x4_t count16x4_val;
+ uint32x2_t count32x2_val;
+ uint64x1_t count64x1_val;
+
+ input_val = vld1_u8((unsigned char*)&value);
+ count8x8_val = vcnt_u8(input_val);
+ count16x4_val = vpaddl_u8(count8x8_val);
+ count32x2_val = vpaddl_u16(count16x4_val);
+ count64x1_val = vpaddl_u32(count32x2_val);
+ vst1_u64(ret, count64x1_val);
+
+ //*ret = _mm_popcnt_u64(value);
}
#endif /*LV_HAVE_NEON*/
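/*
 * Worked trace of the NEON reduction above (illustrative), for
 * value = 0xFF00000000000001 loaded on a little-endian machine:
 *
 *     vld1_u8    -> {0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF}
 *     vcnt_u8    -> {1, 0, 0, 0, 0, 0, 0, 8}   per-byte bit counts
 *     vpaddl_u8  -> {1, 0, 0, 8}               pairwise sums into 16-bit lanes
 *     vpaddl_u16 -> {1, 8}                     32-bit lanes
 *     vpaddl_u32 -> {9}                        final 64-bit population count
 */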
#ifndef INCLUDED_volk_64u_popcntpuppet_64u_H
#define INCLUDED_volk_64u_popcntpuppet_64u_H
-#include <volk/volk_64u_popcnt.h>
#include <stdint.h>
#include <string.h>
+#include <volk/volk_64u_popcnt.h>
#ifdef LV_HAVE_GENERIC
-static inline void volk_64u_popcntpuppet_64u_generic(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points){
+static inline void volk_64u_popcntpuppet_64u_generic(uint64_t* outVector,
+ const uint64_t* inVector,
+ unsigned int num_points)
+{
unsigned int ii;
- for(ii=0; ii < num_points; ++ii) {
- volk_64u_popcnt_generic(outVector+ii, num_points );
+ for (ii = 0; ii < num_points; ++ii) {
+ volk_64u_popcnt_generic(outVector + ii, num_points);
}
memcpy((void*)outVector, (void*)inVector, num_points * sizeof(uint64_t));
}
#endif /* LV_HAVE_GENERIC */
#if LV_HAVE_SSE4_2 && LV_HAVE_64
-static inline void volk_64u_popcntpuppet_64u_a_sse4_2(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points){
+static inline void volk_64u_popcntpuppet_64u_a_sse4_2(uint64_t* outVector,
+ const uint64_t* inVector,
+ unsigned int num_points)
+{
unsigned int ii;
- for(ii=0; ii < num_points; ++ii) {
- volk_64u_popcnt_a_sse4_2(outVector+ii, num_points );
+ for (ii = 0; ii < num_points; ++ii) {
+ volk_64u_popcnt_a_sse4_2(outVector + ii, num_points);
}
memcpy((void*)outVector, (void*)inVector, num_points * sizeof(uint64_t));
}
#endif /* LV_HAVE_SSE4_2 */
#ifdef LV_HAVE_NEON
-static inline void volk_64u_popcntpuppet_64u_neon(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points){
+static inline void volk_64u_popcntpuppet_64u_neon(uint64_t* outVector,
+ const uint64_t* inVector,
+ unsigned int num_points)
+{
unsigned int ii;
- for(ii=0; ii < num_points; ++ii) {
- volk_64u_popcnt_neon(outVector+ii, num_points );
+ for (ii = 0; ii < num_points; ++ii) {
+ volk_64u_popcnt_neon(outVector + ii, num_points);
}
memcpy((void*)outVector, (void*)inVector, num_points * sizeof(uint64_t));
}
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_8i_convert_16i(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points)
- * \endcode
+ * void volk_8i_convert_16i(int16_t* outputVector, const int8_t* inputVector,
+ *                          unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li inputVector: The input vector of 8-bit chars.
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_8i_convert_16i_u_avx2(int16_t* outputVector, const int8_t* inputVector,
- unsigned int num_points)
+static inline void volk_8i_convert_16i_u_avx2(int16_t* outputVector,
+ const int8_t* inputVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- const __m128i* inputVectorPtr = (const __m128i*)inputVector;
- __m256i* outputVectorPtr = (__m256i*)outputVector;
- __m128i inputVal;
- __m256i ret;
-
- for(;number < sixteenthPoints; number++){
- inputVal = _mm_loadu_si128(inputVectorPtr);
- ret = _mm256_cvtepi8_epi16(inputVal);
- ret = _mm256_slli_epi16(ret, 8); // Multiply by 256
- _mm256_storeu_si256(outputVectorPtr, ret);
-
- outputVectorPtr++;
- inputVectorPtr++;
- }
-
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- outputVector[number] = (int16_t)(inputVector[number])*256;
- }
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ const __m128i* inputVectorPtr = (const __m128i*)inputVector;
+ __m256i* outputVectorPtr = (__m256i*)outputVector;
+ __m128i inputVal;
+ __m256i ret;
+
+ for (; number < sixteenthPoints; number++) {
+ inputVal = _mm_loadu_si128(inputVectorPtr);
+ ret = _mm256_cvtepi8_epi16(inputVal);
+ ret = _mm256_slli_epi16(ret, 8); // Multiply by 256
+ _mm256_storeu_si256(outputVectorPtr, ret);
+
+ outputVectorPtr++;
+ inputVectorPtr++;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ outputVector[number] = (int16_t)(inputVector[number]) * 256;
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
-static inline void
-volk_8i_convert_16i_u_sse4_1(int16_t* outputVector, const int8_t* inputVector,
- unsigned int num_points)
+static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector,
+ const int8_t* inputVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
- const __m128i* inputVectorPtr = (const __m128i*)inputVector;
- __m128i* outputVectorPtr = (__m128i*)outputVector;
- __m128i inputVal;
- __m128i ret;
+ const __m128i* inputVectorPtr = (const __m128i*)inputVector;
+ __m128i* outputVectorPtr = (__m128i*)outputVector;
+ __m128i inputVal;
+ __m128i ret;
- for(;number < sixteenthPoints; number++){
- inputVal = _mm_loadu_si128(inputVectorPtr);
- ret = _mm_cvtepi8_epi16(inputVal);
- ret = _mm_slli_epi16(ret, 8); // Multiply by 256
- _mm_storeu_si128(outputVectorPtr, ret);
+ for (; number < sixteenthPoints; number++) {
+ inputVal = _mm_loadu_si128(inputVectorPtr);
+ ret = _mm_cvtepi8_epi16(inputVal);
+ ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+ _mm_storeu_si128(outputVectorPtr, ret);
- outputVectorPtr++;
+ outputVectorPtr++;
- inputVal = _mm_srli_si128(inputVal, 8);
- ret = _mm_cvtepi8_epi16(inputVal);
- ret = _mm_slli_epi16(ret, 8); // Multiply by 256
- _mm_storeu_si128(outputVectorPtr, ret);
+ inputVal = _mm_srli_si128(inputVal, 8);
+ ret = _mm_cvtepi8_epi16(inputVal);
+ ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+ _mm_storeu_si128(outputVectorPtr, ret);
- outputVectorPtr++;
+ outputVectorPtr++;
- inputVectorPtr++;
- }
+ inputVectorPtr++;
+ }
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- outputVector[number] = (int16_t)(inputVector[number])*256;
- }
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ outputVector[number] = (int16_t)(inputVector[number]) * 256;
+ }
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_8i_convert_16i_generic(int16_t* outputVector, const int8_t* inputVector,
- unsigned int num_points)
+static inline void volk_8i_convert_16i_generic(int16_t* outputVector,
+ const int8_t* inputVector,
+ unsigned int num_points)
{
- int16_t* outputVectorPtr = outputVector;
- const int8_t* inputVectorPtr = inputVector;
- unsigned int number = 0;
+ int16_t* outputVectorPtr = outputVector;
+ const int8_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
- }
+ for (number = 0; number < num_points; number++) {
+ *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
+ }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED8_H */
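/* Usage sketch for the unaligned 8-bit -> 16-bit conversion above (a minimal
 * example, assuming LV_HAVE_GENERIC is defined): every input byte is scaled
 * by 256, i.e. moved into the upper half of the 16-bit result. */
#include <stdint.h>

static void convert_16i_example(void)
{
    const int8_t in[4] = { -128, -1, 1, 127 };
    int16_t out[4];
    volk_8i_convert_16i_generic(out, in, 4);
    /* out now holds { -32768, -256, 256, 32512 } */
}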
-
#ifndef INCLUDED_volk_8i_convert_16i_a_H
#define INCLUDED_volk_8i_convert_16i_a_H
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_8i_convert_16i_a_avx2(int16_t* outputVector, const int8_t* inputVector,
- unsigned int num_points)
+static inline void volk_8i_convert_16i_a_avx2(int16_t* outputVector,
+ const int8_t* inputVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- const __m128i* inputVectorPtr = (const __m128i*)inputVector;
- __m256i* outputVectorPtr = (__m256i*)outputVector;
- __m128i inputVal;
- __m256i ret;
-
- for(;number < sixteenthPoints; number++){
- inputVal = _mm_load_si128(inputVectorPtr);
- ret = _mm256_cvtepi8_epi16(inputVal);
- ret = _mm256_slli_epi16(ret, 8); // Multiply by 256
- _mm256_store_si256(outputVectorPtr, ret);
-
- outputVectorPtr++;
- inputVectorPtr++;
- }
-
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- outputVector[number] = (int16_t)(inputVector[number])*256;
- }
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ const __m128i* inputVectorPtr = (const __m128i*)inputVector;
+ __m256i* outputVectorPtr = (__m256i*)outputVector;
+ __m128i inputVal;
+ __m256i ret;
+
+ for (; number < sixteenthPoints; number++) {
+ inputVal = _mm_load_si128(inputVectorPtr);
+ ret = _mm256_cvtepi8_epi16(inputVal);
+ ret = _mm256_slli_epi16(ret, 8); // Multiply by 256
+ _mm256_store_si256(outputVectorPtr, ret);
+
+ outputVectorPtr++;
+ inputVectorPtr++;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ outputVector[number] = (int16_t)(inputVector[number]) * 256;
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
-static inline void
-volk_8i_convert_16i_a_sse4_1(int16_t* outputVector, const int8_t* inputVector,
- unsigned int num_points)
+static inline void volk_8i_convert_16i_a_sse4_1(int16_t* outputVector,
+ const int8_t* inputVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
- const __m128i* inputVectorPtr = (const __m128i*)inputVector;
- __m128i* outputVectorPtr = (__m128i*)outputVector;
- __m128i inputVal;
- __m128i ret;
+ const __m128i* inputVectorPtr = (const __m128i*)inputVector;
+ __m128i* outputVectorPtr = (__m128i*)outputVector;
+ __m128i inputVal;
+ __m128i ret;
- for(;number < sixteenthPoints; number++){
- inputVal = _mm_load_si128(inputVectorPtr);
- ret = _mm_cvtepi8_epi16(inputVal);
- ret = _mm_slli_epi16(ret, 8); // Multiply by 256
- _mm_store_si128(outputVectorPtr, ret);
+ for (; number < sixteenthPoints; number++) {
+ inputVal = _mm_load_si128(inputVectorPtr);
+ ret = _mm_cvtepi8_epi16(inputVal);
+ ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+ _mm_store_si128(outputVectorPtr, ret);
- outputVectorPtr++;
+ outputVectorPtr++;
- inputVal = _mm_srli_si128(inputVal, 8);
- ret = _mm_cvtepi8_epi16(inputVal);
- ret = _mm_slli_epi16(ret, 8); // Multiply by 256
- _mm_store_si128(outputVectorPtr, ret);
+ inputVal = _mm_srli_si128(inputVal, 8);
+ ret = _mm_cvtepi8_epi16(inputVal);
+ ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+ _mm_store_si128(outputVectorPtr, ret);
- outputVectorPtr++;
+ outputVectorPtr++;
- inputVectorPtr++;
- }
+ inputVectorPtr++;
+ }
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- outputVector[number] = (int16_t)(inputVector[number])*256;
- }
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ outputVector[number] = (int16_t)(inputVector[number]) * 256;
+ }
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_8i_convert_16i_a_generic(int16_t* outputVector, const int8_t* inputVector,
- unsigned int num_points)
+static inline void volk_8i_convert_16i_a_generic(int16_t* outputVector,
+ const int8_t* inputVector,
+ unsigned int num_points)
{
- int16_t* outputVectorPtr = outputVector;
- const int8_t* inputVectorPtr = inputVector;
- unsigned int number = 0;
+ int16_t* outputVectorPtr = outputVector;
+ const int8_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
- }
+ for (number = 0; number < num_points; number++) {
+ *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_8i_convert_16i_neon(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points)
+static inline void volk_8i_convert_16i_neon(int16_t* outputVector,
+ const int8_t* inputVector,
+ unsigned int num_points)
{
- int16_t* outputVectorPtr = outputVector;
- const int8_t* inputVectorPtr = inputVector;
- unsigned int number;
- const unsigned int eighth_points = num_points / 8;
-
- int8x8_t input_vec ;
- int16x8_t converted_vec;
-
- // NEON doesn't have a concept of 8 bit registers, so we are really
- // dealing with the low half of 16-bit registers. Since this requires
- // a move instruction we likely do better with ASM here.
- for(number = 0; number < eighth_points; ++number) {
- input_vec = vld1_s8(inputVectorPtr);
- converted_vec = vmovl_s8(input_vec);
- //converted_vec = vmulq_s16(converted_vec, scale_factor);
- converted_vec = vshlq_n_s16(converted_vec, 8);
- vst1q_s16( outputVectorPtr, converted_vec);
-
- inputVectorPtr += 8;
- outputVectorPtr += 8;
- }
-
- for(number = eighth_points * 8; number < num_points; number++){
- *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
- }
+ int16_t* outputVectorPtr = outputVector;
+ const int8_t* inputVectorPtr = inputVector;
+ unsigned int number;
+ const unsigned int eighth_points = num_points / 8;
+
+ int8x8_t input_vec;
+ int16x8_t converted_vec;
+
+ // NEON doesn't have a concept of 8 bit registers, so we are really
+ // dealing with the low half of 16-bit registers. Since this requires
+ // a move instruction we likely do better with ASM here.
+ for (number = 0; number < eighth_points; ++number) {
+ input_vec = vld1_s8(inputVectorPtr);
+ converted_vec = vmovl_s8(input_vec);
+ // converted_vec = vmulq_s16(converted_vec, scale_factor);
+ converted_vec = vshlq_n_s16(converted_vec, 8);
+ vst1q_s16(outputVectorPtr, converted_vec);
+
+ inputVectorPtr += 8;
+ outputVectorPtr += 8;
+ }
+
+ for (number = eighth_points * 8; number < num_points; number++) {
+ *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_ORC
-extern void
-volk_8i_convert_16i_a_orc_impl(int16_t* outputVector, const int8_t* inputVector,
- unsigned int num_points);
+extern void volk_8i_convert_16i_a_orc_impl(int16_t* outputVector,
+ const int8_t* inputVector,
+ unsigned int num_points);
-static inline void
-volk_8i_convert_16i_u_orc(int16_t* outputVector, const int8_t* inputVector,
- unsigned int num_points)
+static inline void volk_8i_convert_16i_u_orc(int16_t* outputVector,
+ const int8_t* inputVector,
+ unsigned int num_points)
{
- volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, num_points);
+ volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, num_points);
}
#endif /* LV_HAVE_ORC */
-
#endif /* INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED8_H */
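/* Sketch of feeding the aligned (_a) kernels above, assuming volk_malloc(),
 * volk_free() and volk_get_alignment() from <volk/volk.h> behave as in
 * current VOLK: buffers come back aligned for the widest SIMD unit, so the
 * aligned load/store variants are safe to use. */
#include <stddef.h>
#include <stdint.h>
#include <volk/volk.h>

static void convert_16i_aligned_example(unsigned int num_points)
{
    const size_t alignment = volk_get_alignment();
    int8_t* in = (int8_t*)volk_malloc(num_points * sizeof(int8_t), alignment);
    int16_t* out = (int16_t*)volk_malloc(num_points * sizeof(int16_t), alignment);

    /* ... fill `in` with samples ... */
    volk_8i_convert_16i_a_generic(out, in, num_points);

    volk_free(out);
    volk_free(in);
}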
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_8i_s32f_convert_32f(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points)
- * \endcode
+ * void volk_8i_s32f_convert_32f(float* outputVector, const int8_t* inputVector, const
+ * float scalar, unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li inputVector: The input vector of 8-bit chars.
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_8i_s32f_convert_32f_u_avx2(float* outputVector, const int8_t* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_8i_s32f_convert_32f_u_avx2(float* outputVector,
+ const int8_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- float* outputVectorPtr = outputVector;
- const float iScalar = 1.0 / scalar;
- __m256 invScalar = _mm256_set1_ps( iScalar );
- const int8_t* inputVectorPtr = inputVector;
- __m256 ret;
- __m128i inputVal128;
- __m256i interimVal;
-
- for(;number < sixteenthPoints; number++){
- inputVal128 = _mm_loadu_si128((__m128i*)inputVectorPtr);
-
- interimVal = _mm256_cvtepi8_epi32(inputVal128);
- ret = _mm256_cvtepi32_ps(interimVal);
- ret = _mm256_mul_ps(ret, invScalar);
- _mm256_storeu_ps(outputVectorPtr, ret);
- outputVectorPtr += 8;
-
- inputVal128 = _mm_srli_si128(inputVal128, 8);
- interimVal = _mm256_cvtepi8_epi32(inputVal128);
- ret = _mm256_cvtepi32_ps(interimVal);
- ret = _mm256_mul_ps(ret, invScalar);
- _mm256_storeu_ps(outputVectorPtr, ret);
- outputVectorPtr += 8;
-
- inputVectorPtr += 16;
- }
-
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- outputVector[number] = (float)(inputVector[number]) * iScalar;
- }
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float* outputVectorPtr = outputVector;
+ const float iScalar = 1.0 / scalar;
+ __m256 invScalar = _mm256_set1_ps(iScalar);
+ const int8_t* inputVectorPtr = inputVector;
+ __m256 ret;
+ __m128i inputVal128;
+ __m256i interimVal;
+
+ for (; number < sixteenthPoints; number++) {
+ inputVal128 = _mm_loadu_si128((__m128i*)inputVectorPtr);
+
+ interimVal = _mm256_cvtepi8_epi32(inputVal128);
+ ret = _mm256_cvtepi32_ps(interimVal);
+ ret = _mm256_mul_ps(ret, invScalar);
+ _mm256_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 8;
+
+ inputVal128 = _mm_srli_si128(inputVal128, 8);
+ interimVal = _mm256_cvtepi8_epi32(inputVal128);
+ ret = _mm256_cvtepi32_ps(interimVal);
+ ret = _mm256_mul_ps(ret, invScalar);
+ _mm256_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 8;
+
+ inputVectorPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ outputVector[number] = (float)(inputVector[number]) * iScalar;
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
-static inline void
-volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector, const int8_t* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector,
+ const int8_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- float* outputVectorPtr = outputVector;
- const float iScalar = 1.0 / scalar;
- __m128 invScalar = _mm_set_ps1( iScalar );
- const int8_t* inputVectorPtr = inputVector;
- __m128 ret;
- __m128i inputVal;
- __m128i interimVal;
-
- for(;number < sixteenthPoints; number++){
- inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr);
-
- interimVal = _mm_cvtepi8_epi32(inputVal);
- ret = _mm_cvtepi32_ps(interimVal);
- ret = _mm_mul_ps(ret, invScalar);
- _mm_storeu_ps(outputVectorPtr, ret);
- outputVectorPtr += 4;
-
- inputVal = _mm_srli_si128(inputVal, 4);
- interimVal = _mm_cvtepi8_epi32(inputVal);
- ret = _mm_cvtepi32_ps(interimVal);
- ret = _mm_mul_ps(ret, invScalar);
- _mm_storeu_ps(outputVectorPtr, ret);
- outputVectorPtr += 4;
-
- inputVal = _mm_srli_si128(inputVal, 4);
- interimVal = _mm_cvtepi8_epi32(inputVal);
- ret = _mm_cvtepi32_ps(interimVal);
- ret = _mm_mul_ps(ret, invScalar);
- _mm_storeu_ps(outputVectorPtr, ret);
- outputVectorPtr += 4;
-
- inputVal = _mm_srli_si128(inputVal, 4);
- interimVal = _mm_cvtepi8_epi32(inputVal);
- ret = _mm_cvtepi32_ps(interimVal);
- ret = _mm_mul_ps(ret, invScalar);
- _mm_storeu_ps(outputVectorPtr, ret);
- outputVectorPtr += 4;
-
- inputVectorPtr += 16;
- }
-
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- outputVector[number] = (float)(inputVector[number]) * iScalar;
- }
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float* outputVectorPtr = outputVector;
+ const float iScalar = 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ const int8_t* inputVectorPtr = inputVector;
+ __m128 ret;
+ __m128i inputVal;
+ __m128i interimVal;
+
+ for (; number < sixteenthPoints; number++) {
+ inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr);
+
+ interimVal = _mm_cvtepi8_epi32(inputVal);
+ ret = _mm_cvtepi32_ps(interimVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ inputVal = _mm_srli_si128(inputVal, 4);
+ interimVal = _mm_cvtepi8_epi32(inputVal);
+ ret = _mm_cvtepi32_ps(interimVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ inputVal = _mm_srli_si128(inputVal, 4);
+ interimVal = _mm_cvtepi8_epi32(inputVal);
+ ret = _mm_cvtepi32_ps(interimVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ inputVal = _mm_srli_si128(inputVal, 4);
+ interimVal = _mm_cvtepi8_epi32(inputVal);
+ ret = _mm_cvtepi32_ps(interimVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ inputVectorPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ outputVector[number] = (float)(inputVector[number]) * iScalar;
+ }
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_8i_s32f_convert_32f_generic(float* outputVector, const int8_t* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_8i_s32f_convert_32f_generic(float* outputVector,
+ const int8_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- float* outputVectorPtr = outputVector;
- const int8_t* inputVectorPtr = inputVector;
- unsigned int number = 0;
- const float iScalar = 1.0 / scalar;
-
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
- }
+ float* outputVectorPtr = outputVector;
+ const int8_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ const float iScalar = 1.0 / scalar;
+
+ for (number = 0; number < num_points; number++) {
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+ }
}
#endif /* LV_HAVE_GENERIC */
-
#endif /* INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED8_H */
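/* Usage sketch for the int8 -> float conversion above (a minimal example,
 * assuming LV_HAVE_GENERIC is defined): each sample is multiplied by
 * 1.0/scalar, so scalar = 128.0f maps full-scale int8 roughly onto [-1, 1). */
#include <stdint.h>

static void convert_32f_example(void)
{
    const int8_t in[4] = { -128, -64, 64, 127 };
    float out[4];
    volk_8i_s32f_convert_32f_generic(out, in, 128.0f, 4);
    /* out now holds { -1.0f, -0.5f, 0.5f, 0.9921875f } */
}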
#ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_8i_s32f_convert_32f_a_avx2(float* outputVector, const int8_t* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_8i_s32f_convert_32f_a_avx2(float* outputVector,
+ const int8_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- float* outputVectorPtr = outputVector;
- const float iScalar = 1.0 / scalar;
- __m256 invScalar = _mm256_set1_ps( iScalar );
- const int8_t* inputVectorPtr = inputVector;
- __m256 ret;
- __m128i inputVal128;
- __m256i interimVal;
-
- for(;number < sixteenthPoints; number++){
- inputVal128 = _mm_load_si128((__m128i*)inputVectorPtr);
-
- interimVal = _mm256_cvtepi8_epi32(inputVal128);
- ret = _mm256_cvtepi32_ps(interimVal);
- ret = _mm256_mul_ps(ret, invScalar);
- _mm256_store_ps(outputVectorPtr, ret);
- outputVectorPtr += 8;
-
- inputVal128 = _mm_srli_si128(inputVal128, 8);
- interimVal = _mm256_cvtepi8_epi32(inputVal128);
- ret = _mm256_cvtepi32_ps(interimVal);
- ret = _mm256_mul_ps(ret, invScalar);
- _mm256_store_ps(outputVectorPtr, ret);
- outputVectorPtr += 8;
-
- inputVectorPtr += 16;
- }
-
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- outputVector[number] = (float)(inputVector[number]) * iScalar;
- }
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float* outputVectorPtr = outputVector;
+ const float iScalar = 1.0 / scalar;
+ __m256 invScalar = _mm256_set1_ps(iScalar);
+ const int8_t* inputVectorPtr = inputVector;
+ __m256 ret;
+ __m128i inputVal128;
+ __m256i interimVal;
+
+ for (; number < sixteenthPoints; number++) {
+ inputVal128 = _mm_load_si128((__m128i*)inputVectorPtr);
+
+ interimVal = _mm256_cvtepi8_epi32(inputVal128);
+ ret = _mm256_cvtepi32_ps(interimVal);
+ ret = _mm256_mul_ps(ret, invScalar);
+ _mm256_store_ps(outputVectorPtr, ret);
+ outputVectorPtr += 8;
+
+ inputVal128 = _mm_srli_si128(inputVal128, 8);
+ interimVal = _mm256_cvtepi8_epi32(inputVal128);
+ ret = _mm256_cvtepi32_ps(interimVal);
+ ret = _mm256_mul_ps(ret, invScalar);
+ _mm256_store_ps(outputVectorPtr, ret);
+ outputVectorPtr += 8;
+
+ inputVectorPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ outputVector[number] = (float)(inputVector[number]) * iScalar;
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
-static inline void
-volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector, const int8_t* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector,
+ const int8_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- float* outputVectorPtr = outputVector;
- const float iScalar = 1.0 / scalar;
- __m128 invScalar = _mm_set_ps1(iScalar);
- const int8_t* inputVectorPtr = inputVector;
- __m128 ret;
- __m128i inputVal;
- __m128i interimVal;
-
- for(;number < sixteenthPoints; number++){
- inputVal = _mm_load_si128((__m128i*)inputVectorPtr);
-
- interimVal = _mm_cvtepi8_epi32(inputVal);
- ret = _mm_cvtepi32_ps(interimVal);
- ret = _mm_mul_ps(ret, invScalar);
- _mm_store_ps(outputVectorPtr, ret);
- outputVectorPtr += 4;
-
- inputVal = _mm_srli_si128(inputVal, 4);
- interimVal = _mm_cvtepi8_epi32(inputVal);
- ret = _mm_cvtepi32_ps(interimVal);
- ret = _mm_mul_ps(ret, invScalar);
- _mm_store_ps(outputVectorPtr, ret);
- outputVectorPtr += 4;
-
- inputVal = _mm_srli_si128(inputVal, 4);
- interimVal = _mm_cvtepi8_epi32(inputVal);
- ret = _mm_cvtepi32_ps(interimVal);
- ret = _mm_mul_ps(ret, invScalar);
- _mm_store_ps(outputVectorPtr, ret);
- outputVectorPtr += 4;
-
- inputVal = _mm_srli_si128(inputVal, 4);
- interimVal = _mm_cvtepi8_epi32(inputVal);
- ret = _mm_cvtepi32_ps(interimVal);
- ret = _mm_mul_ps(ret, invScalar);
- _mm_store_ps(outputVectorPtr, ret);
- outputVectorPtr += 4;
-
- inputVectorPtr += 16;
- }
-
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- outputVector[number] = (float)(inputVector[number]) * iScalar;
- }
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float* outputVectorPtr = outputVector;
+ const float iScalar = 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ const int8_t* inputVectorPtr = inputVector;
+ __m128 ret;
+ __m128i inputVal;
+ __m128i interimVal;
+
+ for (; number < sixteenthPoints; number++) {
+ inputVal = _mm_load_si128((__m128i*)inputVectorPtr);
+
+ interimVal = _mm_cvtepi8_epi32(inputVal);
+ ret = _mm_cvtepi32_ps(interimVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_store_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ inputVal = _mm_srli_si128(inputVal, 4);
+ interimVal = _mm_cvtepi8_epi32(inputVal);
+ ret = _mm_cvtepi32_ps(interimVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_store_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ inputVal = _mm_srli_si128(inputVal, 4);
+ interimVal = _mm_cvtepi8_epi32(inputVal);
+ ret = _mm_cvtepi32_ps(interimVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_store_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ inputVal = _mm_srli_si128(inputVal, 4);
+ interimVal = _mm_cvtepi8_epi32(inputVal);
+ ret = _mm_cvtepi32_ps(interimVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_store_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ inputVectorPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ outputVector[number] = (float)(inputVector[number]) * iScalar;
+ }
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_8i_s32f_convert_32f_neon(float* outputVector, const int8_t* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_8i_s32f_convert_32f_neon(float* outputVector,
+ const int8_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- float* outputVectorPtr = outputVector;
- const int8_t* inputVectorPtr = inputVector;
-
- const float iScalar = 1.0 / scalar;
- const float32x4_t qiScalar = vdupq_n_f32(iScalar);
-
- int8x8x2_t inputVal;
- float32x4x2_t outputFloat;
- int16x8_t tmp;
-
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
- for(;number < sixteenthPoints; number++){
- __VOLK_PREFETCH(inputVectorPtr+16);
-
- inputVal = vld2_s8(inputVectorPtr);
- inputVal = vzip_s8(inputVal.val[0], inputVal.val[1]);
- inputVectorPtr += 16;
-
- tmp = vmovl_s8(inputVal.val[0]);
-
- outputFloat.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp)));
- outputFloat.val[0] = vmulq_f32(outputFloat.val[0], qiScalar);
- vst1q_f32(outputVectorPtr, outputFloat.val[0]);
- outputVectorPtr += 4;
-
- outputFloat.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp)));
- outputFloat.val[1] = vmulq_f32(outputFloat.val[1], qiScalar);
- vst1q_f32(outputVectorPtr, outputFloat.val[1]);
- outputVectorPtr += 4;
-
- tmp = vmovl_s8(inputVal.val[1]);
-
- outputFloat.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp)));
- outputFloat.val[0] = vmulq_f32(outputFloat.val[0], qiScalar);
- vst1q_f32(outputVectorPtr, outputFloat.val[0]);
- outputVectorPtr += 4;
-
- outputFloat.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp)));
- outputFloat.val[1] = vmulq_f32(outputFloat.val[1], qiScalar);
- vst1q_f32(outputVectorPtr, outputFloat.val[1]);
- outputVectorPtr += 4;
- }
- for(number = sixteenthPoints * 16; number < num_points; number++){
- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
- }
+ float* outputVectorPtr = outputVector;
+ const int8_t* inputVectorPtr = inputVector;
+
+ const float iScalar = 1.0 / scalar;
+ const float32x4_t qiScalar = vdupq_n_f32(iScalar);
+
+ int8x8x2_t inputVal;
+ float32x4x2_t outputFloat;
+ int16x8_t tmp;
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+ for (; number < sixteenthPoints; number++) {
+ __VOLK_PREFETCH(inputVectorPtr + 16);
+
+ inputVal = vld2_s8(inputVectorPtr);
+ inputVal = vzip_s8(inputVal.val[0], inputVal.val[1]);
+ inputVectorPtr += 16;
+
+ tmp = vmovl_s8(inputVal.val[0]);
+
+ outputFloat.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp)));
+ outputFloat.val[0] = vmulq_f32(outputFloat.val[0], qiScalar);
+ vst1q_f32(outputVectorPtr, outputFloat.val[0]);
+ outputVectorPtr += 4;
+
+ outputFloat.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp)));
+ outputFloat.val[1] = vmulq_f32(outputFloat.val[1], qiScalar);
+ vst1q_f32(outputVectorPtr, outputFloat.val[1]);
+ outputVectorPtr += 4;
+
+ tmp = vmovl_s8(inputVal.val[1]);
+
+ outputFloat.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp)));
+ outputFloat.val[0] = vmulq_f32(outputFloat.val[0], qiScalar);
+ vst1q_f32(outputVectorPtr, outputFloat.val[0]);
+ outputVectorPtr += 4;
+
+ outputFloat.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp)));
+ outputFloat.val[1] = vmulq_f32(outputFloat.val[1], qiScalar);
+ vst1q_f32(outputVectorPtr, outputFloat.val[1]);
+ outputVectorPtr += 4;
+ }
+ for (number = sixteenthPoints * 16; number < num_points; number++) {
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+ }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_8i_s32f_convert_32f_a_generic(float* outputVector, const int8_t* inputVector,
- const float scalar, unsigned int num_points)
+static inline void volk_8i_s32f_convert_32f_a_generic(float* outputVector,
+ const int8_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- float* outputVectorPtr = outputVector;
- const int8_t* inputVectorPtr = inputVector;
- unsigned int number = 0;
- const float iScalar = 1.0 / scalar;
-
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
- }
+ float* outputVectorPtr = outputVector;
+ const int8_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ const float iScalar = 1.0 / scalar;
+
+ for (number = 0; number < num_points; number++) {
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_ORC
-extern void
-volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector, const int8_t* inputVector,
- const float scalar, unsigned int num_points);
-
-static inline void
-volk_8i_s32f_convert_32f_u_orc(float* outputVector, const int8_t* inputVector,
- const float scalar, unsigned int num_points)
+extern void volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector,
+ const int8_t* inputVector,
+ const float scalar,
+ unsigned int num_points);
+
+static inline void volk_8i_s32f_convert_32f_u_orc(float* outputVector,
+ const int8_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
{
- float invscalar = 1.0 / scalar;
- volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, invscalar, num_points);
+ float invscalar = 1.0 / scalar;
+ volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, invscalar, num_points);
}
#endif /* LV_HAVE_ORC */
-
#endif /* INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED8_H */
-
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_8ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points)
- * \endcode
+ * void volk_8ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t*
+ * complexVector, unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li complexVector: The complex input vector.
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_8ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer, int16_t* qBuffer,
- const lv_8sc_t* complexVector, unsigned int num_points)
+static inline void volk_8ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer,
+ int16_t* qBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const int8_t* complexVectorPtr = (int8_t*)complexVector;
- int16_t* iBufferPtr = iBuffer;
- int16_t* qBufferPtr = qBuffer;
- __m256i MoveMask = _mm256_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
- __m256i complexVal, iOutputVal, qOutputVal;
- __m128i iOutputVal0, qOutputVal0;
-
- unsigned int sixteenthPoints = num_points / 16;
-
- for(number = 0; number < sixteenthPoints; number++){
- complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
-
- complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
- complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
-
- iOutputVal0 = _mm256_extracti128_si256(complexVal, 0);
- qOutputVal0 = _mm256_extracti128_si256(complexVal, 1);
-
- iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0);
- iOutputVal = _mm256_slli_epi16(iOutputVal, 8);
-
- qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0);
- qOutputVal = _mm256_slli_epi16(qOutputVal, 8);
-
- _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
- _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
-
- iBufferPtr += 16;
- qBufferPtr += 16;
- }
-
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
- *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
- }
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+ __m256i MoveMask = _mm256_set_epi8(15,
+ 13,
+ 11,
+ 9,
+ 7,
+ 5,
+ 3,
+ 1,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0,
+ 15,
+ 13,
+ 11,
+ 9,
+ 7,
+ 5,
+ 3,
+ 1,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0);
+ __m256i complexVal, iOutputVal, qOutputVal;
+ __m128i iOutputVal0, qOutputVal0;
+
+ unsigned int sixteenthPoints = num_points / 16;
+
+ for (number = 0; number < sixteenthPoints; number++) {
+ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+
+ complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
+ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
+
+ iOutputVal0 = _mm256_extracti128_si256(complexVal, 0);
+ qOutputVal0 = _mm256_extracti128_si256(complexVal, 1);
+
+ iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0);
+ iOutputVal = _mm256_slli_epi16(iOutputVal, 8);
+
+ qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0);
+ qOutputVal = _mm256_slli_epi16(qOutputVal, 8);
+
+ _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
+ _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
+
+ iBufferPtr += 16;
+ qBufferPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ =
+ ((int16_t)*complexVectorPtr++) *
+            256; // load 8-bit complexVector into 16-bit, shift left by 8 bits and store
+ *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
-static inline void
-volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, int16_t* qBuffer,
- const lv_8sc_t* complexVector, unsigned int num_points)
+static inline void volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer,
+ int16_t* qBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const int8_t* complexVectorPtr = (int8_t*)complexVector;
- int16_t* iBufferPtr = iBuffer;
- int16_t* qBufferPtr = qBuffer;
- __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); // set 16 byte values
- __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
- __m128i complexVal, iOutputVal, qOutputVal;
-
- unsigned int eighthPoints = num_points / 8;
-
- for(number = 0; number < eighthPoints; number++){
- complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; // aligned load
-
- iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask); // shuffle 16 bytes of 128bit complexVal
- qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask);
-
- iOutputVal = _mm_cvtepi8_epi16(iOutputVal); // fills 2-byte sign extended versions of lower 8 bytes of input to output
- iOutputVal = _mm_slli_epi16(iOutputVal, 8); // shift in left by 8 bits, each of the 8 16-bit integers, shift in with zeros
-
- qOutputVal = _mm_cvtepi8_epi16(qOutputVal);
- qOutputVal = _mm_slli_epi16(qOutputVal, 8);
-
- _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); // aligned store
- _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
-
- iBufferPtr += 8;
- qBufferPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
- *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
- }
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+ __m128i iMoveMask = _mm_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0); // set 16 byte values
+ __m128i qMoveMask = _mm_set_epi8(
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+ __m128i complexVal, iOutputVal, qOutputVal;
+
+ unsigned int eighthPoints = num_points / 8;
+
+ for (number = 0; number < eighthPoints; number++) {
+ complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 16; // aligned load
+
+ iOutputVal = _mm_shuffle_epi8(complexVal,
+ iMoveMask); // shuffle 16 bytes of 128bit complexVal
+ qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask);
+
+ iOutputVal = _mm_cvtepi8_epi16(iOutputVal); // fills 2-byte sign extended versions
+ // of lower 8 bytes of input to output
+ iOutputVal =
+ _mm_slli_epi16(iOutputVal, 8); // shift in left by 8 bits, each of the 8
+ // 16-bit integers, shift in with zeros
+
+ qOutputVal = _mm_cvtepi8_epi16(qOutputVal);
+ qOutputVal = _mm_slli_epi16(qOutputVal, 8);
+
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); // aligned store
+ _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ =
+ ((int16_t)*complexVectorPtr++) *
+            256; // load 8-bit complexVector into 16-bit, shift left by 8 bits and store
+ *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
+ }
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer, int16_t* qBuffer,
- const lv_8sc_t* complexVector, unsigned int num_points)
+static inline void volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer,
+ int16_t* qBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const int8_t* complexVectorPtr = (int8_t*)complexVector;
- int16_t* iBufferPtr = iBuffer;
- int16_t* qBufferPtr = qBuffer;
- __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); // set 16 byte values
- __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
- __m256i complexVal, iOutputVal, qOutputVal;
- __m128i complexVal1, complexVal0;
- __m128i iOutputVal1, iOutputVal0, qOutputVal1, qOutputVal0;
-
- unsigned int sixteenthPoints = num_points / 16;
-
- for(number = 0; number < sixteenthPoints; number++){
- complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; // aligned load
-
- // Extract from complexVal to iOutputVal and qOutputVal
- complexVal1 = _mm256_extractf128_si256(complexVal, 1);
- complexVal0 = _mm256_extractf128_si256(complexVal, 0);
-
- iOutputVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask); // shuffle 16 bytes of 128bit complexVal
- iOutputVal0 = _mm_shuffle_epi8(complexVal0, iMoveMask);
- qOutputVal1 = _mm_shuffle_epi8(complexVal1, qMoveMask);
- qOutputVal0 = _mm_shuffle_epi8(complexVal0, qMoveMask);
-
- iOutputVal1 = _mm_cvtepi8_epi16(iOutputVal1); // fills 2-byte sign extended versions of lower 8 bytes of input to output
- iOutputVal1 = _mm_slli_epi16(iOutputVal1, 8); // shift in left by 8 bits, each of the 8 16-bit integers, shift in with zeros
- iOutputVal0 = _mm_cvtepi8_epi16(iOutputVal0);
- iOutputVal0 = _mm_slli_epi16(iOutputVal0, 8);
-
- qOutputVal1 = _mm_cvtepi8_epi16(qOutputVal1);
- qOutputVal1 = _mm_slli_epi16(qOutputVal1, 8);
- qOutputVal0 = _mm_cvtepi8_epi16(qOutputVal0);
- qOutputVal0 = _mm_slli_epi16(qOutputVal0, 8);
-
- // Pack iOutputVal0,1 to iOutputVal
- __m256i dummy = _mm256_setzero_si256();
- iOutputVal = _mm256_insertf128_si256(dummy, iOutputVal0, 0);
- iOutputVal = _mm256_insertf128_si256(iOutputVal, iOutputVal1, 1);
- qOutputVal = _mm256_insertf128_si256(dummy, qOutputVal0, 0);
- qOutputVal = _mm256_insertf128_si256(qOutputVal, qOutputVal1, 1);
-
- _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); // aligned store
- _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
-
- iBufferPtr += 16;
- qBufferPtr += 16;
- }
-
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
- *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
- }
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+ __m128i iMoveMask = _mm_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0); // set 16 byte values
+ __m128i qMoveMask = _mm_set_epi8(
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+ __m256i complexVal, iOutputVal, qOutputVal;
+ __m128i complexVal1, complexVal0;
+ __m128i iOutputVal1, iOutputVal0, qOutputVal1, qOutputVal0;
+
+ unsigned int sixteenthPoints = num_points / 16;
+
+ for (number = 0; number < sixteenthPoints; number++) {
+ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32; // aligned load
+
+ // Extract from complexVal to iOutputVal and qOutputVal
+ complexVal1 = _mm256_extractf128_si256(complexVal, 1);
+ complexVal0 = _mm256_extractf128_si256(complexVal, 0);
+
+ iOutputVal1 = _mm_shuffle_epi8(
+ complexVal1, iMoveMask); // shuffle 16 bytes of 128bit complexVal
+ iOutputVal0 = _mm_shuffle_epi8(complexVal0, iMoveMask);
+ qOutputVal1 = _mm_shuffle_epi8(complexVal1, qMoveMask);
+ qOutputVal0 = _mm_shuffle_epi8(complexVal0, qMoveMask);
+
+ iOutputVal1 =
+ _mm_cvtepi8_epi16(iOutputVal1); // fills 2-byte sign extended versions of
+ // lower 8 bytes of input to output
+ iOutputVal1 =
+ _mm_slli_epi16(iOutputVal1, 8); // shift in left by 8 bits, each of the 8
+ // 16-bit integers, shift in with zeros
+ iOutputVal0 = _mm_cvtepi8_epi16(iOutputVal0);
+ iOutputVal0 = _mm_slli_epi16(iOutputVal0, 8);
+
+ qOutputVal1 = _mm_cvtepi8_epi16(qOutputVal1);
+ qOutputVal1 = _mm_slli_epi16(qOutputVal1, 8);
+ qOutputVal0 = _mm_cvtepi8_epi16(qOutputVal0);
+ qOutputVal0 = _mm_slli_epi16(qOutputVal0, 8);
+
+ // Pack iOutputVal0,1 to iOutputVal
+ __m256i dummy = _mm256_setzero_si256();
+ iOutputVal = _mm256_insertf128_si256(dummy, iOutputVal0, 0);
+ iOutputVal = _mm256_insertf128_si256(iOutputVal, iOutputVal1, 1);
+ qOutputVal = _mm256_insertf128_si256(dummy, qOutputVal0, 0);
+ qOutputVal = _mm256_insertf128_si256(qOutputVal, qOutputVal1, 1);
+
+ _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); // aligned store
+ _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
+
+ iBufferPtr += 16;
+ qBufferPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ =
+ ((int16_t)*complexVectorPtr++) *
+            256; // load 8-bit complexVector into 16-bit, shift left by 8 bits and store
+ *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer, int16_t* qBuffer,
- const lv_8sc_t* complexVector, unsigned int num_points)
+static inline void volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer,
+ int16_t* qBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
{
- const int8_t* complexVectorPtr = (const int8_t*)complexVector;
- int16_t* iBufferPtr = iBuffer;
- int16_t* qBufferPtr = qBuffer;
- unsigned int number;
- for(number = 0; number < num_points; number++){
- *iBufferPtr++ = (int16_t)(*complexVectorPtr++)*256;
- *qBufferPtr++ = (int16_t)(*complexVectorPtr++)*256;
- }
+ const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+ unsigned int number;
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = (int16_t)(*complexVectorPtr++) * 256;
+ *qBufferPtr++ = (int16_t)(*complexVectorPtr++) * 256;
+ }
}
#endif /* LV_HAVE_GENERIC */
-
#endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_a_H */
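/* Usage sketch for the 8-bit complex deinterleaver above (a minimal example,
 * assuming LV_HAVE_GENERIC is defined and lv_8sc_t from
 * <volk/volk_complex.h> is laid out as interleaved I/Q bytes, which is what
 * the kernels above rely on when they cast to int8_t*). */
#include <stdint.h>
#include <volk/volk_complex.h>

static void deinterleave_16i_x2_example(void)
{
    const int8_t interleaved[8] = { 1, 2, 3, 4, 5, 6, 7, 8 }; /* I0,Q0,I1,Q1,... */
    int16_t i_out[4];
    int16_t q_out[4];
    volk_8ic_deinterleave_16i_x2_generic(
        i_out, q_out, (const lv_8sc_t*)interleaved, 4);
    /* i_out = { 256, 768, 1280, 1792 }, q_out = { 512, 1024, 1536, 2048 } */
}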
#ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_u_H
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_8ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer, int16_t* qBuffer,
- const lv_8sc_t* complexVector, unsigned int num_points)
+static inline void volk_8ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer,
+ int16_t* qBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const int8_t* complexVectorPtr = (int8_t*)complexVector;
- int16_t* iBufferPtr = iBuffer;
- int16_t* qBufferPtr = qBuffer;
- __m256i MoveMask = _mm256_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
- __m256i complexVal, iOutputVal, qOutputVal;
- __m128i iOutputVal0, qOutputVal0;
-
- unsigned int sixteenthPoints = num_points / 16;
-
- for(number = 0; number < sixteenthPoints; number++){
- complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
-
- complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
- complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
-
- iOutputVal0 = _mm256_extracti128_si256(complexVal, 0);
- qOutputVal0 = _mm256_extracti128_si256(complexVal, 1);
-
- iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0);
- iOutputVal = _mm256_slli_epi16(iOutputVal, 8);
-
- qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0);
- qOutputVal = _mm256_slli_epi16(qOutputVal, 8);
-
- _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
- _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal);
-
- iBufferPtr += 16;
- qBufferPtr += 16;
- }
-
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
- *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
- }
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+ __m256i MoveMask = _mm256_set_epi8(15,
+ 13,
+ 11,
+ 9,
+ 7,
+ 5,
+ 3,
+ 1,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0,
+ 15,
+ 13,
+ 11,
+ 9,
+ 7,
+ 5,
+ 3,
+ 1,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0);
+ __m256i complexVal, iOutputVal, qOutputVal;
+ __m128i iOutputVal0, qOutputVal0;
+
+ unsigned int sixteenthPoints = num_points / 16;
+
+ for (number = 0; number < sixteenthPoints; number++) {
+ complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+
+ complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
+ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
+
+ iOutputVal0 = _mm256_extracti128_si256(complexVal, 0);
+ qOutputVal0 = _mm256_extracti128_si256(complexVal, 1);
+
+ iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0);
+ iOutputVal = _mm256_slli_epi16(iOutputVal, 8);
+
+ qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0);
+ qOutputVal = _mm256_slli_epi16(qOutputVal, 8);
+
+ _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
+ _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal);
+
+ iBufferPtr += 16;
+ qBufferPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ =
+ ((int16_t)*complexVectorPtr++) *
+            256; // load 8-bit complexVector into 16-bit, shift left by 8 bits and store
+ *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
+ }
}
#endif /* LV_HAVE_AVX2 */
#endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_u_H */
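/* Scalar sketch of what the iMoveMask/qMoveMask shuffles above compute,
 * mirroring _mm_shuffle_epi8 semantics: each output byte is either zero
 * (mask byte with bit 7 set, the 0x80 entries) or the input byte selected by
 * the low four mask bits, which is how the even (I) and odd (Q) samples get
 * gathered before sign extension. */
#include <stdint.h>

static void shuffle_epi8_scalar(int8_t out[16],
                                const int8_t in[16],
                                const uint8_t mask[16])
{
    int k;
    for (k = 0; k < 16; k++) {
        out[k] = (mask[k] & 0x80) ? 0 : in[mask[k] & 0x0F];
    }
}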
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_8ic_deinterleave_real_16i(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points)
- * \endcode
+ * void volk_8ic_deinterleave_real_16i(int16_t* iBuffer, const lv_8sc_t* complexVector,
+ * unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li complexVector: The complex input vector.
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_8ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer, const lv_8sc_t* complexVector,
- unsigned int num_points)
+static inline void volk_8ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const int8_t* complexVectorPtr = (int8_t*)complexVector;
- int16_t* iBufferPtr = iBuffer;
- __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
- __m256i complexVal, outputVal;
- __m128i outputVal0;
-
- unsigned int sixteenthPoints = num_points / 16;
-
- for(number = 0; number < sixteenthPoints; number++){
- complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
-
- complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
- complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
-
- outputVal0 = _mm256_extractf128_si256(complexVal, 0);
-
- outputVal = _mm256_cvtepi8_epi16(outputVal0);
- outputVal = _mm256_slli_epi16(outputVal, 7);
-
- _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
-
- iBufferPtr += 16;
- }
-
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
- complexVectorPtr++;
- }
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ __m256i moveMask = _mm256_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0);
+ __m256i complexVal, outputVal;
+ __m128i outputVal0;
+
+ unsigned int sixteenthPoints = num_points / 16;
+
+ for (number = 0; number < sixteenthPoints; number++) {
+ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+
+ complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
+ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
+
+ outputVal0 = _mm256_extractf128_si256(complexVal, 0);
+
+ outputVal = _mm256_cvtepi8_epi16(outputVal0);
+ outputVal = _mm256_slli_epi16(outputVal, 7);
+
+ _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
+
+ iBufferPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
-static inline void
-volk_8ic_deinterleave_real_16i_a_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector,
- unsigned int num_points)
+static inline void volk_8ic_deinterleave_real_16i_a_sse4_1(int16_t* iBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const int8_t* complexVectorPtr = (int8_t*)complexVector;
- int16_t* iBufferPtr = iBuffer;
- __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
- __m128i complexVal, outputVal;
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ __m128i moveMask = _mm_set_epi8(
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+ __m128i complexVal, outputVal;
- unsigned int eighthPoints = num_points / 8;
+ unsigned int eighthPoints = num_points / 8;
- for(number = 0; number < eighthPoints; number++){
- complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+ for (number = 0; number < eighthPoints; number++) {
+ complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 16;
- complexVal = _mm_shuffle_epi8(complexVal, moveMask);
+ complexVal = _mm_shuffle_epi8(complexVal, moveMask);
- outputVal = _mm_cvtepi8_epi16(complexVal);
- outputVal = _mm_slli_epi16(outputVal, 7);
+ outputVal = _mm_cvtepi8_epi16(complexVal);
+ outputVal = _mm_slli_epi16(outputVal, 7);
- _mm_store_si128((__m128i*)iBufferPtr, outputVal);
- iBufferPtr += 8;
- }
+ _mm_store_si128((__m128i*)iBufferPtr, outputVal);
+ iBufferPtr += 8;
+ }
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
- complexVectorPtr++;
- }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_8ic_deinterleave_real_16i_a_avx(int16_t* iBuffer, const lv_8sc_t* complexVector,
- unsigned int num_points)
+static inline void volk_8ic_deinterleave_real_16i_a_avx(int16_t* iBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const int8_t* complexVectorPtr = (int8_t*)complexVector;
- int16_t* iBufferPtr = iBuffer;
- __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
- __m256i complexVal, outputVal;
- __m128i complexVal1, complexVal0, outputVal1, outputVal0;
-
- unsigned int sixteenthPoints = num_points / 16;
-
- for(number = 0; number < sixteenthPoints; number++){
- complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
-
- complexVal1 = _mm256_extractf128_si256(complexVal, 1);
- complexVal0 = _mm256_extractf128_si256(complexVal, 0);
-
- outputVal1 = _mm_shuffle_epi8(complexVal1, moveMask);
- outputVal0 = _mm_shuffle_epi8(complexVal0, moveMask);
-
- outputVal1 = _mm_cvtepi8_epi16(outputVal1);
- outputVal1 = _mm_slli_epi16(outputVal1, 7);
- outputVal0 = _mm_cvtepi8_epi16(outputVal0);
- outputVal0 = _mm_slli_epi16(outputVal0, 7);
-
- __m256i dummy = _mm256_setzero_si256();
- outputVal = _mm256_insertf128_si256(dummy, outputVal0, 0);
- outputVal = _mm256_insertf128_si256(outputVal, outputVal1, 1);
- _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
-
- iBufferPtr += 16;
- }
-
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
- complexVectorPtr++;
- }
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ __m128i moveMask = _mm_set_epi8(
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+ __m256i complexVal, outputVal;
+ __m128i complexVal1, complexVal0, outputVal1, outputVal0;
+
+ unsigned int sixteenthPoints = num_points / 16;
+
+ for (number = 0; number < sixteenthPoints; number++) {
+ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+
+ complexVal1 = _mm256_extractf128_si256(complexVal, 1);
+ complexVal0 = _mm256_extractf128_si256(complexVal, 0);
+
+ outputVal1 = _mm_shuffle_epi8(complexVal1, moveMask);
+ outputVal0 = _mm_shuffle_epi8(complexVal0, moveMask);
+
+ outputVal1 = _mm_cvtepi8_epi16(outputVal1);
+ outputVal1 = _mm_slli_epi16(outputVal1, 7);
+ outputVal0 = _mm_cvtepi8_epi16(outputVal0);
+ outputVal0 = _mm_slli_epi16(outputVal0, 7);
+
+ __m256i dummy = _mm256_setzero_si256();
+ outputVal = _mm256_insertf128_si256(dummy, outputVal0, 0);
+ outputVal = _mm256_insertf128_si256(outputVal, outputVal1, 1);
+ _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
+
+ iBufferPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_8ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_8sc_t* complexVector,
- unsigned int num_points)
+static inline void volk_8ic_deinterleave_real_16i_generic(int16_t* iBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const int8_t* complexVectorPtr = (const int8_t*)complexVector;
- int16_t* iBufferPtr = iBuffer;
- for(number = 0; number < num_points; number++){
- *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128;
- complexVectorPtr++;
- }
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_8ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer, const lv_8sc_t* complexVector,
- unsigned int num_points)
+static inline void volk_8ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const int8_t* complexVectorPtr = (int8_t*)complexVector;
- int16_t* iBufferPtr = iBuffer;
- __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
- __m256i complexVal, outputVal;
- __m128i outputVal0;
-
- unsigned int sixteenthPoints = num_points / 16;
-
- for(number = 0; number < sixteenthPoints; number++){
- complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
-
- complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
- complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
-
- outputVal0 = _mm256_extractf128_si256(complexVal, 0);
-
- outputVal = _mm256_cvtepi8_epi16(outputVal0);
- outputVal = _mm256_slli_epi16(outputVal, 7);
-
- _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal);
-
- iBufferPtr += 16;
- }
-
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
- complexVectorPtr++;
- }
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ __m256i moveMask = _mm256_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0);
+ __m256i complexVal, outputVal;
+ __m128i outputVal0;
+
+ unsigned int sixteenthPoints = num_points / 16;
+
+ for (number = 0; number < sixteenthPoints; number++) {
+ complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+
+ complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
+ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
+
+ outputVal0 = _mm256_extractf128_si256(complexVal, 0);
+
+ outputVal = _mm256_cvtepi8_epi16(outputVal0);
+ outputVal = _mm256_slli_epi16(outputVal, 7);
+
+ _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal);
+
+ iBufferPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_AVX2 */
#endif /* INCLUDED_volk_8ic_deinterleave_real_16i_u_H */
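To make the scaling convention concrete: every kernel in this header writes the real component promoted to int16_t and multiplied by 128 (the SIMD paths use a left shift by 7 for the same effect). A minimal, self-contained usage sketch of the volk_8ic_deinterleave_real_16i dispatcher under that reading; buffer sizes and sample values are illustrative, not taken from the patch:

#include <volk/volk.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const unsigned int num_points = 4;
    size_t alignment = volk_get_alignment();
    lv_8sc_t* in = (lv_8sc_t*)volk_malloc(num_points * sizeof(lv_8sc_t), alignment);
    int16_t* out = (int16_t*)volk_malloc(num_points * sizeof(int16_t), alignment);

    /* Fill interleaved I/Q bytes through a plain int8_t view, as the kernels do. */
    int8_t* p = (int8_t*)in;
    for (unsigned int n = 0; n < num_points; n++) {
        p[2 * n] = (int8_t)(n + 1); /* real */
        p[2 * n + 1] = -1;          /* imag, discarded by this kernel */
    }

    volk_8ic_deinterleave_real_16i(out, in, num_points);

    for (unsigned int n = 0; n < num_points; n++)
        printf("%d\n", out[n]); /* expected: 128, 256, 384, 512 */

    volk_free(in);
    volk_free(out);
    return 0;
}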
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_8ic_deinterleave_real_8i(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points)
- * \endcode
+ * void volk_8ic_deinterleave_real_8i(int8_t* iBuffer, const lv_8sc_t* complexVector,
+ * unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li complexVector: The complex input vector.
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_8ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, const lv_8sc_t* complexVector,
- unsigned int num_points)
+static inline void volk_8ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const int8_t* complexVectorPtr = (int8_t*)complexVector;
- int8_t* iBufferPtr = iBuffer;
- __m256i moveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
- __m256i moveMask2 = _mm256_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
- __m256i complexVal1, complexVal2, outputVal;
-
- unsigned int thirtysecondPoints = num_points / 32;
-
- for(number = 0; number < thirtysecondPoints; number++){
-
- complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
- complexVectorPtr += 32;
- complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
- complexVectorPtr += 32;
-
- complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1);
- complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2);
- outputVal = _mm256_or_si256(complexVal1, complexVal2);
- outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8);
-
- _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
- iBufferPtr += 32;
- }
-
- number = thirtysecondPoints * 32;
- for(; number < num_points; number++){
- *iBufferPtr++ = *complexVectorPtr++;
- complexVectorPtr++;
- }
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ __m256i moveMask1 = _mm256_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0);
+ __m256i moveMask2 = _mm256_set_epi8(14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80);
+ __m256i complexVal1, complexVal2, outputVal;
+
+ unsigned int thirtysecondPoints = num_points / 32;
+
+ for (number = 0; number < thirtysecondPoints; number++) {
+
+ complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+
+ complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1);
+ complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2);
+ outputVal = _mm256_or_si256(complexVal1, complexVal2);
+ outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8);
+
+ _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
+ iBufferPtr += 32;
+ }
+
+ number = thirtysecondPoints * 32;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSSE3
#include <tmmintrin.h>
-static inline void
-volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector,
- unsigned int num_points)
+static inline void volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const int8_t* complexVectorPtr = (int8_t*)complexVector;
- int8_t* iBufferPtr = iBuffer;
- __m128i moveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
- __m128i moveMask2 = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
- __m128i complexVal1, complexVal2, outputVal;
-
- unsigned int sixteenthPoints = num_points / 16;
-
- for(number = 0; number < sixteenthPoints; number++){
- complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
- complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
-
- complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1);
- complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2);
-
- outputVal = _mm_or_si128(complexVal1, complexVal2);
-
- _mm_store_si128((__m128i*)iBufferPtr, outputVal);
- iBufferPtr += 16;
- }
-
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- *iBufferPtr++ = *complexVectorPtr++;
- complexVectorPtr++;
- }
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ __m128i moveMask1 = _mm_set_epi8(
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+ __m128i moveMask2 = _mm_set_epi8(
+ 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+ __m128i complexVal1, complexVal2, outputVal;
+
+ unsigned int sixteenthPoints = num_points / 16;
+
+ for (number = 0; number < sixteenthPoints; number++) {
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 16;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 16;
+
+ complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1);
+ complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2);
+
+ outputVal = _mm_or_si128(complexVal1, complexVal2);
+
+ _mm_store_si128((__m128i*)iBufferPtr, outputVal);
+ iBufferPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_SSSE3 */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void
-volk_8ic_deinterleave_real_8i_a_avx(int8_t* iBuffer, const lv_8sc_t* complexVector,
- unsigned int num_points)
+static inline void volk_8ic_deinterleave_real_8i_a_avx(int8_t* iBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const int8_t* complexVectorPtr = (int8_t*)complexVector;
- int8_t* iBufferPtr = iBuffer;
- __m128i moveMaskL = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
- __m128i moveMaskH = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
- __m256i complexVal1, complexVal2, outputVal;
- __m128i complexVal1H, complexVal1L, complexVal2H, complexVal2L, outputVal1, outputVal2;
-
- unsigned int thirtysecondPoints = num_points / 32;
-
- for(number = 0; number < thirtysecondPoints; number++){
-
- complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
- complexVectorPtr += 32;
- complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
- complexVectorPtr += 32;
-
- complexVal1H = _mm256_extractf128_si256(complexVal1, 1);
- complexVal1L = _mm256_extractf128_si256(complexVal1, 0);
- complexVal2H = _mm256_extractf128_si256(complexVal2, 1);
- complexVal2L = _mm256_extractf128_si256(complexVal2, 0);
-
- complexVal1H = _mm_shuffle_epi8(complexVal1H, moveMaskH);
- complexVal1L = _mm_shuffle_epi8(complexVal1L, moveMaskL);
- outputVal1 = _mm_or_si128(complexVal1H, complexVal1L);
-
-
- complexVal2H = _mm_shuffle_epi8(complexVal2H, moveMaskH);
- complexVal2L = _mm_shuffle_epi8(complexVal2L, moveMaskL);
- outputVal2 = _mm_or_si128(complexVal2H, complexVal2L);
-
- __m256i dummy = _mm256_setzero_si256();
- outputVal = _mm256_insertf128_si256(dummy, outputVal1, 0);
- outputVal = _mm256_insertf128_si256(outputVal, outputVal2, 1);
-
-
- _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
- iBufferPtr += 32;
- }
-
- number = thirtysecondPoints * 32;
- for(; number < num_points; number++){
- *iBufferPtr++ = *complexVectorPtr++;
- complexVectorPtr++;
- }
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ __m128i moveMaskL = _mm_set_epi8(
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+ __m128i moveMaskH = _mm_set_epi8(
+ 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+ __m256i complexVal1, complexVal2, outputVal;
+ __m128i complexVal1H, complexVal1L, complexVal2H, complexVal2L, outputVal1,
+ outputVal2;
+
+ unsigned int thirtysecondPoints = num_points / 32;
+
+ for (number = 0; number < thirtysecondPoints; number++) {
+
+ complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+
+ complexVal1H = _mm256_extractf128_si256(complexVal1, 1);
+ complexVal1L = _mm256_extractf128_si256(complexVal1, 0);
+ complexVal2H = _mm256_extractf128_si256(complexVal2, 1);
+ complexVal2L = _mm256_extractf128_si256(complexVal2, 0);
+
+ complexVal1H = _mm_shuffle_epi8(complexVal1H, moveMaskH);
+ complexVal1L = _mm_shuffle_epi8(complexVal1L, moveMaskL);
+ outputVal1 = _mm_or_si128(complexVal1H, complexVal1L);
+
+
+ complexVal2H = _mm_shuffle_epi8(complexVal2H, moveMaskH);
+ complexVal2L = _mm_shuffle_epi8(complexVal2L, moveMaskL);
+ outputVal2 = _mm_or_si128(complexVal2H, complexVal2L);
+
+ __m256i dummy = _mm256_setzero_si256();
+ outputVal = _mm256_insertf128_si256(dummy, outputVal1, 0);
+ outputVal = _mm256_insertf128_si256(outputVal, outputVal2, 1);
+
+
+ _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
+ iBufferPtr += 32;
+ }
+
+ number = thirtysecondPoints * 32;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer, const lv_8sc_t* complexVector,
- unsigned int num_points)
+static inline void volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const int8_t* complexVectorPtr = (int8_t*)complexVector;
- int8_t* iBufferPtr = iBuffer;
- for(number = 0; number < num_points; number++){
- *iBufferPtr++ = *complexVectorPtr++;
- complexVectorPtr++;
- }
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void
-volk_8ic_deinterleave_real_8i_neon(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points)
+static inline void volk_8ic_deinterleave_real_8i_neon(int8_t* iBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number;
- unsigned int sixteenth_points = num_points / 16;
-
- int8x16x2_t input_vector;
- for(number=0; number < sixteenth_points; ++number) {
- input_vector = vld2q_s8((int8_t*) complexVector );
- vst1q_s8(iBuffer, input_vector.val[0]);
- iBuffer += 16;
- complexVector += 16;
- }
-
- const int8_t* complexVectorPtr = (int8_t*)complexVector;
- int8_t* iBufferPtr = iBuffer;
- for(number = sixteenth_points*16; number < num_points; number++){
- *iBufferPtr++ = *complexVectorPtr++;
- complexVectorPtr++;
- }
+ unsigned int number;
+ unsigned int sixteenth_points = num_points / 16;
+
+ int8x16x2_t input_vector;
+ for (number = 0; number < sixteenth_points; ++number) {
+ input_vector = vld2q_s8((int8_t*)complexVector);
+ vst1q_s8(iBuffer, input_vector.val[0]);
+ iBuffer += 16;
+ complexVector += 16;
+ }
+
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ for (number = sixteenth_points * 16; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_NEON */
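The NEON kernel above relies on vld2q_s8 performing the de-interleave during the load: it reads 32 interleaved bytes and returns an int8x16x2_t whose val[0] holds the even-indexed bytes (the real parts) and val[1] the odd-indexed bytes (the imaginary parts), so only val[0] has to be stored. A minimal sketch of that behaviour (function and parameter names are illustrative):

#include <arm_neon.h>
#include <stdint.h>

/* Split 16 interleaved (I, Q) byte pairs into separate I and Q arrays. */
static void demo_vld2_deinterleave(const int8_t interleaved[32],
                                   int8_t i_out[16],
                                   int8_t q_out[16])
{
    int8x16x2_t v = vld2q_s8(interleaved); /* val[0] = I bytes, val[1] = Q bytes */
    vst1q_s8(i_out, v.val[0]);
    vst1q_s8(q_out, v.val[1]);
}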
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_8ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer, const lv_8sc_t* complexVector,
- unsigned int num_points)
+static inline void volk_8ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const int8_t* complexVectorPtr = (int8_t*)complexVector;
- int8_t* iBufferPtr = iBuffer;
- __m256i moveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
- __m256i moveMask2 = _mm256_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
- __m256i complexVal1, complexVal2, outputVal;
-
- unsigned int thirtysecondPoints = num_points / 32;
-
- for(number = 0; number < thirtysecondPoints; number++){
-
- complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
- complexVectorPtr += 32;
- complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
- complexVectorPtr += 32;
-
- complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1);
- complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2);
- outputVal = _mm256_or_si256(complexVal1, complexVal2);
- outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8);
-
- _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal);
- iBufferPtr += 32;
- }
-
- number = thirtysecondPoints * 32;
- for(; number < num_points; number++){
- *iBufferPtr++ = *complexVectorPtr++;
- complexVectorPtr++;
- }
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ __m256i moveMask1 = _mm256_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0);
+ __m256i moveMask2 = _mm256_set_epi8(14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80);
+ __m256i complexVal1, complexVal2, outputVal;
+
+ unsigned int thirtysecondPoints = num_points / 32;
+
+ for (number = 0; number < thirtysecondPoints; number++) {
+
+ complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+
+ complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1);
+ complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2);
+ outputVal = _mm256_or_si256(complexVal1, complexVal2);
+ outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8);
+
+ _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal);
+ iBufferPtr += 32;
+ }
+
+ number = thirtysecondPoints * 32;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_AVX2 */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_8ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points)
- * \endcode
+ * void volk_8ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer,
+ * const lv_8sc_t* complexVector, const float scalar, unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li complexVector: The complex input vector.
#ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H
#define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H
-#include <volk/volk_common.h>
#include <inttypes.h>
#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
static inline void
-volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector,
- const float scalar, unsigned int num_points)
+volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer,
+ float* qBuffer,
+ const lv_8sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- float* iBufferPtr = iBuffer;
- float* qBufferPtr = qBuffer;
-
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
- __m128 iFloatValue, qFloatValue;
-
- const float iScalar= 1.0 / scalar;
- __m128 invScalar = _mm_set_ps1(iScalar);
- __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
- int8_t* complexVectorPtr = (int8_t*)complexVector;
-
- __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
- __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
-
- for(;number < eighthPoints; number++){
- complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
- iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask);
- qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask);
-
- iIntVal = _mm_cvtepi8_epi32(iComplexVal);
- iFloatValue = _mm_cvtepi32_ps(iIntVal);
- iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
- _mm_store_ps(iBufferPtr, iFloatValue);
- iBufferPtr += 4;
-
- iComplexVal = _mm_srli_si128(iComplexVal, 4);
-
- iIntVal = _mm_cvtepi8_epi32(iComplexVal);
- iFloatValue = _mm_cvtepi32_ps(iIntVal);
- iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
- _mm_store_ps(iBufferPtr, iFloatValue);
- iBufferPtr += 4;
-
- qIntVal = _mm_cvtepi8_epi32(qComplexVal);
- qFloatValue = _mm_cvtepi32_ps(qIntVal);
- qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
- _mm_store_ps(qBufferPtr, qFloatValue);
- qBufferPtr += 4;
-
- qComplexVal = _mm_srli_si128(qComplexVal, 4);
-
- qIntVal = _mm_cvtepi8_epi32(qComplexVal);
- qFloatValue = _mm_cvtepi32_ps(qIntVal);
- qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
- _mm_store_ps(qBufferPtr, qFloatValue);
-
- qBufferPtr += 4;
- }
-
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
- *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
- }
-
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+ __m128 iFloatValue, qFloatValue;
+
+ const float iScalar = 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
+ int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+ __m128i iMoveMask = _mm_set_epi8(
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+ __m128i qMoveMask = _mm_set_epi8(
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+
+ for (; number < eighthPoints; number++) {
+ complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 16;
+ iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask);
+ qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask);
+
+ iIntVal = _mm_cvtepi8_epi32(iComplexVal);
+ iFloatValue = _mm_cvtepi32_ps(iIntVal);
+ iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+ _mm_store_ps(iBufferPtr, iFloatValue);
+ iBufferPtr += 4;
+
+ iComplexVal = _mm_srli_si128(iComplexVal, 4);
+
+ iIntVal = _mm_cvtepi8_epi32(iComplexVal);
+ iFloatValue = _mm_cvtepi32_ps(iIntVal);
+ iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+ _mm_store_ps(iBufferPtr, iFloatValue);
+ iBufferPtr += 4;
+
+ qIntVal = _mm_cvtepi8_epi32(qComplexVal);
+ qFloatValue = _mm_cvtepi32_ps(qIntVal);
+ qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
+ _mm_store_ps(qBufferPtr, qFloatValue);
+ qBufferPtr += 4;
+
+ qComplexVal = _mm_srli_si128(qComplexVal, 4);
+
+ qIntVal = _mm_cvtepi8_epi32(qComplexVal);
+ qFloatValue = _mm_cvtepi32_ps(qIntVal);
+ qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
+ _mm_store_ps(qBufferPtr, qFloatValue);
+
+ qBufferPtr += 4;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+ *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+ }
}
#endif /* LV_HAVE_SSE4_1 */
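A note on the conversion pattern in the SSE4.1 kernel above: _mm_cvtepi8_epi32 sign-extends only the four lowest bytes of its source register, so the kernel converts four samples, shifts the packed register right by four bytes with _mm_srli_si128, and converts the next four. A small sketch of just that step (names are illustrative):

#include <smmintrin.h> /* SSE4.1: _mm_cvtepi8_epi32 */
#include <stdint.h>

/* Sign-extend the first eight bytes of 'bytes' to 32-bit integers. */
static void demo_cvtepi8_epi32(const int8_t bytes[16], int32_t out[8])
{
    __m128i v = _mm_loadu_si128((const __m128i*)bytes);
    _mm_storeu_si128((__m128i*)&out[0], _mm_cvtepi8_epi32(v)); /* bytes 0..3 */
    v = _mm_srli_si128(v, 4);                                  /* drop them  */
    _mm_storeu_si128((__m128i*)&out[4], _mm_cvtepi8_epi32(v)); /* bytes 4..7 */
}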
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
-static inline void
-volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer,
- const lv_8sc_t* complexVector,
- const float scalar, unsigned int num_points)
+static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer,
+ float* qBuffer,
+ const lv_8sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- float* iBufferPtr = iBuffer;
- float* qBufferPtr = qBuffer;
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
- __m128 cplxValue1, cplxValue2, iValue, qValue;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 cplxValue1, cplxValue2, iValue, qValue;
- __m128 invScalar = _mm_set_ps1(1.0/scalar);
- int8_t* complexVectorPtr = (int8_t*)complexVector;
+ __m128 invScalar = _mm_set_ps1(1.0 / scalar);
+ int8_t* complexVectorPtr = (int8_t*)complexVector;
- __VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
+ __VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
- for(;number < quarterPoints; number++){
- floatBuffer[0] = (float)(complexVectorPtr[0]);
- floatBuffer[1] = (float)(complexVectorPtr[1]);
- floatBuffer[2] = (float)(complexVectorPtr[2]);
- floatBuffer[3] = (float)(complexVectorPtr[3]);
+ for (; number < quarterPoints; number++) {
+ floatBuffer[0] = (float)(complexVectorPtr[0]);
+ floatBuffer[1] = (float)(complexVectorPtr[1]);
+ floatBuffer[2] = (float)(complexVectorPtr[2]);
+ floatBuffer[3] = (float)(complexVectorPtr[3]);
- floatBuffer[4] = (float)(complexVectorPtr[4]);
- floatBuffer[5] = (float)(complexVectorPtr[5]);
- floatBuffer[6] = (float)(complexVectorPtr[6]);
- floatBuffer[7] = (float)(complexVectorPtr[7]);
+ floatBuffer[4] = (float)(complexVectorPtr[4]);
+ floatBuffer[5] = (float)(complexVectorPtr[5]);
+ floatBuffer[6] = (float)(complexVectorPtr[6]);
+ floatBuffer[7] = (float)(complexVectorPtr[7]);
- cplxValue1 = _mm_load_ps(&floatBuffer[0]);
- cplxValue2 = _mm_load_ps(&floatBuffer[4]);
+ cplxValue1 = _mm_load_ps(&floatBuffer[0]);
+ cplxValue2 = _mm_load_ps(&floatBuffer[4]);
- complexVectorPtr += 8;
+ complexVectorPtr += 8;
- cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
- cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
- // Arrange in i1i2i3i4 format
- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
- _mm_store_ps(iBufferPtr, iValue);
- _mm_store_ps(qBufferPtr, qValue);
+ _mm_store_ps(iBufferPtr, iValue);
+ _mm_store_ps(qBufferPtr, qValue);
- iBufferPtr += 4;
- qBufferPtr += 4;
- }
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
- number = quarterPoints * 4;
- complexVectorPtr = (int8_t*)&complexVector[number];
- for(; number < num_points; number++){
- *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
- *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
- }
+ number = quarterPoints * 4;
+ complexVectorPtr = (int8_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_8ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector,
- const float scalar, unsigned int num_points)
+static inline void volk_8ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer,
+ float* qBuffer,
+ const lv_8sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- float* iBufferPtr = iBuffer;
- float* qBufferPtr = qBuffer;
-
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
- __m256 iFloatValue, qFloatValue;
-
- const float iScalar= 1.0 / scalar;
- __m256 invScalar = _mm256_set1_ps(iScalar);
- __m256i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
- int8_t* complexVectorPtr = (int8_t*)complexVector;
-
- __m256i iMoveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- 14, 12, 10, 8, 6, 4, 2, 0,
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- 14, 12, 10, 8, 6, 4, 2, 0);
- __m256i qMoveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- 15, 13, 11, 9, 7, 5, 3, 1,
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- 15, 13, 11, 9, 7, 5, 3, 1);
-
- for(;number < sixteenthPoints; number++){
- complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
- complexVectorPtr += 32;
- iComplexVal = _mm256_shuffle_epi8(complexVal, iMoveMask);
- qComplexVal = _mm256_shuffle_epi8(complexVal, qMoveMask);
-
- iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
- iFloatValue = _mm256_cvtepi32_ps(iIntVal);
- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
- _mm256_store_ps(iBufferPtr, iFloatValue);
- iBufferPtr += 8;
-
- iComplexVal = _mm256_permute4x64_epi64(iComplexVal, 0b11000110);
- iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
- iFloatValue = _mm256_cvtepi32_ps(iIntVal);
- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
- _mm256_store_ps(iBufferPtr, iFloatValue);
- iBufferPtr += 8;
-
- qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
- qFloatValue = _mm256_cvtepi32_ps(qIntVal);
- qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
- _mm256_store_ps(qBufferPtr, qFloatValue);
- qBufferPtr += 8;
-
- qComplexVal = _mm256_permute4x64_epi64(qComplexVal, 0b11000110);
- qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
- qFloatValue = _mm256_cvtepi32_ps(qIntVal);
- qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
- _mm256_store_ps(qBufferPtr, qFloatValue);
- qBufferPtr += 8;
- }
-
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
- *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
- }
-
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+ __m256 iFloatValue, qFloatValue;
+
+ const float iScalar = 1.0 / scalar;
+ __m256 invScalar = _mm256_set1_ps(iScalar);
+ __m256i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
+ int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+ __m256i iMoveMask = _mm256_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0);
+ __m256i qMoveMask = _mm256_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 15,
+ 13,
+ 11,
+ 9,
+ 7,
+ 5,
+ 3,
+ 1,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 15,
+ 13,
+ 11,
+ 9,
+ 7,
+ 5,
+ 3,
+ 1);
+
+ for (; number < sixteenthPoints; number++) {
+ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ iComplexVal = _mm256_shuffle_epi8(complexVal, iMoveMask);
+ qComplexVal = _mm256_shuffle_epi8(complexVal, qMoveMask);
+
+ iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
+ iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+ _mm256_store_ps(iBufferPtr, iFloatValue);
+ iBufferPtr += 8;
+
+ iComplexVal = _mm256_permute4x64_epi64(iComplexVal, 0b11000110);
+ iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
+ iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+ _mm256_store_ps(iBufferPtr, iFloatValue);
+ iBufferPtr += 8;
+
+ qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
+ qFloatValue = _mm256_cvtepi32_ps(qIntVal);
+ qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
+ _mm256_store_ps(qBufferPtr, qFloatValue);
+ qBufferPtr += 8;
+
+ qComplexVal = _mm256_permute4x64_epi64(qComplexVal, 0b11000110);
+ qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
+ qFloatValue = _mm256_cvtepi32_ps(qIntVal);
+ qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
+ _mm256_store_ps(qBufferPtr, qFloatValue);
+ qBufferPtr += 8;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+ *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_GENERIC
static inline void
-volk_8ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer,
+volk_8ic_s32f_deinterleave_32f_x2_generic(float* iBuffer,
+ float* qBuffer,
const lv_8sc_t* complexVector,
- const float scalar, unsigned int num_points)
+ const float scalar,
+ unsigned int num_points)
{
- const int8_t* complexVectorPtr = (const int8_t*)complexVector;
- float* iBufferPtr = iBuffer;
- float* qBufferPtr = qBuffer;
- unsigned int number;
- const float invScalar = 1.0 / scalar;
- for(number = 0; number < num_points; number++){
- *iBufferPtr++ = (float)(*complexVectorPtr++)*invScalar;
- *qBufferPtr++ = (float)(*complexVectorPtr++)*invScalar;
- }
+ const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+ unsigned int number;
+ const float invScalar = 1.0 / scalar;
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = (float)(*complexVectorPtr++) * invScalar;
+ *qBufferPtr++ = (float)(*complexVectorPtr++) * invScalar;
+ }
}
#endif /* LV_HAVE_GENERIC */
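The generic kernel above defines the reference behaviour: each 8-bit I and Q component is divided by scalar, so a caller passing, say, scalar = 128.0f maps int8 full scale roughly onto [-1, 1). A tiny self-contained illustration of that convention (the scalar and sample values are made up for the example):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const float scalar = 128.0f; /* illustrative value, not mandated by the kernel */
    const int8_t interleaved[4] = { 64, -128, 32, 16 }; /* I0, Q0, I1, Q1 */
    float i_buf[2], q_buf[2];

    for (int n = 0; n < 2; n++) {
        i_buf[n] = (float)interleaved[2 * n] / scalar;
        q_buf[n] = (float)interleaved[2 * n + 1] / scalar;
    }
    printf("%f %f %f %f\n", i_buf[0], q_buf[0], i_buf[1], q_buf[1]);
    /* prints 0.500000 -1.000000 0.250000 0.125000 */
    return 0;
}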
#ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H
#define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H
-#include <volk/volk_common.h>
#include <inttypes.h>
#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_8ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector,
- const float scalar, unsigned int num_points)
+static inline void volk_8ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer,
+ float* qBuffer,
+ const lv_8sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- float* iBufferPtr = iBuffer;
- float* qBufferPtr = qBuffer;
-
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
- __m256 iFloatValue, qFloatValue;
-
- const float iScalar= 1.0 / scalar;
- __m256 invScalar = _mm256_set1_ps(iScalar);
- __m256i complexVal, iIntVal, qIntVal;
- __m128i iComplexVal, qComplexVal;
- int8_t* complexVectorPtr = (int8_t*)complexVector;
-
- __m256i MoveMask = _mm256_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8,
- 6, 4, 2, 0,15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
-
- for(;number < sixteenthPoints; number++){
- complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
- complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
- complexVal = _mm256_permute4x64_epi64(complexVal,0xd8);
- iComplexVal = _mm256_extractf128_si256(complexVal,0);
- qComplexVal = _mm256_extractf128_si256(complexVal,1);
-
- iIntVal = _mm256_cvtepi8_epi32(iComplexVal);
- iFloatValue = _mm256_cvtepi32_ps(iIntVal);
- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
- _mm256_storeu_ps(iBufferPtr, iFloatValue);
- iBufferPtr += 8;
-
- qIntVal = _mm256_cvtepi8_epi32(qComplexVal);
- qFloatValue = _mm256_cvtepi32_ps(qIntVal);
- qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
- _mm256_storeu_ps(qBufferPtr, qFloatValue);
- qBufferPtr += 8;
-
- complexVal = _mm256_srli_si256(complexVal, 8);
- iComplexVal = _mm256_extractf128_si256(complexVal,0);
- qComplexVal = _mm256_extractf128_si256(complexVal,1);
-
- iIntVal = _mm256_cvtepi8_epi32(iComplexVal);
- iFloatValue = _mm256_cvtepi32_ps(iIntVal);
- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
- _mm256_storeu_ps(iBufferPtr, iFloatValue);
- iBufferPtr += 8;
-
- qIntVal = _mm256_cvtepi8_epi32(qComplexVal);
- qFloatValue = _mm256_cvtepi32_ps(qIntVal);
- qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
- _mm256_storeu_ps(qBufferPtr, qFloatValue);
- qBufferPtr += 8;
- }
-
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
- *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
- }
-
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+ __m256 iFloatValue, qFloatValue;
+
+ const float iScalar = 1.0 / scalar;
+ __m256 invScalar = _mm256_set1_ps(iScalar);
+ __m256i complexVal, iIntVal, qIntVal;
+ __m128i iComplexVal, qComplexVal;
+ int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+ __m256i MoveMask = _mm256_set_epi8(15,
+ 13,
+ 11,
+ 9,
+ 7,
+ 5,
+ 3,
+ 1,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0,
+ 15,
+ 13,
+ 11,
+ 9,
+ 7,
+ 5,
+ 3,
+ 1,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0);
+
+ for (; number < sixteenthPoints; number++) {
+ complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
+ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
+ iComplexVal = _mm256_extractf128_si256(complexVal, 0);
+ qComplexVal = _mm256_extractf128_si256(complexVal, 1);
+
+ iIntVal = _mm256_cvtepi8_epi32(iComplexVal);
+ iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+ _mm256_storeu_ps(iBufferPtr, iFloatValue);
+ iBufferPtr += 8;
+
+ qIntVal = _mm256_cvtepi8_epi32(qComplexVal);
+ qFloatValue = _mm256_cvtepi32_ps(qIntVal);
+ qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
+ _mm256_storeu_ps(qBufferPtr, qFloatValue);
+ qBufferPtr += 8;
+
+ complexVal = _mm256_srli_si256(complexVal, 8);
+ iComplexVal = _mm256_extractf128_si256(complexVal, 0);
+ qComplexVal = _mm256_extractf128_si256(complexVal, 1);
+
+ iIntVal = _mm256_cvtepi8_epi32(iComplexVal);
+ iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+ _mm256_storeu_ps(iBufferPtr, iFloatValue);
+ iBufferPtr += 8;
+
+ qIntVal = _mm256_cvtepi8_epi32(qComplexVal);
+ qFloatValue = _mm256_cvtepi32_ps(qIntVal);
+ qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
+ _mm256_storeu_ps(qBufferPtr, qFloatValue);
+ qBufferPtr += 8;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+ *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+ }
}
#endif /* LV_HAVE_AVX2 */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_8ic_s32f_deinterleave_real_32f(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points)
- * \endcode
+ * void volk_8ic_s32f_deinterleave_real_32f(float* iBuffer,
+ * const lv_8sc_t* complexVector, const float scalar, unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li complexVector: The complex input vector.
#ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H
#define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H
-#include <volk/volk_common.h>
#include <inttypes.h>
#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void
-volk_8ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer, const lv_8sc_t* complexVector,
- const float scalar, unsigned int num_points)
+volk_8ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer,
+ const lv_8sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- float* iBufferPtr = iBuffer;
-
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
- __m256 iFloatValue;
-
- const float iScalar= 1.0 / scalar;
- __m256 invScalar = _mm256_set1_ps(iScalar);
- __m256i complexVal, iIntVal;
- int8_t* complexVectorPtr = (int8_t*)complexVector;
-
- __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- 14, 12, 10, 8, 6, 4, 2, 0,
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- 14, 12, 10, 8, 6, 4, 2, 0);
- for(;number < sixteenthPoints; number++){
- complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
- complexVectorPtr += 32;
- complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
-
- iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(complexVal));
- iFloatValue = _mm256_cvtepi32_ps(iIntVal);
- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
- _mm256_store_ps(iBufferPtr, iFloatValue);
- iBufferPtr += 8;
-
- complexVal = _mm256_permute4x64_epi64(complexVal, 0b11000110);
- iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(complexVal));
- iFloatValue = _mm256_cvtepi32_ps(iIntVal);
- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
- _mm256_store_ps(iBufferPtr, iFloatValue);
- iBufferPtr += 8;
- }
-
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
- complexVectorPtr++;
- }
-
+ float* iBufferPtr = iBuffer;
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+ __m256 iFloatValue;
+
+ const float iScalar = 1.0 / scalar;
+ __m256 invScalar = _mm256_set1_ps(iScalar);
+ __m256i complexVal, iIntVal;
+ int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+ __m256i moveMask = _mm256_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0);
+ for (; number < sixteenthPoints; number++) {
+ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
+
+ iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(complexVal));
+ iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+ _mm256_store_ps(iBufferPtr, iFloatValue);
+ iBufferPtr += 8;
+
+ complexVal = _mm256_permute4x64_epi64(complexVal, 0b11000110);
+ iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(complexVal));
+ iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+ _mm256_store_ps(iBufferPtr, iFloatValue);
+ iBufferPtr += 8;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_AVX2 */
#include <smmintrin.h>
static inline void
-volk_8ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_8sc_t* complexVector,
- const float scalar, unsigned int num_points)
+volk_8ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer,
+ const lv_8sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- float* iBufferPtr = iBuffer;
-
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
- __m128 iFloatValue;
+ float* iBufferPtr = iBuffer;
- const float iScalar= 1.0 / scalar;
- __m128 invScalar = _mm_set_ps1(iScalar);
- __m128i complexVal, iIntVal;
- int8_t* complexVectorPtr = (int8_t*)complexVector;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+ __m128 iFloatValue;
- __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+ const float iScalar = 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ __m128i complexVal, iIntVal;
+ int8_t* complexVectorPtr = (int8_t*)complexVector;
- for(;number < eighthPoints; number++){
- complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
- complexVal = _mm_shuffle_epi8(complexVal, moveMask);
+ __m128i moveMask = _mm_set_epi8(
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
- iIntVal = _mm_cvtepi8_epi32(complexVal);
- iFloatValue = _mm_cvtepi32_ps(iIntVal);
+ for (; number < eighthPoints; number++) {
+ complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 16;
+ complexVal = _mm_shuffle_epi8(complexVal, moveMask);
- iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+ iIntVal = _mm_cvtepi8_epi32(complexVal);
+ iFloatValue = _mm_cvtepi32_ps(iIntVal);
- _mm_store_ps(iBufferPtr, iFloatValue);
+ iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
- iBufferPtr += 4;
+ _mm_store_ps(iBufferPtr, iFloatValue);
- complexVal = _mm_srli_si128(complexVal, 4);
- iIntVal = _mm_cvtepi8_epi32(complexVal);
- iFloatValue = _mm_cvtepi32_ps(iIntVal);
+ iBufferPtr += 4;
- iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+ complexVal = _mm_srli_si128(complexVal, 4);
+ iIntVal = _mm_cvtepi8_epi32(complexVal);
+ iFloatValue = _mm_cvtepi32_ps(iIntVal);
- _mm_store_ps(iBufferPtr, iFloatValue);
+ iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
- iBufferPtr += 4;
- }
+ _mm_store_ps(iBufferPtr, iFloatValue);
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
- complexVectorPtr++;
- }
+ iBufferPtr += 4;
+ }
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_SSE4_1 */
#include <xmmintrin.h>
static inline void
-volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, const lv_8sc_t* complexVector,
- const float scalar, unsigned int num_points)
+volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer,
+ const lv_8sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- float* iBufferPtr = iBuffer;
-
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
- __m128 iValue;
+ float* iBufferPtr = iBuffer;
- const float iScalar= 1.0 / scalar;
- __m128 invScalar = _mm_set_ps1(iScalar);
- int8_t* complexVectorPtr = (int8_t*)complexVector;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 iValue;
- __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+ const float iScalar = 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ int8_t* complexVectorPtr = (int8_t*)complexVector;
- for(;number < quarterPoints; number++){
- floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2;
- floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2;
- floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2;
- floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
- iValue = _mm_load_ps(floatBuffer);
+ for (; number < quarterPoints; number++) {
+ floatBuffer[0] = (float)(*complexVectorPtr);
+ complexVectorPtr += 2;
+ floatBuffer[1] = (float)(*complexVectorPtr);
+ complexVectorPtr += 2;
+ floatBuffer[2] = (float)(*complexVectorPtr);
+ complexVectorPtr += 2;
+ floatBuffer[3] = (float)(*complexVectorPtr);
+ complexVectorPtr += 2;
- iValue = _mm_mul_ps(iValue, invScalar);
+ iValue = _mm_load_ps(floatBuffer);
- _mm_store_ps(iBufferPtr, iValue);
+ iValue = _mm_mul_ps(iValue, invScalar);
- iBufferPtr += 4;
- }
+ _mm_store_ps(iBufferPtr, iValue);
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
- complexVectorPtr++;
- }
+ iBufferPtr += 4;
+ }
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
static inline void
-volk_8ic_s32f_deinterleave_real_32f_generic(float* iBuffer, const lv_8sc_t* complexVector,
- const float scalar, unsigned int num_points)
+volk_8ic_s32f_deinterleave_real_32f_generic(float* iBuffer,
+ const lv_8sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const int8_t* complexVectorPtr = (const int8_t*)complexVector;
- float* iBufferPtr = iBuffer;
- const float invScalar = 1.0 / scalar;
- for(number = 0; number < num_points; number++){
- *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
- complexVectorPtr++;
- }
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+ float* iBufferPtr = iBuffer;
+ const float invScalar = 1.0 / scalar;
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_GENERIC */
-
#endif /* INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H */
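The _a_H header that ends here holds the aligned variants (aligned loads and stores such as _mm256_load_si256 and _mm256_store_ps), while the _u_H header that follows provides the unaligned counterparts; the VOLK dispatcher is expected to pick between them based on the alignment of the buffers it is given. A hedged usage sketch of the volk_8ic_s32f_deinterleave_real_32f dispatcher with a VOLK-aligned output buffer (the sizes and the scalar value are illustrative):

#include <volk/volk.h>

void deinterleave_real_example(const lv_8sc_t* samples, unsigned int num_points)
{
    /* Aligned allocation lets the dispatcher use the aligned kernels above;
     * any other buffer still works through the unaligned kernels below. */
    float* real = (float*)volk_malloc(num_points * sizeof(float), volk_get_alignment());
    const float scalar = 128.0f; /* illustrative scaling factor */

    volk_8ic_s32f_deinterleave_real_32f(real, samples, scalar, num_points);

    /* ... use real[0 .. num_points-1] ... */
    volk_free(real);
}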
#ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_u_H
#define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_u_H
-#include <volk/volk_common.h>
#include <inttypes.h>
#include <stdio.h>
+#include <volk/volk_common.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void
-volk_8ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer, const lv_8sc_t* complexVector,
- const float scalar, unsigned int num_points)
+volk_8ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer,
+ const lv_8sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
{
- float* iBufferPtr = iBuffer;
-
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
- __m256 iFloatValue;
-
- const float iScalar= 1.0 / scalar;
- __m256 invScalar = _mm256_set1_ps(iScalar);
- __m256i complexVal, iIntVal;
- __m128i hcomplexVal;
- int8_t* complexVectorPtr = (int8_t*)complexVector;
-
- __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
-
- for(;number < sixteenthPoints; number++){
- complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
- complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
-
- hcomplexVal = _mm256_extracti128_si256(complexVal,0);
- iIntVal = _mm256_cvtepi8_epi32(hcomplexVal);
- iFloatValue = _mm256_cvtepi32_ps(iIntVal);
-
- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
-
- _mm256_storeu_ps(iBufferPtr, iFloatValue);
-
- iBufferPtr += 8;
-
- hcomplexVal = _mm256_extracti128_si256(complexVal,1);
- iIntVal = _mm256_cvtepi8_epi32(hcomplexVal);
- iFloatValue = _mm256_cvtepi32_ps(iIntVal);
-
- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
-
- _mm256_storeu_ps(iBufferPtr, iFloatValue);
-
- iBufferPtr += 8;
- }
-
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
- complexVectorPtr++;
- }
-
+ float* iBufferPtr = iBuffer;
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+ __m256 iFloatValue;
+
+ const float iScalar = 1.0 / scalar;
+ __m256 invScalar = _mm256_set1_ps(iScalar);
+ __m256i complexVal, iIntVal;
+ __m128i hcomplexVal;
+ int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+ __m256i moveMask = _mm256_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0);
+
+ for (; number < sixteenthPoints; number++) {
+ complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
+
+ hcomplexVal = _mm256_extracti128_si256(complexVal, 0);
+ iIntVal = _mm256_cvtepi8_epi32(hcomplexVal);
+ iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+
+ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+
+ _mm256_storeu_ps(iBufferPtr, iFloatValue);
+
+ iBufferPtr += 8;
+
+ hcomplexVal = _mm256_extracti128_si256(complexVal, 1);
+ iIntVal = _mm256_cvtepi8_epi32(hcomplexVal);
+ iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+
+ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+
+ _mm256_storeu_ps(iBufferPtr, iFloatValue);
+
+ iBufferPtr += 8;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+ complexVectorPtr++;
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
/*!
- \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
- \param cVector The complex vector where the results will be stored
- \param aVector One of the complex vectors to be multiplied
- \param bVector The complex vector which will be converted to complex conjugate and multiplied
- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ \brief Multiplies one complex vector by the complex conjugate of a second complex
+ vector and stores the results in a third vector
+ \param cVector The complex vector where the results will be stored
+ \param aVector One of the complex vectors to be multiplied
+ \param bVector The complex vector which will be conjugated and multiplied
+ \param num_points The number of complex values in aVector and bVector to be
+ multiplied together and stored into cVector
*/
-static inline void volk_8ic_x2_multiply_conjugate_16ic_a_avx2(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 8;
-
- __m256i x, y, realz, imagz;
- lv_16sc_t* c = cVector;
- const lv_8sc_t* a = aVector;
- const lv_8sc_t* b = bVector;
- __m256i conjugateSign = _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
-
- for(;number < quarterPoints; number++){
- // Convert 8 bit values into 16 bit values
- x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
- y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));
-
- // Calculate the ar*cr - ai*(-ci) portions
- realz = _mm256_madd_epi16(x,y);
-
- // Calculate the complex conjugate of the cr + ci j values
- y = _mm256_sign_epi16(y, conjugateSign);
-
- // Shift the order of the cr and ci values
- y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1));
-
- // Calculate the ar*(-ci) + cr*(ai)
- imagz = _mm256_madd_epi16(x,y);
-
- // Perform the addition of products
-
- _mm256_store_si256((__m256i*)c, _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz), _mm256_unpackhi_epi32(realz, imagz)));
-
- a += 8;
- b += 8;
- c += 8;
- }
-
- number = quarterPoints * 8;
- int16_t* c16Ptr = (int16_t*)&cVector[number];
- int8_t* a8Ptr = (int8_t*)&aVector[number];
- int8_t* b8Ptr = (int8_t*)&bVector[number];
- for(; number < num_points; number++){
- float aReal = (float)*a8Ptr++;
- float aImag = (float)*a8Ptr++;
- lv_32fc_t aVal = lv_cmake(aReal, aImag );
- float bReal = (float)*b8Ptr++;
- float bImag = (float)*b8Ptr++;
- lv_32fc_t bVal = lv_cmake( bReal, -bImag );
- lv_32fc_t temp = aVal * bVal;
-
- *c16Ptr++ = (int16_t)lv_creal(temp);
- *c16Ptr++ = (int16_t)lv_cimag(temp);
- }
+static inline void volk_8ic_x2_multiply_conjugate_16ic_a_avx2(lv_16sc_t* cVector,
+ const lv_8sc_t* aVector,
+ const lv_8sc_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 8;
+
+ __m256i x, y, realz, imagz;
+ lv_16sc_t* c = cVector;
+ const lv_8sc_t* a = aVector;
+ const lv_8sc_t* b = bVector;
+ __m256i conjugateSign =
+ _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
+
+ for (; number < quarterPoints; number++) {
+ // Convert 8 bit values into 16 bit values
+ x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
+ y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));
+
+ // Calculate the ar*cr - ai*(-ci) portions
+ realz = _mm256_madd_epi16(x, y);
+
+ // Calculate the complex conjugate of the cr + ci j values
+ y = _mm256_sign_epi16(y, conjugateSign);
+
+ // Shift the order of the cr and ci values
+ y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
+ _MM_SHUFFLE(2, 3, 0, 1));
+
+ // Calculate the ar*(-ci) + cr*(ai)
+ imagz = _mm256_madd_epi16(x, y);
+
+ // Perform the addition of products
+
+ _mm256_store_si256((__m256i*)c,
+ _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz),
+ _mm256_unpackhi_epi32(realz, imagz)));
+
+ a += 8;
+ b += 8;
+ c += 8;
+ }
+
+ number = quarterPoints * 8;
+ int16_t* c16Ptr = (int16_t*)&cVector[number];
+ int8_t* a8Ptr = (int8_t*)&aVector[number];
+ int8_t* b8Ptr = (int8_t*)&bVector[number];
+ for (; number < num_points; number++) {
+ float aReal = (float)*a8Ptr++;
+ float aImag = (float)*a8Ptr++;
+ lv_32fc_t aVal = lv_cmake(aReal, aImag);
+ float bReal = (float)*b8Ptr++;
+ float bImag = (float)*b8Ptr++;
+ lv_32fc_t bVal = lv_cmake(bReal, -bImag);
+ lv_32fc_t temp = aVal * bVal;
+
+ *c16Ptr++ = (int16_t)lv_creal(temp);
+ *c16Ptr++ = (int16_t)lv_cimag(temp);
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
/*!
- \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
- \param cVector The complex vector where the results will be stored
- \param aVector One of the complex vectors to be multiplied
- \param bVector The complex vector which will be converted to complex conjugate and multiplied
- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ \brief Multiplies one complex vector by the complex conjugate of a second complex
+ vector and stores the results in a third vector
+ \param cVector The complex vector where the results will be stored
+ \param aVector One of the complex vectors to be multiplied
+ \param bVector The complex vector which will be conjugated and multiplied
+ \param num_points The number of complex values in aVector and bVector to be
+ multiplied together and stored into cVector
*/
-static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- __m128i x, y, realz, imagz;
- lv_16sc_t* c = cVector;
- const lv_8sc_t* a = aVector;
- const lv_8sc_t* b = bVector;
- __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
-
- for(;number < quarterPoints; number++){
- // Convert into 8 bit values into 16 bit values
- x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
- y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
-
- // Calculate the ar*cr - ai*(-ci) portions
- realz = _mm_madd_epi16(x,y);
-
- // Calculate the complex conjugate of the cr + ci j values
- y = _mm_sign_epi16(y, conjugateSign);
-
- // Shift the order of the cr and ci values
- y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1));
-
- // Calculate the ar*(-ci) + cr*(ai)
- imagz = _mm_madd_epi16(x,y);
-
- _mm_store_si128((__m128i*)c, _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), _mm_unpackhi_epi32(realz, imagz)));
-
- a += 4;
- b += 4;
- c += 4;
- }
-
- number = quarterPoints * 4;
- int16_t* c16Ptr = (int16_t*)&cVector[number];
- int8_t* a8Ptr = (int8_t*)&aVector[number];
- int8_t* b8Ptr = (int8_t*)&bVector[number];
- for(; number < num_points; number++){
- float aReal = (float)*a8Ptr++;
- float aImag = (float)*a8Ptr++;
- lv_32fc_t aVal = lv_cmake(aReal, aImag );
- float bReal = (float)*b8Ptr++;
- float bImag = (float)*b8Ptr++;
- lv_32fc_t bVal = lv_cmake( bReal, -bImag );
- lv_32fc_t temp = aVal * bVal;
-
- *c16Ptr++ = (int16_t)lv_creal(temp);
- *c16Ptr++ = (int16_t)lv_cimag(temp);
- }
+static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVector,
+ const lv_8sc_t* aVector,
+ const lv_8sc_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128i x, y, realz, imagz;
+ lv_16sc_t* c = cVector;
+ const lv_8sc_t* a = aVector;
+ const lv_8sc_t* b = bVector;
+ __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
+
+ for (; number < quarterPoints; number++) {
+ // Convert the 8 bit values into 16 bit values
+ x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
+ y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
+
+ // Calculate the ar*cr - ai*(-ci) portions
+ realz = _mm_madd_epi16(x, y);
+
+ // Calculate the complex conjugate of the cr + ci j values
+ y = _mm_sign_epi16(y, conjugateSign);
+
+ // Shift the order of the cr and ci values
+ y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
+ _MM_SHUFFLE(2, 3, 0, 1));
+
+ // Calculate the ar*(-ci) + cr*(ai)
+ imagz = _mm_madd_epi16(x, y);
+
+ _mm_store_si128((__m128i*)c,
+ _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz),
+ _mm_unpackhi_epi32(realz, imagz)));
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+ number = quarterPoints * 4;
+ int16_t* c16Ptr = (int16_t*)&cVector[number];
+ int8_t* a8Ptr = (int8_t*)&aVector[number];
+ int8_t* b8Ptr = (int8_t*)&bVector[number];
+ for (; number < num_points; number++) {
+ float aReal = (float)*a8Ptr++;
+ float aImag = (float)*a8Ptr++;
+ lv_32fc_t aVal = lv_cmake(aReal, aImag);
+ float bReal = (float)*b8Ptr++;
+ float bImag = (float)*b8Ptr++;
+ lv_32fc_t bVal = lv_cmake(bReal, -bImag);
+ lv_32fc_t temp = aVal * bVal;
+
+ *c16Ptr++ = (int16_t)lv_creal(temp);
+ *c16Ptr++ = (int16_t)lv_cimag(temp);
+ }
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_GENERIC
/*!
- \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
- \param cVector The complex vector where the results will be stored
- \param aVector One of the complex vectors to be multiplied
- \param bVector The complex vector which will be converted to complex conjugate and multiplied
- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ \brief Multiplies one complex vector by the complex conjugate of a second complex
+ vector and stores the results in a third vector
+ \param cVector The complex vector where the results will be stored
+ \param aVector One of the complex vectors to be multiplied
+ \param bVector The complex vector which will be converted to complex conjugate and
+ multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied
+ together and stored into cVector
*/
-static inline void volk_8ic_x2_multiply_conjugate_16ic_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
- unsigned int number = 0;
- int16_t* c16Ptr = (int16_t*)cVector;
- int8_t* a8Ptr = (int8_t*)aVector;
- int8_t* b8Ptr = (int8_t*)bVector;
- for(number =0; number < num_points; number++){
- float aReal = (float)*a8Ptr++;
- float aImag = (float)*a8Ptr++;
- lv_32fc_t aVal = lv_cmake(aReal, aImag );
- float bReal = (float)*b8Ptr++;
- float bImag = (float)*b8Ptr++;
- lv_32fc_t bVal = lv_cmake( bReal, -bImag );
- lv_32fc_t temp = aVal * bVal;
-
- *c16Ptr++ = (int16_t)lv_creal(temp);
- *c16Ptr++ = (int16_t)lv_cimag(temp);
- }
+static inline void volk_8ic_x2_multiply_conjugate_16ic_generic(lv_16sc_t* cVector,
+ const lv_8sc_t* aVector,
+ const lv_8sc_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ int16_t* c16Ptr = (int16_t*)cVector;
+ int8_t* a8Ptr = (int8_t*)aVector;
+ int8_t* b8Ptr = (int8_t*)bVector;
+ for (number = 0; number < num_points; number++) {
+ float aReal = (float)*a8Ptr++;
+ float aImag = (float)*a8Ptr++;
+ lv_32fc_t aVal = lv_cmake(aReal, aImag);
+ float bReal = (float)*b8Ptr++;
+ float bImag = (float)*b8Ptr++;
+ lv_32fc_t bVal = lv_cmake(bReal, -bImag);
+ lv_32fc_t temp = aVal * bVal;
+
+ *c16Ptr++ = (int16_t)lv_creal(temp);
+ *c16Ptr++ = (int16_t)lv_cimag(temp);
+ }
}
#endif /* LV_HAVE_GENERIC */
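For orientation, here is a minimal calling sketch for this kernel (illustration only, not
part of the patch). It assumes the dispatcher name volk_8ic_x2_multiply_conjugate_16ic
that VOLK generates from this header and uses volk_malloc/volk_get_alignment as seen
elsewhere in this patch; the buffer size is an arbitrary example value.

#include <volk/volk.h>

int main(void)
{
    const unsigned int num_points = 1024; // arbitrary example size
    const size_t alignment = volk_get_alignment();

    // volk_malloc returns buffers aligned for the widest available proto-kernel
    lv_8sc_t* a = (lv_8sc_t*)volk_malloc(num_points * sizeof(lv_8sc_t), alignment);
    lv_8sc_t* b = (lv_8sc_t*)volk_malloc(num_points * sizeof(lv_8sc_t), alignment);
    lv_16sc_t* c = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);

    /* ... fill a and b with 8-bit complex samples ... */

    // c[i] = a[i] * conj(b[i]), widened to 16-bit complex
    volk_8ic_x2_multiply_conjugate_16ic(c, a, b, num_points);

    volk_free(a);
    volk_free(b);
    volk_free(c);
    return 0;
}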
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
/*!
- \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
- \param cVector The complex vector where the results will be stored
- \param aVector One of the complex vectors to be multiplied
- \param bVector The complex vector which will be converted to complex conjugate and multiplied
- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ \brief Multiplies one complex vector by the complex conjugate of a second complex
+ vector and stores the results in a third vector
+ \param cVector The complex vector where the results will be stored
+ \param aVector One of the complex vectors to be multiplied
+ \param bVector The complex vector which will be converted to complex conjugate and
+ multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied
+ together and stored into cVector
*/
-static inline void volk_8ic_x2_multiply_conjugate_16ic_u_avx2(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int oneEigthPoints = num_points / 8;
-
- __m256i x, y, realz, imagz;
- lv_16sc_t* c = cVector;
- const lv_8sc_t* a = aVector;
- const lv_8sc_t* b = bVector;
- __m256i conjugateSign = _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
-
- for(;number < oneEigthPoints; number++){
- // Convert 8 bit values into 16 bit values
- x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
- y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));
-
- // Calculate the ar*cr - ai*(-ci) portions
- realz = _mm256_madd_epi16(x,y);
-
- // Calculate the complex conjugate of the cr + ci j values
- y = _mm256_sign_epi16(y, conjugateSign);
-
- // Shift the order of the cr and ci values
- y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1));
-
- // Calculate the ar*(-ci) + cr*(ai)
- imagz = _mm256_madd_epi16(x,y);
-
- // Perform the addition of products
-
- _mm256_storeu_si256((__m256i*)c, _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz), _mm256_unpackhi_epi32(realz, imagz)));
-
- a += 8;
- b += 8;
- c += 8;
- }
-
- number = oneEigthPoints * 8;
- int16_t* c16Ptr = (int16_t*)&cVector[number];
- int8_t* a8Ptr = (int8_t*)&aVector[number];
- int8_t* b8Ptr = (int8_t*)&bVector[number];
- for(; number < num_points; number++){
- float aReal = (float)*a8Ptr++;
- float aImag = (float)*a8Ptr++;
- lv_32fc_t aVal = lv_cmake(aReal, aImag );
- float bReal = (float)*b8Ptr++;
- float bImag = (float)*b8Ptr++;
- lv_32fc_t bVal = lv_cmake( bReal, -bImag );
- lv_32fc_t temp = aVal * bVal;
-
- *c16Ptr++ = (int16_t)lv_creal(temp);
- *c16Ptr++ = (int16_t)lv_cimag(temp);
- }
+static inline void volk_8ic_x2_multiply_conjugate_16ic_u_avx2(lv_16sc_t* cVector,
+ const lv_8sc_t* aVector,
+ const lv_8sc_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int oneEigthPoints = num_points / 8;
+
+ __m256i x, y, realz, imagz;
+ lv_16sc_t* c = cVector;
+ const lv_8sc_t* a = aVector;
+ const lv_8sc_t* b = bVector;
+ __m256i conjugateSign =
+ _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
+
+ for (; number < oneEigthPoints; number++) {
+ // Convert 8 bit values into 16 bit values
+ x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
+ y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));
+
+ // Calculate the ar*cr - ai*(-ci) portions
+ realz = _mm256_madd_epi16(x, y);
+
+ // Calculate the complex conjugate of the cr + ci j values
+ y = _mm256_sign_epi16(y, conjugateSign);
+
+ // Shift the order of the cr and ci values
+ y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
+ _MM_SHUFFLE(2, 3, 0, 1));
+
+ // Calculate the ar*(-ci) + cr*(ai)
+ imagz = _mm256_madd_epi16(x, y);
+
+ // Pack the 32 bit multiply-add results back into 16 bit values and store them
+
+ _mm256_storeu_si256((__m256i*)c,
+ _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz),
+ _mm256_unpackhi_epi32(realz, imagz)));
+
+ a += 8;
+ b += 8;
+ c += 8;
+ }
+
+ number = oneEigthPoints * 8;
+ int16_t* c16Ptr = (int16_t*)&cVector[number];
+ int8_t* a8Ptr = (int8_t*)&aVector[number];
+ int8_t* b8Ptr = (int8_t*)&bVector[number];
+ for (; number < num_points; number++) {
+ float aReal = (float)*a8Ptr++;
+ float aImag = (float)*a8Ptr++;
+ lv_32fc_t aVal = lv_cmake(aReal, aImag);
+ float bReal = (float)*b8Ptr++;
+ float bImag = (float)*b8Ptr++;
+ lv_32fc_t bVal = lv_cmake(bReal, -bImag);
+ lv_32fc_t temp = aVal * bVal;
+
+ *c16Ptr++ = (int16_t)lv_creal(temp);
+ *c16Ptr++ = (int16_t)lv_cimag(temp);
+ }
}
#endif /* LV_HAVE_AVX2 */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_8ic_x2_s32f_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points)
- * \endcode
+ * void volk_8ic_x2_s32f_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_8sc_t* aVector,
+ *      const lv_8sc_t* bVector, const float scalar, unsigned int num_points)
+ * \endcode
*
* \b Inputs
* \li aVector: One of the complex vectors to be multiplied.
- * \li bVector: The complex vector which will be converted to complex conjugate and multiplied.
- * \li scalar: each output value is scaled by 1/scalar.
- * \li num_points: The number of complex values in aVector and bVector to be multiplied together and stored into cVector.
+ * \li bVector: The complex vector which will be converted to complex conjugate and
+ *     multiplied.
+ * \li scalar: each output value is scaled by 1/scalar.
+ * \li num_points: The number of complex values in aVector and bVector to be multiplied
+ *     together and stored into cVector.
*
* \b Outputs
* \li cVector: The complex vector where the results will be stored.
#include <immintrin.h>
static inline void
-volk_8ic_x2_s32f_multiply_conjugate_32fc_a_avx2(lv_32fc_t* cVector, const lv_8sc_t* aVector,
- const lv_8sc_t* bVector, const float scalar,
- unsigned int num_points)
+volk_8ic_x2_s32f_multiply_conjugate_32fc_a_avx2(lv_32fc_t* cVector,
+ const lv_8sc_t* aVector,
+ const lv_8sc_t* bVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int oneEigthPoints = num_points / 8;
-
- __m256i x, y, realz, imagz;
- __m256 ret, retlo, rethi;
- lv_32fc_t* c = cVector;
- const lv_8sc_t* a = aVector;
- const lv_8sc_t* b = bVector;
- __m256i conjugateSign = _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
-
- __m256 invScalar = _mm256_set1_ps(1.0/scalar);
-
- for(;number < oneEigthPoints; number++){
- // Convert 8 bit values into 16 bit values
- x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
- y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));
-
- // Calculate the ar*cr - ai*(-ci) portions
- realz = _mm256_madd_epi16(x,y);
-
- // Calculate the complex conjugate of the cr + ci j values
- y = _mm256_sign_epi16(y, conjugateSign);
-
- // Shift the order of the cr and ci values
- y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1));
-
- // Calculate the ar*(-ci) + cr*(ai)
- imagz = _mm256_madd_epi16(x,y);
-
- // Interleave real and imaginary and then convert to float values
- retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
-
- // Normalize the floating point values
- retlo = _mm256_mul_ps(retlo, invScalar);
-
- // Interleave real and imaginary and then convert to float values
- rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
-
- // Normalize the floating point values
- rethi = _mm256_mul_ps(rethi, invScalar);
-
- ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
- _mm256_store_ps((float*)c, ret);
- c += 4;
-
- ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
- _mm256_store_ps((float*)c, ret);
- c += 4;
-
- a += 8;
- b += 8;
- }
-
- number = oneEigthPoints * 8;
- float* cFloatPtr = (float*)&cVector[number];
- int8_t* a8Ptr = (int8_t*)&aVector[number];
- int8_t* b8Ptr = (int8_t*)&bVector[number];
- for(; number < num_points; number++){
- float aReal = (float)*a8Ptr++;
- float aImag = (float)*a8Ptr++;
- lv_32fc_t aVal = lv_cmake(aReal, aImag );
- float bReal = (float)*b8Ptr++;
- float bImag = (float)*b8Ptr++;
- lv_32fc_t bVal = lv_cmake( bReal, -bImag );
- lv_32fc_t temp = aVal * bVal;
-
- *cFloatPtr++ = lv_creal(temp) / scalar;
- *cFloatPtr++ = lv_cimag(temp) / scalar;
- }
+ unsigned int number = 0;
+ const unsigned int oneEigthPoints = num_points / 8;
+
+ __m256i x, y, realz, imagz;
+ __m256 ret, retlo, rethi;
+ lv_32fc_t* c = cVector;
+ const lv_8sc_t* a = aVector;
+ const lv_8sc_t* b = bVector;
+ __m256i conjugateSign =
+ _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
+
+ __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
+
+ for (; number < oneEigthPoints; number++) {
+ // Convert 8 bit values into 16 bit values
+ x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
+ y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));
+
+ // Calculate the ar*cr - ai*(-ci) portions
+ realz = _mm256_madd_epi16(x, y);
+
+ // Calculate the complex conjugate of the cr + ci j values
+ y = _mm256_sign_epi16(y, conjugateSign);
+
+ // Shift the order of the cr and ci values
+ y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
+ _MM_SHUFFLE(2, 3, 0, 1));
+
+ // Calculate the ar*(-ci) + cr*(ai)
+ imagz = _mm256_madd_epi16(x, y);
+
+ // Interleave real and imaginary and then convert to float values
+ retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
+
+ // Normalize the floating point values
+ retlo = _mm256_mul_ps(retlo, invScalar);
+
+ // Interleave real and imaginary and then convert to float values
+ rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
+
+ // Normalize the floating point values
+ rethi = _mm256_mul_ps(rethi, invScalar);
+
+ ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
+ _mm256_store_ps((float*)c, ret);
+ c += 4;
+
+ ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
+ _mm256_store_ps((float*)c, ret);
+ c += 4;
+
+ a += 8;
+ b += 8;
+ }
+
+ number = oneEigthPoints * 8;
+ float* cFloatPtr = (float*)&cVector[number];
+ int8_t* a8Ptr = (int8_t*)&aVector[number];
+ int8_t* b8Ptr = (int8_t*)&bVector[number];
+ for (; number < num_points; number++) {
+ float aReal = (float)*a8Ptr++;
+ float aImag = (float)*a8Ptr++;
+ lv_32fc_t aVal = lv_cmake(aReal, aImag);
+ float bReal = (float)*b8Ptr++;
+ float bImag = (float)*b8Ptr++;
+ lv_32fc_t bVal = lv_cmake(bReal, -bImag);
+ lv_32fc_t temp = aVal * bVal;
+
+ *cFloatPtr++ = lv_creal(temp) / scalar;
+ *cFloatPtr++ = lv_cimag(temp) / scalar;
+ }
}
-#endif /* LV_HAVE_AVX2*/
+#endif /* LV_HAVE_AVX2*/
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
static inline void
-volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector,
- const lv_8sc_t* bVector, const float scalar,
+volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector,
+ const lv_8sc_t* aVector,
+ const lv_8sc_t* bVector,
+ const float scalar,
unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- __m128i x, y, realz, imagz;
- __m128 ret;
- lv_32fc_t* c = cVector;
- const lv_8sc_t* a = aVector;
- const lv_8sc_t* b = bVector;
- __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
-
- __m128 invScalar = _mm_set_ps1(1.0/scalar);
-
- for(;number < quarterPoints; number++){
- // Convert into 8 bit values into 16 bit values
- x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
- y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
-
- // Calculate the ar*cr - ai*(-ci) portions
- realz = _mm_madd_epi16(x,y);
-
- // Calculate the complex conjugate of the cr + ci j values
- y = _mm_sign_epi16(y, conjugateSign);
-
- // Shift the order of the cr and ci values
- y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1));
-
- // Calculate the ar*(-ci) + cr*(ai)
- imagz = _mm_madd_epi16(x,y);
-
- // Interleave real and imaginary and then convert to float values
- ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz));
-
- // Normalize the floating point values
- ret = _mm_mul_ps(ret, invScalar);
-
- // Store the floating point values
- _mm_store_ps((float*)c, ret);
- c += 2;
-
- // Interleave real and imaginary and then convert to float values
- ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz));
-
- // Normalize the floating point values
- ret = _mm_mul_ps(ret, invScalar);
-
- // Store the floating point values
- _mm_store_ps((float*)c, ret);
- c += 2;
-
- a += 4;
- b += 4;
- }
-
- number = quarterPoints * 4;
- float* cFloatPtr = (float*)&cVector[number];
- int8_t* a8Ptr = (int8_t*)&aVector[number];
- int8_t* b8Ptr = (int8_t*)&bVector[number];
- for(; number < num_points; number++){
- float aReal = (float)*a8Ptr++;
- float aImag = (float)*a8Ptr++;
- lv_32fc_t aVal = lv_cmake(aReal, aImag );
- float bReal = (float)*b8Ptr++;
- float bImag = (float)*b8Ptr++;
- lv_32fc_t bVal = lv_cmake( bReal, -bImag );
- lv_32fc_t temp = aVal * bVal;
-
- *cFloatPtr++ = lv_creal(temp) / scalar;
- *cFloatPtr++ = lv_cimag(temp) / scalar;
- }
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128i x, y, realz, imagz;
+ __m128 ret;
+ lv_32fc_t* c = cVector;
+ const lv_8sc_t* a = aVector;
+ const lv_8sc_t* b = bVector;
+ __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
+
+ __m128 invScalar = _mm_set_ps1(1.0 / scalar);
+
+ for (; number < quarterPoints; number++) {
+ // Convert the 8 bit values into 16 bit values
+ x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
+ y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
+
+ // Calculate the ar*cr - ai*(-ci) portions
+ realz = _mm_madd_epi16(x, y);
+
+ // Calculate the complex conjugate of the cr + ci j values
+ y = _mm_sign_epi16(y, conjugateSign);
+
+ // Shift the order of the cr and ci values
+ y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
+ _MM_SHUFFLE(2, 3, 0, 1));
+
+ // Calculate the ar*(-ci) + cr*(ai)
+ imagz = _mm_madd_epi16(x, y);
+
+ // Interleave real and imaginary and then convert to float values
+ ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz));
+
+ // Normalize the floating point values
+ ret = _mm_mul_ps(ret, invScalar);
+
+ // Store the floating point values
+ _mm_store_ps((float*)c, ret);
+ c += 2;
+
+ // Interleave real and imaginary and then convert to float values
+ ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz));
+
+ // Normalize the floating point values
+ ret = _mm_mul_ps(ret, invScalar);
+
+ // Store the floating point values
+ _mm_store_ps((float*)c, ret);
+ c += 2;
+
+ a += 4;
+ b += 4;
+ }
+
+ number = quarterPoints * 4;
+ float* cFloatPtr = (float*)&cVector[number];
+ int8_t* a8Ptr = (int8_t*)&aVector[number];
+ int8_t* b8Ptr = (int8_t*)&bVector[number];
+ for (; number < num_points; number++) {
+ float aReal = (float)*a8Ptr++;
+ float aImag = (float)*a8Ptr++;
+ lv_32fc_t aVal = lv_cmake(aReal, aImag);
+ float bReal = (float)*b8Ptr++;
+ float bImag = (float)*b8Ptr++;
+ lv_32fc_t bVal = lv_cmake(bReal, -bImag);
+ lv_32fc_t temp = aVal * bVal;
+
+ *cFloatPtr++ = lv_creal(temp) / scalar;
+ *cFloatPtr++ = lv_cimag(temp) / scalar;
+ }
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_GENERIC
static inline void
-volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector,
- const lv_8sc_t* bVector, const float scalar,
+volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t* cVector,
+ const lv_8sc_t* aVector,
+ const lv_8sc_t* bVector,
+ const float scalar,
unsigned int num_points)
{
- unsigned int number = 0;
- float* cPtr = (float*)cVector;
- const float invScalar = 1.0 / scalar;
- int8_t* a8Ptr = (int8_t*)aVector;
- int8_t* b8Ptr = (int8_t*)bVector;
- for(number = 0; number < num_points; number++){
- float aReal = (float)*a8Ptr++;
- float aImag = (float)*a8Ptr++;
- lv_32fc_t aVal = lv_cmake(aReal, aImag );
- float bReal = (float)*b8Ptr++;
- float bImag = (float)*b8Ptr++;
- lv_32fc_t bVal = lv_cmake( bReal, -bImag );
- lv_32fc_t temp = aVal * bVal;
-
- *cPtr++ = (lv_creal(temp) * invScalar);
- *cPtr++ = (lv_cimag(temp) * invScalar);
- }
+ unsigned int number = 0;
+ float* cPtr = (float*)cVector;
+ const float invScalar = 1.0 / scalar;
+ int8_t* a8Ptr = (int8_t*)aVector;
+ int8_t* b8Ptr = (int8_t*)bVector;
+ for (number = 0; number < num_points; number++) {
+ float aReal = (float)*a8Ptr++;
+ float aImag = (float)*a8Ptr++;
+ lv_32fc_t aVal = lv_cmake(aReal, aImag);
+ float bReal = (float)*b8Ptr++;
+ float bImag = (float)*b8Ptr++;
+ lv_32fc_t bVal = lv_cmake(bReal, -bImag);
+ lv_32fc_t temp = aVal * bVal;
+
+ *cPtr++ = (lv_creal(temp) * invScalar);
+ *cPtr++ = (lv_cimag(temp) * invScalar);
+ }
}
#endif /* LV_HAVE_GENERIC */
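As above, a hedged usage sketch for the scaled 32fc variant (illustration only, not part
of the patch). The dispatcher prototype is the one quoted in the documentation block
earlier; the scale factor 128.0f is an arbitrary example value, chosen because each
output is divided by scalar.

#include <volk/volk.h>

void scaled_conjugate_multiply(lv_32fc_t* c,
                               const lv_8sc_t* a,
                               const lv_8sc_t* b,
                               unsigned int num_points)
{
    // Each output is c[i] = (a[i] * conj(b[i])) * (1.0f / 128.0f)
    volk_8ic_x2_s32f_multiply_conjugate_32fc(c, a, b, 128.0f, num_points);
}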
#include <immintrin.h>
static inline void
-volk_8ic_x2_s32f_multiply_conjugate_32fc_u_avx2(lv_32fc_t* cVector, const lv_8sc_t* aVector,
- const lv_8sc_t* bVector, const float scalar,
- unsigned int num_points)
+volk_8ic_x2_s32f_multiply_conjugate_32fc_u_avx2(lv_32fc_t* cVector,
+ const lv_8sc_t* aVector,
+ const lv_8sc_t* bVector,
+ const float scalar,
+ unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int oneEigthPoints = num_points / 8;
-
- __m256i x, y, realz, imagz;
- __m256 ret, retlo, rethi;
- lv_32fc_t* c = cVector;
- const lv_8sc_t* a = aVector;
- const lv_8sc_t* b = bVector;
- __m256i conjugateSign = _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
-
- __m256 invScalar = _mm256_set1_ps(1.0/scalar);
-
- for(;number < oneEigthPoints; number++){
- // Convert 8 bit values into 16 bit values
- x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
- y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));
-
- // Calculate the ar*cr - ai*(-ci) portions
- realz = _mm256_madd_epi16(x,y);
-
- // Calculate the complex conjugate of the cr + ci j values
- y = _mm256_sign_epi16(y, conjugateSign);
-
- // Shift the order of the cr and ci values
- y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1));
-
- // Calculate the ar*(-ci) + cr*(ai)
- imagz = _mm256_madd_epi16(x,y);
-
- // Interleave real and imaginary and then convert to float values
- retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
-
- // Normalize the floating point values
- retlo = _mm256_mul_ps(retlo, invScalar);
-
- // Interleave real and imaginary and then convert to float values
- rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
-
- // Normalize the floating point values
- rethi = _mm256_mul_ps(rethi, invScalar);
-
- ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
- _mm256_storeu_ps((float*)c, ret);
- c += 4;
-
- ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
- _mm256_storeu_ps((float*)c, ret);
- c += 4;
-
- a += 8;
- b += 8;
- }
-
- number = oneEigthPoints * 8;
- float* cFloatPtr = (float*)&cVector[number];
- int8_t* a8Ptr = (int8_t*)&aVector[number];
- int8_t* b8Ptr = (int8_t*)&bVector[number];
- for(; number < num_points; number++){
- float aReal = (float)*a8Ptr++;
- float aImag = (float)*a8Ptr++;
- lv_32fc_t aVal = lv_cmake(aReal, aImag );
- float bReal = (float)*b8Ptr++;
- float bImag = (float)*b8Ptr++;
- lv_32fc_t bVal = lv_cmake( bReal, -bImag );
- lv_32fc_t temp = aVal * bVal;
-
- *cFloatPtr++ = lv_creal(temp) / scalar;
- *cFloatPtr++ = lv_cimag(temp) / scalar;
- }
+ unsigned int number = 0;
+ const unsigned int oneEigthPoints = num_points / 8;
+
+ __m256i x, y, realz, imagz;
+ __m256 ret, retlo, rethi;
+ lv_32fc_t* c = cVector;
+ const lv_8sc_t* a = aVector;
+ const lv_8sc_t* b = bVector;
+ __m256i conjugateSign =
+ _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
+
+ __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
+
+ for (; number < oneEigthPoints; number++) {
+ // Convert 8 bit values into 16 bit values
+ x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
+ y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));
+
+ // Calculate the ar*cr - ai*(-ci) portions
+ realz = _mm256_madd_epi16(x, y);
+
+ // Calculate the complex conjugate of the cr + ci j values
+ y = _mm256_sign_epi16(y, conjugateSign);
+
+ // Shift the order of the cr and ci values
+ y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
+ _MM_SHUFFLE(2, 3, 0, 1));
+
+ // Calculate the ar*(-ci) + cr*(ai)
+ imagz = _mm256_madd_epi16(x, y);
+
+ // Interleave real and imaginary and then convert to float values
+ retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
+
+ // Normalize the floating point values
+ retlo = _mm256_mul_ps(retlo, invScalar);
+
+ // Interleave real and imaginary and then convert to float values
+ rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
+
+ // Normalize the floating point values
+ rethi = _mm256_mul_ps(rethi, invScalar);
+
+ ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
+ _mm256_storeu_ps((float*)c, ret);
+ c += 4;
+
+ ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
+ _mm256_storeu_ps((float*)c, ret);
+ c += 4;
+
+ a += 8;
+ b += 8;
+ }
+
+ number = oneEigthPoints * 8;
+ float* cFloatPtr = (float*)&cVector[number];
+ int8_t* a8Ptr = (int8_t*)&aVector[number];
+ int8_t* b8Ptr = (int8_t*)&bVector[number];
+ for (; number < num_points; number++) {
+ float aReal = (float)*a8Ptr++;
+ float aImag = (float)*a8Ptr++;
+ lv_32fc_t aVal = lv_cmake(aReal, aImag);
+ float bReal = (float)*b8Ptr++;
+ float bImag = (float)*b8Ptr++;
+ lv_32fc_t bVal = lv_cmake(bReal, -bImag);
+ lv_32fc_t temp = aVal * bVal;
+
+ *cFloatPtr++ = lv_creal(temp) / scalar;
+ *cFloatPtr++ = lv_cimag(temp) / scalar;
+ }
}
-#endif /* LV_HAVE_AVX2*/
+#endif /* LV_HAVE_AVX2*/
#endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H */
#ifndef INCLUDED_volk_8u_conv_k7_r2puppet_8u_H
#define INCLUDED_volk_8u_conv_k7_r2puppet_8u_H
+#include <string.h>
#include <volk/volk.h>
#include <volk/volk_8u_x4_conv_k7_r2_8u.h>
-#include <string.h>
typedef union {
- //decision_t is a BIT vector
- unsigned char* t;
- unsigned int* w;
+ // decision_t is a BIT vector
+ unsigned char* t;
+ unsigned int* w;
} p_decision_t;
static inline int parity(int x, unsigned char* Partab)
{
- x ^= (x >> 16);
- x ^= (x >> 8);
- return Partab[x];
+ x ^= (x >> 16);
+ x ^= (x >> 8);
+ return Partab[x];
}
static inline int chainback_viterbi(unsigned char* data,
                                    unsigned int nbits,
                                    unsigned int endstate,
                                    unsigned int tailsize,
                                    unsigned char* decisions)
{
- unsigned char* d;
- int d_ADDSHIFT = 0;
- int d_numstates = (1 << 6);
- int d_decision_t_size = d_numstates/8;
- unsigned int d_k = 7;
- int d_framebits = nbits;
- /* ADDSHIFT and SUBSHIFT make sure that the thing returned is a byte. */
- d = decisions;
- /* Make room beyond the end of the encoder register so we can
- * accumulate a full byte of decoded data
- */
-
- endstate = (endstate%d_numstates) << d_ADDSHIFT;
-
- /* The store into data[] only needs to be done every 8 bits.
- * But this avoids a conditional branch, and the writes will
- * combine in the cache anyway
- */
-
- d += tailsize * d_decision_t_size ; /* Look past tail */
- int retval;
- int dif = tailsize - (d_k - 1);
- //printf("break, %d, %d\n", dif, (nbits+dif)%d_framebits);
- p_decision_t dec;
- while(nbits-- > d_framebits - (d_k - 1)) {
- int k;
- dec.t = &d[nbits * d_decision_t_size];
- k = (dec.w[(endstate>>d_ADDSHIFT)/32] >> ((endstate>>d_ADDSHIFT)%32)) & 1;
-
- endstate = (endstate >> 1) | (k << (d_k-2+d_ADDSHIFT));
- //data[((nbits+dif)%nbits)>>3] = endstate>>d_SUBSHIFT;
- //printf("%d, %d\n", k, (nbits+dif)%d_framebits);
- data[((nbits+dif)%d_framebits)] = k;
-
- retval = endstate;
- }
- nbits += 1;
-
- while(nbits-- != 0) {
- int k;
-
- dec.t = &d[nbits * d_decision_t_size];
-
- k = (dec.w[(endstate>>d_ADDSHIFT)/32] >> ((endstate>>d_ADDSHIFT)%32)) & 1;
-
- endstate = (endstate >> 1) | (k << (d_k-2+d_ADDSHIFT));
- data[((nbits+dif)%d_framebits)] = k;
- }
- //printf("%d, %d, %d, %d, %d, %d, %d, %d\n", data[4095],data[4094],data[4093],data[4092],data[4091],data[4090],data[4089],data[4088]);
-
-
- return retval >> d_ADDSHIFT;
+ unsigned char* d;
+ int d_ADDSHIFT = 0;
+ int d_numstates = (1 << 6);
+ int d_decision_t_size = d_numstates / 8;
+ unsigned int d_k = 7;
+ int d_framebits = nbits;
+ /* ADDSHIFT and SUBSHIFT make sure that the thing returned is a byte. */
+ d = decisions;
+ /* Make room beyond the end of the encoder register so we can
+ * accumulate a full byte of decoded data
+ */
+
+ endstate = (endstate % d_numstates) << d_ADDSHIFT;
+
+ /* The store into data[] only needs to be done every 8 bits.
+ * But this avoids a conditional branch, and the writes will
+ * combine in the cache anyway
+ */
+
+ d += tailsize * d_decision_t_size; /* Look past tail */
+ int retval;
+ int dif = tailsize - (d_k - 1);
+ // printf("break, %d, %d\n", dif, (nbits+dif)%d_framebits);
+ p_decision_t dec;
+ while (nbits-- > d_framebits - (d_k - 1)) {
+ int k;
+ dec.t = &d[nbits * d_decision_t_size];
+ k = (dec.w[(endstate >> d_ADDSHIFT) / 32] >> ((endstate >> d_ADDSHIFT) % 32)) & 1;
+
+ endstate = (endstate >> 1) | (k << (d_k - 2 + d_ADDSHIFT));
+ // data[((nbits+dif)%nbits)>>3] = endstate>>d_SUBSHIFT;
+ // printf("%d, %d\n", k, (nbits+dif)%d_framebits);
+ data[((nbits + dif) % d_framebits)] = k;
+
+ retval = endstate;
+ }
+ nbits += 1;
+
+ while (nbits-- != 0) {
+ int k;
+
+ dec.t = &d[nbits * d_decision_t_size];
+
+ k = (dec.w[(endstate >> d_ADDSHIFT) / 32] >> ((endstate >> d_ADDSHIFT) % 32)) & 1;
+
+ endstate = (endstate >> 1) | (k << (d_k - 2 + d_ADDSHIFT));
+ data[((nbits + dif) % d_framebits)] = k;
+ }
+ // printf("%d, %d, %d, %d, %d, %d, %d, %d\n",
+ // data[4095],data[4094],data[4093],data[4092],data[4091],data[4090],data[4089],data[4088]);
+
+
+ return retval >> d_ADDSHIFT;
}
#if LV_HAVE_SSE3
-#include <pmmintrin.h>
#include <emmintrin.h>
-#include <xmmintrin.h>
#include <mmintrin.h>
+#include <pmmintrin.h>
#include <stdio.h>
+#include <xmmintrin.h>
-static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* syms, unsigned char* dec, unsigned int framebits) {
-
-
- static int once = 1;
- int d_numstates = (1 << 6);
- int rate = 2;
- static unsigned char* D;
- static unsigned char* Y;
- static unsigned char* X;
- static unsigned int excess = 6;
- static unsigned char* Branchtab;
- static unsigned char Partab[256];
-
- int d_polys[2] = {79, 109};
-
-
- if(once) {
-
- X = (unsigned char*)volk_malloc(2*d_numstates, volk_get_alignment());
- Y = X + d_numstates;
- Branchtab = (unsigned char*)volk_malloc(d_numstates/2*rate, volk_get_alignment());
- D = (unsigned char*)volk_malloc((d_numstates/8) * (framebits + 6), volk_get_alignment());
- int state, i;
- int cnt,ti;
-
- /* Initialize parity lookup table */
- for(i=0;i<256;i++){
- cnt = 0;
- ti = i;
- while(ti){
- if(ti & 1)
- cnt++;
- ti >>= 1;
- }
- Partab[i] = cnt & 1;
- }
- /* Initialize the branch table */
- for(state=0;state < d_numstates/2;state++){
- for(i=0; i<rate; i++){
- Branchtab[i*d_numstates/2+state] = parity((2*state) & d_polys[i], Partab) ? 255 : 0;
- }
- }
+static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* syms,
+ unsigned char* dec,
+ unsigned int framebits)
+{
- once = 0;
- }
- //unbias the old_metrics
- memset(X, 31, d_numstates);
+ static int once = 1;
+ int d_numstates = (1 << 6);
+ int rate = 2;
+ static unsigned char* D;
+ static unsigned char* Y;
+ static unsigned char* X;
+ static unsigned int excess = 6;
+ static unsigned char* Branchtab;
+ static unsigned char Partab[256];
+
+ int d_polys[2] = { 79, 109 };
+
+
+ if (once) {
+
+ X = (unsigned char*)volk_malloc(2 * d_numstates, volk_get_alignment());
+ Y = X + d_numstates;
+ Branchtab =
+ (unsigned char*)volk_malloc(d_numstates / 2 * rate, volk_get_alignment());
+ D = (unsigned char*)volk_malloc((d_numstates / 8) * (framebits + 6),
+ volk_get_alignment());
+ int state, i;
+ int cnt, ti;
+
+ /* Initialize parity lookup table */
+ for (i = 0; i < 256; i++) {
+ cnt = 0;
+ ti = i;
+ while (ti) {
+ if (ti & 1)
+ cnt++;
+ ti >>= 1;
+ }
+ Partab[i] = cnt & 1;
+ }
+ /* Initialize the branch table */
+ for (state = 0; state < d_numstates / 2; state++) {
+ for (i = 0; i < rate; i++) {
+ Branchtab[i * d_numstates / 2 + state] =
+ parity((2 * state) & d_polys[i], Partab) ? 255 : 0;
+ }
+ }
+
+ once = 0;
+ }
+
+ // unbias the old_metrics
+ memset(X, 31, d_numstates);
- // initialize decisions
- memset(D, 0, (d_numstates/8) * (framebits + 6));
+ // initialize decisions
+ memset(D, 0, (d_numstates / 8) * (framebits + 6));
- volk_8u_x4_conv_k7_r2_8u_spiral(Y, X, syms, D, framebits/2 - excess, excess, Branchtab);
+ volk_8u_x4_conv_k7_r2_8u_spiral(
+ Y, X, syms, D, framebits / 2 - excess, excess, Branchtab);
- unsigned int min = X[0];
- int i = 0, state = 0;
- for(i = 0; i < (d_numstates); ++i) {
- if(X[i] < min) {
- min = X[i];
- state = i;
+ unsigned int min = X[0];
+ int i = 0, state = 0;
+ for (i = 0; i < (d_numstates); ++i) {
+ if (X[i] < min) {
+ min = X[i];
+ state = i;
+ }
}
- }
- chainback_viterbi(dec, framebits/2 -excess, state, excess, D);
+ chainback_viterbi(dec, framebits / 2 - excess, state, excess, D);
- return;
+ return;
}
#endif /*LV_HAVE_SSE3*/
#include <immintrin.h>
#include <stdio.h>
-static inline void volk_8u_conv_k7_r2puppet_8u_avx2(unsigned char* syms, unsigned char* dec, unsigned int framebits) {
-
-
- static int once = 1;
- int d_numstates = (1 << 6);
- int rate = 2;
- static unsigned char* D;
- static unsigned char* Y;
- static unsigned char* X;
- static unsigned int excess = 6;
- static unsigned char* Branchtab;
- static unsigned char Partab[256];
-
- int d_polys[2] = {79, 109};
-
-
- if(once) {
-
- X = (unsigned char*)volk_malloc(2*d_numstates, volk_get_alignment());
- Y = X + d_numstates;
- Branchtab = (unsigned char*)volk_malloc(d_numstates/2*rate, volk_get_alignment());
- D = (unsigned char*)volk_malloc((d_numstates/8) * (framebits + 6), volk_get_alignment());
- int state, i;
- int cnt,ti;
-
- /* Initialize parity lookup table */
- for(i=0;i<256;i++){
- cnt = 0;
- ti = i;
- while(ti){
- if(ti & 1)
- cnt++;
- ti >>= 1;
- }
- Partab[i] = cnt & 1;
- }
- /* Initialize the branch table */
- for(state=0;state < d_numstates/2;state++){
- for(i=0; i<rate; i++){
- Branchtab[i*d_numstates/2+state] = parity((2*state) & d_polys[i], Partab) ? 255 : 0;
- }
- }
+static inline void volk_8u_conv_k7_r2puppet_8u_avx2(unsigned char* syms,
+ unsigned char* dec,
+ unsigned int framebits)
+{
- once = 0;
- }
- //unbias the old_metrics
- memset(X, 31, d_numstates);
+ static int once = 1;
+ int d_numstates = (1 << 6);
+ int rate = 2;
+ static unsigned char* D;
+ static unsigned char* Y;
+ static unsigned char* X;
+ static unsigned int excess = 6;
+ static unsigned char* Branchtab;
+ static unsigned char Partab[256];
+
+ int d_polys[2] = { 79, 109 };
+
+
+ if (once) {
+
+ X = (unsigned char*)volk_malloc(2 * d_numstates, volk_get_alignment());
+ Y = X + d_numstates;
+ Branchtab =
+ (unsigned char*)volk_malloc(d_numstates / 2 * rate, volk_get_alignment());
+ D = (unsigned char*)volk_malloc((d_numstates / 8) * (framebits + 6),
+ volk_get_alignment());
+ int state, i;
+ int cnt, ti;
+
+ /* Initialize parity lookup table */
+ for (i = 0; i < 256; i++) {
+ cnt = 0;
+ ti = i;
+ while (ti) {
+ if (ti & 1)
+ cnt++;
+ ti >>= 1;
+ }
+ Partab[i] = cnt & 1;
+ }
+ /* Initialize the branch table */
+ for (state = 0; state < d_numstates / 2; state++) {
+ for (i = 0; i < rate; i++) {
+ Branchtab[i * d_numstates / 2 + state] =
+ parity((2 * state) & d_polys[i], Partab) ? 255 : 0;
+ }
+ }
+
+ once = 0;
+ }
+
+ // unbias the old_metrics
+ memset(X, 31, d_numstates);
- // initialize decisions
- memset(D, 0, (d_numstates/8) * (framebits + 6));
+ // initialize decisions
+ memset(D, 0, (d_numstates / 8) * (framebits + 6));
- volk_8u_x4_conv_k7_r2_8u_avx2(Y, X, syms, D, framebits/2 - excess, excess, Branchtab);
+ volk_8u_x4_conv_k7_r2_8u_avx2(
+ Y, X, syms, D, framebits / 2 - excess, excess, Branchtab);
- unsigned int min = X[0];
- int i = 0, state = 0;
- for(i = 0; i < (d_numstates); ++i) {
- if(X[i] < min) {
- min = X[i];
- state = i;
+ unsigned int min = X[0];
+ int i = 0, state = 0;
+ for (i = 0; i < (d_numstates); ++i) {
+ if (X[i] < min) {
+ min = X[i];
+ state = i;
+ }
}
- }
- chainback_viterbi(dec, framebits/2 -excess, state, excess, D);
+ chainback_viterbi(dec, framebits / 2 - excess, state, excess, D);
- return;
+ return;
}
#endif /*LV_HAVE_AVX2*/
-
#if LV_HAVE_GENERIC
-static inline void volk_8u_conv_k7_r2puppet_8u_generic(unsigned char* syms, unsigned char* dec, unsigned int framebits) {
-
-
-
- static int once = 1;
- int d_numstates = (1 << 6);
- int rate = 2;
- static unsigned char* Y;
- static unsigned char* X;
- static unsigned char* D;
- static unsigned int excess = 6;
- static unsigned char* Branchtab;
- static unsigned char Partab[256];
-
- int d_polys[2] = {79, 109};
-
-
- if(once) {
-
- X = (unsigned char*)volk_malloc(2*d_numstates, volk_get_alignment());
- Y = X + d_numstates;
- Branchtab = (unsigned char*)volk_malloc(d_numstates/2*rate, volk_get_alignment());
- D = (unsigned char*)volk_malloc((d_numstates/8) * (framebits + 6), volk_get_alignment());
+static inline void volk_8u_conv_k7_r2puppet_8u_generic(unsigned char* syms,
+ unsigned char* dec,
+ unsigned int framebits)
+{
- int state, i;
- int cnt,ti;
- /* Initialize parity lookup table */
- for(i=0;i<256;i++){
- cnt = 0;
- ti = i;
- while(ti){
- if(ti & 1)
- cnt++;
- ti >>= 1;
- }
- Partab[i] = cnt & 1;
+ static int once = 1;
+ int d_numstates = (1 << 6);
+ int rate = 2;
+ static unsigned char* Y;
+ static unsigned char* X;
+ static unsigned char* D;
+ static unsigned int excess = 6;
+ static unsigned char* Branchtab;
+ static unsigned char Partab[256];
+
+ int d_polys[2] = { 79, 109 };
+
+
+ if (once) {
+
+ X = (unsigned char*)volk_malloc(2 * d_numstates, volk_get_alignment());
+ Y = X + d_numstates;
+ Branchtab =
+ (unsigned char*)volk_malloc(d_numstates / 2 * rate, volk_get_alignment());
+ D = (unsigned char*)volk_malloc((d_numstates / 8) * (framebits + 6),
+ volk_get_alignment());
+
+ int state, i;
+ int cnt, ti;
+
+ /* Initialize parity lookup table */
+ for (i = 0; i < 256; i++) {
+ cnt = 0;
+ ti = i;
+ while (ti) {
+ if (ti & 1)
+ cnt++;
+ ti >>= 1;
+ }
+ Partab[i] = cnt & 1;
+ }
+ /* Initialize the branch table */
+ for (state = 0; state < d_numstates / 2; state++) {
+ for (i = 0; i < rate; i++) {
+ Branchtab[i * d_numstates / 2 + state] =
+ parity((2 * state) & d_polys[i], Partab) ? 255 : 0;
+ }
+ }
+
+ once = 0;
}
- /* Initialize the branch table */
- for(state=0;state < d_numstates/2;state++){
- for(i=0; i<rate; i++){
- Branchtab[i*d_numstates/2+state] = parity((2*state) & d_polys[i], Partab) ? 255 : 0;
- }
- }
-
- once = 0;
- }
- //unbias the old_metrics
- memset(X, 31, d_numstates);
+ // unbias the old_metrics
+ memset(X, 31, d_numstates);
- // initialize decisions
- memset(D, 0, (d_numstates/8) * (framebits + 6));
+ // initialize decisions
+ memset(D, 0, (d_numstates / 8) * (framebits + 6));
- volk_8u_x4_conv_k7_r2_8u_generic(Y, X, syms, D, framebits/2 - excess, excess, Branchtab);
+ volk_8u_x4_conv_k7_r2_8u_generic(
+ Y, X, syms, D, framebits / 2 - excess, excess, Branchtab);
- unsigned int min = X[0];
- int i = 0, state = 0;
- for(i = 0; i < (d_numstates); ++i) {
- if(X[i] < min) {
- min = X[i];
- state = i;
+ unsigned int min = X[0];
+ int i = 0, state = 0;
+ for (i = 0; i < (d_numstates); ++i) {
+ if (X[i] < min) {
+ min = X[i];
+ state = i;
+ }
}
- }
-
- chainback_viterbi(dec, framebits/2 -excess, state, excess, D);
-
- return;
+ chainback_viterbi(dec, framebits / 2 - excess, state, excess, D);
+ return;
}
#endif /* LV_HAVE_GENERIC */
#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
#include <string.h>
-static inline unsigned int
-log2_of_power_of_2(unsigned int val){
- // algorithm from: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog
- static const unsigned int b[] = {0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0,
- 0xFF00FF00, 0xFFFF0000};
-
- unsigned int res = (val & b[0]) != 0;
- res |= ((val & b[4]) != 0) << 4;
- res |= ((val & b[3]) != 0) << 3;
- res |= ((val & b[2]) != 0) << 2;
- res |= ((val & b[1]) != 0) << 1;
- return res;
+static inline unsigned int log2_of_power_of_2(unsigned int val)
+{
+ // algorithm from: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog
+ static const unsigned int b[] = {
+ 0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00, 0xFFFF0000
+ };
+
+ unsigned int res = (val & b[0]) != 0;
+ res |= ((val & b[4]) != 0) << 4;
+ res |= ((val & b[3]) != 0) << 3;
+ res |= ((val & b[2]) != 0) << 2;
+ res |= ((val & b[1]) != 0) << 1;
+ return res;
}
-static inline void
-encodepolar_single_stage(unsigned char* frame_ptr, const unsigned char* temp_ptr,
- const unsigned int num_branches, const unsigned int frame_half)
+static inline void encodepolar_single_stage(unsigned char* frame_ptr,
+ const unsigned char* temp_ptr,
+ const unsigned int num_branches,
+ const unsigned int frame_half)
{
- unsigned int branch, bit;
- for(branch = 0; branch < num_branches; ++branch){
- for(bit = 0; bit < frame_half; ++bit){
- *frame_ptr = *temp_ptr ^ *(temp_ptr + 1);
- *(frame_ptr + frame_half) = *(temp_ptr + 1);
- ++frame_ptr;
- temp_ptr += 2;
+ unsigned int branch, bit;
+ for (branch = 0; branch < num_branches; ++branch) {
+ for (bit = 0; bit < frame_half; ++bit) {
+ *frame_ptr = *temp_ptr ^ *(temp_ptr + 1);
+ *(frame_ptr + frame_half) = *(temp_ptr + 1);
+ ++frame_ptr;
+ temp_ptr += 2;
+ }
+ frame_ptr += frame_half;
}
- frame_ptr += frame_half;
- }
}
#ifdef LV_HAVE_GENERIC
-static inline void
-volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame, unsigned char* temp,
- unsigned int frame_size)
+static inline void volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame,
+ unsigned char* temp,
+ unsigned int frame_size)
{
- unsigned int stage = log2_of_power_of_2(frame_size);
- unsigned int frame_half = frame_size >> 1;
- unsigned int num_branches = 1;
-
- while(stage){
- // encode stage
- encodepolar_single_stage(frame, temp, num_branches, frame_half);
- memcpy(temp, frame, sizeof(unsigned char) * frame_size);
-
- // update all the parameters.
- num_branches = num_branches << 1;
- frame_half = frame_half >> 1;
- --stage;
- }
+ unsigned int stage = log2_of_power_of_2(frame_size);
+ unsigned int frame_half = frame_size >> 1;
+ unsigned int num_branches = 1;
+
+ while (stage) {
+ // encode stage
+ encodepolar_single_stage(frame, temp, num_branches, frame_half);
+ memcpy(temp, frame, sizeof(unsigned char) * frame_size);
+
+ // update all the parameters.
+ num_branches = num_branches << 1;
+ frame_half = frame_half >> 1;
+ --stage;
+ }
}
#endif /* LV_HAVE_GENERIC */
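For the polar encoder kernel above, a hedged calling sketch (illustration only, not part
of the patch). It assumes the generated dispatcher name volk_8u_x2_encodeframepolar_8u;
frame_size must be a power of two, one bit is stored per byte, and temp carries the input
bits but is overwritten as scratch, as the generic implementation shows.

#include <string.h>
#include <volk/volk.h>

void polar_encode_example(unsigned char* bits, unsigned int frame_size)
{
    const size_t alignment = volk_get_alignment();
    unsigned char* frame = (unsigned char*)volk_malloc(frame_size, alignment);
    unsigned char* temp = (unsigned char*)volk_malloc(frame_size, alignment);

    memcpy(temp, bits, frame_size); // input bits, one per byte; clobbered by the kernel
    volk_8u_x2_encodeframepolar_8u(frame, temp, frame_size);
    memcpy(bits, frame, frame_size); // frame now holds the encoded bits

    volk_free(frame);
    volk_free(temp);
}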
#ifdef LV_HAVE_SSSE3
#include <tmmintrin.h>
-static inline void
-volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame, unsigned char* temp,
- unsigned int frame_size)
+static inline void volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame,
+ unsigned char* temp,
+ unsigned int frame_size)
{
- const unsigned int po2 = log2_of_power_of_2(frame_size);
-
- unsigned int stage = po2;
- unsigned char* frame_ptr = frame;
- unsigned char* temp_ptr = temp;
-
- unsigned int frame_half = frame_size >> 1;
- unsigned int num_branches = 1;
- unsigned int branch;
- unsigned int bit;
-
- // prepare constants
- const __m128i mask_stage1 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
-
- // get some SIMD registers to play with.
- __m128i r_frame0, r_temp0, shifted;
-
- {
- __m128i r_frame1, r_temp1;
- const __m128i shuffle_separate = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
-
- while(stage > 4){
- frame_ptr = frame;
- temp_ptr = temp;
-
- // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
- for(branch = 0; branch < num_branches; ++branch){
- for(bit = 0; bit < frame_half; bit += 16){
- r_temp0 = _mm_loadu_si128((__m128i *) temp_ptr);
- temp_ptr += 16;
- r_temp1 = _mm_loadu_si128((__m128i *) temp_ptr);
- temp_ptr += 16;
-
- shifted = _mm_srli_si128(r_temp0, 1);
- shifted = _mm_and_si128(shifted, mask_stage1);
- r_temp0 = _mm_xor_si128(shifted, r_temp0);
- r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
-
- shifted = _mm_srli_si128(r_temp1, 1);
- shifted = _mm_and_si128(shifted, mask_stage1);
- r_temp1 = _mm_xor_si128(shifted, r_temp1);
- r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
-
- r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
- _mm_storeu_si128((__m128i*) frame_ptr, r_frame0);
-
- r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
- _mm_storeu_si128((__m128i*) (frame_ptr + frame_half), r_frame1);
- frame_ptr += 16;
+ const unsigned int po2 = log2_of_power_of_2(frame_size);
+
+ unsigned int stage = po2;
+ unsigned char* frame_ptr = frame;
+ unsigned char* temp_ptr = temp;
+
+ unsigned int frame_half = frame_size >> 1;
+ unsigned int num_branches = 1;
+ unsigned int branch;
+ unsigned int bit;
+
+ // prepare constants
+ const __m128i mask_stage1 = _mm_set_epi8(0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF);
+
+ // get some SIMD registers to play with.
+ __m128i r_frame0, r_temp0, shifted;
+
+ {
+ __m128i r_frame1, r_temp1;
+ const __m128i shuffle_separate =
+ _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+
+ while (stage > 4) {
+ frame_ptr = frame;
+ temp_ptr = temp;
+
+ // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
+ for (branch = 0; branch < num_branches; ++branch) {
+ for (bit = 0; bit < frame_half; bit += 16) {
+ r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
+ temp_ptr += 16;
+ r_temp1 = _mm_loadu_si128((__m128i*)temp_ptr);
+ temp_ptr += 16;
+
+ shifted = _mm_srli_si128(r_temp0, 1);
+ shifted = _mm_and_si128(shifted, mask_stage1);
+ r_temp0 = _mm_xor_si128(shifted, r_temp0);
+ r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
+
+ shifted = _mm_srli_si128(r_temp1, 1);
+ shifted = _mm_and_si128(shifted, mask_stage1);
+ r_temp1 = _mm_xor_si128(shifted, r_temp1);
+ r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
+
+ r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
+ _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
+
+ r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
+ _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
+ frame_ptr += 16;
+ }
+
+ frame_ptr += frame_half;
+ }
+ memcpy(temp, frame, sizeof(unsigned char) * frame_size);
+
+ num_branches = num_branches << 1;
+ frame_half = frame_half >> 1;
+ stage--;
}
-
- frame_ptr += frame_half;
- }
- memcpy(temp, frame, sizeof(unsigned char) * frame_size);
-
- num_branches = num_branches << 1;
- frame_half = frame_half >> 1;
- stage--;
}
- }
- // This last part requires at least 16-bit frames.
- // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC!
+ // This last part requires at least 16-bit frames.
+ // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC!
- // reset pointers to correct positions.
- frame_ptr = frame;
- temp_ptr = temp;
+ // reset pointers to correct positions.
+ frame_ptr = frame;
+ temp_ptr = temp;
- // prefetch first chunk
- __VOLK_PREFETCH(temp_ptr);
-
- const __m128i shuffle_stage4 = _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
- const __m128i mask_stage4 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
- const __m128i mask_stage3 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
- const __m128i mask_stage2 = _mm_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);
-
- for(branch = 0; branch < num_branches; ++branch){
- r_temp0 = _mm_loadu_si128((__m128i*) temp_ptr);
-
- // prefetch next chunk
- temp_ptr += 16;
+ // prefetch first chunk
__VOLK_PREFETCH(temp_ptr);
- // shuffle once for bit-reversal.
- r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
-
- shifted = _mm_srli_si128(r_temp0, 8);
- shifted = _mm_and_si128(shifted, mask_stage4);
- r_frame0 = _mm_xor_si128(shifted, r_temp0);
-
- shifted = _mm_srli_si128(r_frame0, 4);
- shifted = _mm_and_si128(shifted, mask_stage3);
- r_frame0 = _mm_xor_si128(shifted, r_frame0);
-
- shifted = _mm_srli_si128(r_frame0, 2);
- shifted = _mm_and_si128(shifted, mask_stage2);
- r_frame0 = _mm_xor_si128(shifted, r_frame0);
-
- shifted = _mm_srli_si128(r_frame0, 1);
- shifted = _mm_and_si128(shifted, mask_stage1);
- r_frame0 = _mm_xor_si128(shifted, r_frame0);
-
- // store result of chunk.
- _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
- frame_ptr += 16;
- }
+ const __m128i shuffle_stage4 =
+ _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
+ const __m128i mask_stage4 = _mm_set_epi8(0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF);
+ const __m128i mask_stage3 = _mm_set_epi8(0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF);
+ const __m128i mask_stage2 = _mm_set_epi8(0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF);
+
+ for (branch = 0; branch < num_branches; ++branch) {
+ r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
+
+ // prefetch next chunk
+ temp_ptr += 16;
+ __VOLK_PREFETCH(temp_ptr);
+
+ // shuffle once for bit-reversal.
+ r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
+
+ shifted = _mm_srli_si128(r_temp0, 8);
+ shifted = _mm_and_si128(shifted, mask_stage4);
+ r_frame0 = _mm_xor_si128(shifted, r_temp0);
+
+ shifted = _mm_srli_si128(r_frame0, 4);
+ shifted = _mm_and_si128(shifted, mask_stage3);
+ r_frame0 = _mm_xor_si128(shifted, r_frame0);
+
+ shifted = _mm_srli_si128(r_frame0, 2);
+ shifted = _mm_and_si128(shifted, mask_stage2);
+ r_frame0 = _mm_xor_si128(shifted, r_frame0);
+
+ shifted = _mm_srli_si128(r_frame0, 1);
+ shifted = _mm_and_si128(shifted, mask_stage1);
+ r_frame0 = _mm_xor_si128(shifted, r_frame0);
+
+ // store result of chunk.
+ _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
+ frame_ptr += 16;
+ }
}
#endif /* LV_HAVE_SSSE3 */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame, unsigned char* temp,
- unsigned int frame_size)
+static inline void volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame,
+ unsigned char* temp,
+ unsigned int frame_size)
{
- const unsigned int po2 = log2_of_power_of_2(frame_size);
-
- unsigned int stage = po2;
- unsigned char* frame_ptr = frame;
- unsigned char* temp_ptr = temp;
-
- unsigned int frame_half = frame_size >> 1;
- unsigned int num_branches = 1;
- unsigned int branch;
- unsigned int bit;
-
- // prepare constants
- const __m256i mask_stage1 = _mm256_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
- 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
-
- const __m128i mask_stage0 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
- // get some SIMD registers to play with.
- __m256i r_frame0, r_temp0, shifted;
- __m128i r_temp2, r_frame2, shifted2;
- {
- __m256i r_frame1, r_temp1;
- __m128i r_frame3, r_temp3;
- const __m256i shuffle_separate = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
- 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
- const __m128i shuffle_separate128 = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
-
- while(stage > 4){
- frame_ptr = frame;
- temp_ptr = temp;
-
- // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
- for(branch = 0; branch < num_branches; ++branch){
- for(bit = 0; bit < frame_half; bit += 32){
- if ((frame_half-bit)<32) //if only 16 bits remaining in frame, not 32
- {
- r_temp2 = _mm_loadu_si128((__m128i *) temp_ptr);
- temp_ptr += 16;
- r_temp3 = _mm_loadu_si128((__m128i *) temp_ptr);
- temp_ptr += 16;
-
- shifted2 = _mm_srli_si128(r_temp2, 1);
- shifted2 = _mm_and_si128(shifted2, mask_stage0);
- r_temp2 = _mm_xor_si128(shifted2, r_temp2);
- r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);
-
- shifted2 = _mm_srli_si128(r_temp3, 1);
- shifted2 = _mm_and_si128(shifted2, mask_stage0);
- r_temp3 = _mm_xor_si128(shifted2, r_temp3);
- r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
-
- r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
- _mm_storeu_si128((__m128i*) frame_ptr, r_frame2);
-
- r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
- _mm_storeu_si128((__m128i*) (frame_ptr + frame_half), r_frame3);
- frame_ptr += 16;
- break;
- }
- r_temp0 = _mm256_loadu_si256((__m256i *) temp_ptr);
- temp_ptr += 32;
- r_temp1 = _mm256_loadu_si256((__m256i *) temp_ptr);
- temp_ptr += 32;
-
- shifted = _mm256_srli_si256(r_temp0, 1);//operate on 128 bit lanes
- shifted = _mm256_and_si256(shifted, mask_stage1);
- r_temp0 = _mm256_xor_si256(shifted, r_temp0);
- r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);
-
- shifted = _mm256_srli_si256(r_temp1, 1);
- shifted = _mm256_and_si256(shifted, mask_stage1);
- r_temp1 = _mm256_xor_si256(shifted, r_temp1);
- r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
-
- r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
- r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
- r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
- r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
-
- _mm256_storeu_si256((__m256i*) frame_ptr, r_frame0);
-
- _mm256_storeu_si256((__m256i*) (frame_ptr + frame_half), r_frame1);
- frame_ptr += 32;
+ const unsigned int po2 = log2_of_power_of_2(frame_size);
+
+ unsigned int stage = po2;
+ unsigned char* frame_ptr = frame;
+ unsigned char* temp_ptr = temp;
+
+ unsigned int frame_half = frame_size >> 1;
+ unsigned int num_branches = 1;
+ unsigned int branch;
+ unsigned int bit;
+
+ // prepare constants
+ const __m256i mask_stage1 = _mm256_set_epi8(0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF);
+
+ const __m128i mask_stage0 = _mm_set_epi8(0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF);
+ // get some SIMD registers to play with.
+ __m256i r_frame0, r_temp0, shifted;
+ __m128i r_temp2, r_frame2, shifted2;
+ {
+ __m256i r_frame1, r_temp1;
+ __m128i r_frame3, r_temp3;
+ const __m256i shuffle_separate = _mm256_setr_epi8(0,
+ 2,
+ 4,
+ 6,
+ 8,
+ 10,
+ 12,
+ 14,
+ 1,
+ 3,
+ 5,
+ 7,
+ 9,
+ 11,
+ 13,
+ 15,
+ 0,
+ 2,
+ 4,
+ 6,
+ 8,
+ 10,
+ 12,
+ 14,
+ 1,
+ 3,
+ 5,
+ 7,
+ 9,
+ 11,
+ 13,
+ 15);
+ const __m128i shuffle_separate128 =
+ _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+
+ while (stage > 4) {
+ frame_ptr = frame;
+ temp_ptr = temp;
+
+ // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
+ for (branch = 0; branch < num_branches; ++branch) {
+ for (bit = 0; bit < frame_half; bit += 32) {
+ // if only 16 bits remaining in frame, not 32
+ if ((frame_half - bit) < 32) {
+ r_temp2 = _mm_loadu_si128((__m128i*)temp_ptr);
+ temp_ptr += 16;
+ r_temp3 = _mm_loadu_si128((__m128i*)temp_ptr);
+ temp_ptr += 16;
+
+ shifted2 = _mm_srli_si128(r_temp2, 1);
+ shifted2 = _mm_and_si128(shifted2, mask_stage0);
+ r_temp2 = _mm_xor_si128(shifted2, r_temp2);
+ r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);
+
+ shifted2 = _mm_srli_si128(r_temp3, 1);
+ shifted2 = _mm_and_si128(shifted2, mask_stage0);
+ r_temp3 = _mm_xor_si128(shifted2, r_temp3);
+ r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
+
+ r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
+ _mm_storeu_si128((__m128i*)frame_ptr, r_frame2);
+
+ r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
+ _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
+ frame_ptr += 16;
+ break;
+ }
+ r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
+ temp_ptr += 32;
+ r_temp1 = _mm256_loadu_si256((__m256i*)temp_ptr);
+ temp_ptr += 32;
+
+ shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes
+ shifted = _mm256_and_si256(shifted, mask_stage1);
+ r_temp0 = _mm256_xor_si256(shifted, r_temp0);
+ r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);
+
+ shifted = _mm256_srli_si256(r_temp1, 1);
+ shifted = _mm256_and_si256(shifted, mask_stage1);
+ r_temp1 = _mm256_xor_si256(shifted, r_temp1);
+ r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
+
+ r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
+ r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
+ r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
+ r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
+
+ _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
+
+ _mm256_storeu_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
+ frame_ptr += 32;
+ }
+
+ frame_ptr += frame_half;
+ }
+ memcpy(temp, frame, sizeof(unsigned char) * frame_size);
+
+ num_branches = num_branches << 1;
+ frame_half = frame_half >> 1;
+ stage--;
}
-
- frame_ptr += frame_half;
- }
- memcpy(temp, frame, sizeof(unsigned char) * frame_size);
-
- num_branches = num_branches << 1;
- frame_half = frame_half >> 1;
- stage--;
}
- }
-
- // This last part requires at least 32-bit frames.
- // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC!
-
- // reset pointers to correct positions.
- frame_ptr = frame;
- temp_ptr = temp;
- // prefetch first chunk
- __VOLK_PREFETCH(temp_ptr);
+ // This last part requires at least 32-bit frames.
+ // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC!
- const __m256i shuffle_stage4 = _mm256_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
- 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
- const __m256i mask_stage4 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
- 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
- const __m256i mask_stage3 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
- 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
- const __m256i mask_stage2 = _mm256_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
- 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);
+ // reset pointers to correct positions.
+ frame_ptr = frame;
+ temp_ptr = temp;
- for(branch = 0; branch < num_branches/2; ++branch){
- r_temp0 = _mm256_loadu_si256((__m256i*) temp_ptr);
-
- // prefetch next chunk
- temp_ptr += 32;
+ // prefetch first chunk
__VOLK_PREFETCH(temp_ptr);
- // shuffle once for bit-reversal.
- r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
-
- shifted = _mm256_srli_si256(r_temp0, 8); //128 bit lanes
- shifted = _mm256_and_si256(shifted, mask_stage4);
- r_frame0 = _mm256_xor_si256(shifted, r_temp0);
-
-
- shifted = _mm256_srli_si256(r_frame0, 4);
- shifted = _mm256_and_si256(shifted, mask_stage3);
- r_frame0 = _mm256_xor_si256(shifted, r_frame0);
-
- shifted = _mm256_srli_si256(r_frame0, 2);
- shifted = _mm256_and_si256(shifted, mask_stage2);
- r_frame0 = _mm256_xor_si256(shifted, r_frame0);
-
- shifted = _mm256_srli_si256(r_frame0, 1);
- shifted = _mm256_and_si256(shifted, mask_stage1);
- r_frame0 = _mm256_xor_si256(shifted, r_frame0);
-
- // store result of chunk.
- _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
- frame_ptr += 32;
- }
+ const __m256i shuffle_stage4 = _mm256_setr_epi8(0,
+ 8,
+ 4,
+ 12,
+ 2,
+ 10,
+ 6,
+ 14,
+ 1,
+ 9,
+ 5,
+ 13,
+ 3,
+ 11,
+ 7,
+ 15,
+ 0,
+ 8,
+ 4,
+ 12,
+ 2,
+ 10,
+ 6,
+ 14,
+ 1,
+ 9,
+ 5,
+ 13,
+ 3,
+ 11,
+ 7,
+ 15);
+ const __m256i mask_stage4 = _mm256_set_epi8(0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF);
+ const __m256i mask_stage3 = _mm256_set_epi8(0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF);
+ const __m256i mask_stage2 = _mm256_set_epi8(0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF);
+
+ for (branch = 0; branch < num_branches / 2; ++branch) {
+ r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
+
+ // prefetch next chunk
+ temp_ptr += 32;
+ __VOLK_PREFETCH(temp_ptr);
+
+ // shuffle once for bit-reversal.
+ r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
+
+ shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes
+ shifted = _mm256_and_si256(shifted, mask_stage4);
+ r_frame0 = _mm256_xor_si256(shifted, r_temp0);
+
+
+ shifted = _mm256_srli_si256(r_frame0, 4);
+ shifted = _mm256_and_si256(shifted, mask_stage3);
+ r_frame0 = _mm256_xor_si256(shifted, r_frame0);
+
+ shifted = _mm256_srli_si256(r_frame0, 2);
+ shifted = _mm256_and_si256(shifted, mask_stage2);
+ r_frame0 = _mm256_xor_si256(shifted, r_frame0);
+
+ shifted = _mm256_srli_si256(r_frame0, 1);
+ shifted = _mm256_and_si256(shifted, mask_stage1);
+ r_frame0 = _mm256_xor_si256(shifted, r_frame0);
+
+ // store result of chunk.
+ _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
+ frame_ptr += 32;
+ }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSSE3
#include <tmmintrin.h>
-static inline void
-volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame, unsigned char* temp,
- unsigned int frame_size)
+static inline void volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame,
+ unsigned char* temp,
+ unsigned int frame_size)
{
- const unsigned int po2 = log2_of_power_of_2(frame_size);
-
- unsigned int stage = po2;
- unsigned char* frame_ptr = frame;
- unsigned char* temp_ptr = temp;
-
- unsigned int frame_half = frame_size >> 1;
- unsigned int num_branches = 1;
- unsigned int branch;
- unsigned int bit;
-
- // prepare constants
- const __m128i mask_stage1 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
-
- // get some SIMD registers to play with.
- __m128i r_frame0, r_temp0, shifted;
-
- {
- __m128i r_frame1, r_temp1;
- const __m128i shuffle_separate = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
-
- while(stage > 4){
- frame_ptr = frame;
- temp_ptr = temp;
-
- // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
- for(branch = 0; branch < num_branches; ++branch){
- for(bit = 0; bit < frame_half; bit += 16){
- r_temp0 = _mm_load_si128((__m128i *) temp_ptr);
- temp_ptr += 16;
- r_temp1 = _mm_load_si128((__m128i *) temp_ptr);
- temp_ptr += 16;
-
- shifted = _mm_srli_si128(r_temp0, 1);
- shifted = _mm_and_si128(shifted, mask_stage1);
- r_temp0 = _mm_xor_si128(shifted, r_temp0);
- r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
-
- shifted = _mm_srli_si128(r_temp1, 1);
- shifted = _mm_and_si128(shifted, mask_stage1);
- r_temp1 = _mm_xor_si128(shifted, r_temp1);
- r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
-
- r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
- _mm_store_si128((__m128i*) frame_ptr, r_frame0);
-
- r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
- _mm_store_si128((__m128i*) (frame_ptr + frame_half), r_frame1);
- frame_ptr += 16;
+ const unsigned int po2 = log2_of_power_of_2(frame_size);
+
+ unsigned int stage = po2;
+ unsigned char* frame_ptr = frame;
+ unsigned char* temp_ptr = temp;
+
+ unsigned int frame_half = frame_size >> 1;
+ unsigned int num_branches = 1;
+ unsigned int branch;
+ unsigned int bit;
+
+ // prepare constants
+ const __m128i mask_stage1 = _mm_set_epi8(0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF);
+
+ // get some SIMD registers to play with.
+ __m128i r_frame0, r_temp0, shifted;
+
+ {
+ __m128i r_frame1, r_temp1;
+ const __m128i shuffle_separate =
+ _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+
+ while (stage > 4) {
+ frame_ptr = frame;
+ temp_ptr = temp;
+
+ // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
+ for (branch = 0; branch < num_branches; ++branch) {
+ for (bit = 0; bit < frame_half; bit += 16) {
+ r_temp0 = _mm_load_si128((__m128i*)temp_ptr);
+ temp_ptr += 16;
+ r_temp1 = _mm_load_si128((__m128i*)temp_ptr);
+ temp_ptr += 16;
+
+ shifted = _mm_srli_si128(r_temp0, 1);
+ shifted = _mm_and_si128(shifted, mask_stage1);
+ r_temp0 = _mm_xor_si128(shifted, r_temp0);
+ r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
+
+ shifted = _mm_srli_si128(r_temp1, 1);
+ shifted = _mm_and_si128(shifted, mask_stage1);
+ r_temp1 = _mm_xor_si128(shifted, r_temp1);
+ r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
+
+ r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
+ _mm_store_si128((__m128i*)frame_ptr, r_frame0);
+
+ r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
+ _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
+ frame_ptr += 16;
+ }
+
+ frame_ptr += frame_half;
+ }
+ memcpy(temp, frame, sizeof(unsigned char) * frame_size);
+
+ num_branches = num_branches << 1;
+ frame_half = frame_half >> 1;
+ stage--;
}
-
- frame_ptr += frame_half;
- }
- memcpy(temp, frame, sizeof(unsigned char) * frame_size);
-
- num_branches = num_branches << 1;
- frame_half = frame_half >> 1;
- stage--;
}
- }
-
- // This last part requires at least 16-bit frames.
- // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC!
-
- // reset pointers to correct positions.
- frame_ptr = frame;
- temp_ptr = temp;
- // prefetch first chunk
- __VOLK_PREFETCH(temp_ptr);
+ // This last part requires at least 16-bit frames.
+ // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC!
- const __m128i shuffle_stage4 = _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
- const __m128i mask_stage4 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
- const __m128i mask_stage3 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
- const __m128i mask_stage2 = _mm_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);
+ // reset pointers to correct positions.
+ frame_ptr = frame;
+ temp_ptr = temp;
- for(branch = 0; branch < num_branches; ++branch){
- r_temp0 = _mm_load_si128((__m128i*) temp_ptr);
-
- // prefetch next chunk
- temp_ptr += 16;
+ // prefetch first chunk
__VOLK_PREFETCH(temp_ptr);
- // shuffle once for bit-reversal.
- r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
-
- shifted = _mm_srli_si128(r_temp0, 8);
- shifted = _mm_and_si128(shifted, mask_stage4);
- r_frame0 = _mm_xor_si128(shifted, r_temp0);
-
- shifted = _mm_srli_si128(r_frame0, 4);
- shifted = _mm_and_si128(shifted, mask_stage3);
- r_frame0 = _mm_xor_si128(shifted, r_frame0);
-
- shifted = _mm_srli_si128(r_frame0, 2);
- shifted = _mm_and_si128(shifted, mask_stage2);
- r_frame0 = _mm_xor_si128(shifted, r_frame0);
-
- shifted = _mm_srli_si128(r_frame0, 1);
- shifted = _mm_and_si128(shifted, mask_stage1);
- r_frame0 = _mm_xor_si128(shifted, r_frame0);
-
- // store result of chunk.
- _mm_store_si128((__m128i*)frame_ptr, r_frame0);
- frame_ptr += 16;
- }
+ const __m128i shuffle_stage4 =
+ _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
+ const __m128i mask_stage4 = _mm_set_epi8(0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF);
+ const __m128i mask_stage3 = _mm_set_epi8(0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF);
+ const __m128i mask_stage2 = _mm_set_epi8(0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF);
+
+ for (branch = 0; branch < num_branches; ++branch) {
+ r_temp0 = _mm_load_si128((__m128i*)temp_ptr);
+
+ // prefetch next chunk
+ temp_ptr += 16;
+ __VOLK_PREFETCH(temp_ptr);
+
+ // shuffle once for bit-reversal.
+ r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
+
+ shifted = _mm_srli_si128(r_temp0, 8);
+ shifted = _mm_and_si128(shifted, mask_stage4);
+ r_frame0 = _mm_xor_si128(shifted, r_temp0);
+
+ shifted = _mm_srli_si128(r_frame0, 4);
+ shifted = _mm_and_si128(shifted, mask_stage3);
+ r_frame0 = _mm_xor_si128(shifted, r_frame0);
+
+ shifted = _mm_srli_si128(r_frame0, 2);
+ shifted = _mm_and_si128(shifted, mask_stage2);
+ r_frame0 = _mm_xor_si128(shifted, r_frame0);
+
+ shifted = _mm_srli_si128(r_frame0, 1);
+ shifted = _mm_and_si128(shifted, mask_stage1);
+ r_frame0 = _mm_xor_si128(shifted, r_frame0);
+
+ // store result of chunk.
+ _mm_store_si128((__m128i*)frame_ptr, r_frame0);
+ frame_ptr += 16;
+ }
}
#endif /* LV_HAVE_SSSE3 */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
-static inline void
-volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame, unsigned char* temp,
- unsigned int frame_size)
+static inline void volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame,
+ unsigned char* temp,
+ unsigned int frame_size)
{
- const unsigned int po2 = log2_of_power_of_2(frame_size);
-
- unsigned int stage = po2;
- unsigned char* frame_ptr = frame;
- unsigned char* temp_ptr = temp;
-
- unsigned int frame_half = frame_size >> 1;
- unsigned int num_branches = 1;
- unsigned int branch;
- unsigned int bit;
-
- // prepare constants
- const __m256i mask_stage1 = _mm256_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
- 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
-
- const __m128i mask_stage0 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
- // get some SIMD registers to play with.
- __m256i r_frame0, r_temp0, shifted;
- __m128i r_temp2, r_frame2, shifted2;
- {
- __m256i r_frame1, r_temp1;
- __m128i r_frame3, r_temp3;
- const __m256i shuffle_separate = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
- 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
- const __m128i shuffle_separate128 = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
-
- while(stage > 4){
- frame_ptr = frame;
- temp_ptr = temp;
-
- // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
- for(branch = 0; branch < num_branches; ++branch){
- for(bit = 0; bit < frame_half; bit += 32){
- if ((frame_half-bit)<32) //if only 16 bits remaining in frame, not 32
- {
- r_temp2 = _mm_load_si128((__m128i *) temp_ptr);
- temp_ptr += 16;
- r_temp3 = _mm_load_si128((__m128i *) temp_ptr);
- temp_ptr += 16;
-
- shifted2 = _mm_srli_si128(r_temp2, 1);
- shifted2 = _mm_and_si128(shifted2, mask_stage0);
- r_temp2 = _mm_xor_si128(shifted2, r_temp2);
- r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);
-
- shifted2 = _mm_srli_si128(r_temp3, 1);
- shifted2 = _mm_and_si128(shifted2, mask_stage0);
- r_temp3 = _mm_xor_si128(shifted2, r_temp3);
- r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
-
- r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
- _mm_store_si128((__m128i*) frame_ptr, r_frame2);
-
- r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
- _mm_store_si128((__m128i*) (frame_ptr + frame_half), r_frame3);
- frame_ptr += 16;
- break;
- }
- r_temp0 = _mm256_load_si256((__m256i *) temp_ptr);
- temp_ptr += 32;
- r_temp1 = _mm256_load_si256((__m256i *) temp_ptr);
- temp_ptr += 32;
-
- shifted = _mm256_srli_si256(r_temp0, 1);//operate on 128 bit lanes
- shifted = _mm256_and_si256(shifted, mask_stage1);
- r_temp0 = _mm256_xor_si256(shifted, r_temp0);
- r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);
-
- shifted = _mm256_srli_si256(r_temp1, 1);
- shifted = _mm256_and_si256(shifted, mask_stage1);
- r_temp1 = _mm256_xor_si256(shifted, r_temp1);
- r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
-
- r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
- r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
- r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
- r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
-
- _mm256_store_si256((__m256i*) frame_ptr, r_frame0);
-
- _mm256_store_si256((__m256i*) (frame_ptr + frame_half), r_frame1);
- frame_ptr += 32;
+ const unsigned int po2 = log2_of_power_of_2(frame_size);
+
+ unsigned int stage = po2;
+ unsigned char* frame_ptr = frame;
+ unsigned char* temp_ptr = temp;
+
+ unsigned int frame_half = frame_size >> 1;
+ unsigned int num_branches = 1;
+ unsigned int branch;
+ unsigned int bit;
+
+ // prepare constants
+ const __m256i mask_stage1 = _mm256_set_epi8(0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF);
+
+ const __m128i mask_stage0 = _mm_set_epi8(0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF);
+ // get some SIMD registers to play with.
+ __m256i r_frame0, r_temp0, shifted;
+ __m128i r_temp2, r_frame2, shifted2;
+ {
+ __m256i r_frame1, r_temp1;
+ __m128i r_frame3, r_temp3;
+ const __m256i shuffle_separate = _mm256_setr_epi8(0,
+ 2,
+ 4,
+ 6,
+ 8,
+ 10,
+ 12,
+ 14,
+ 1,
+ 3,
+ 5,
+ 7,
+ 9,
+ 11,
+ 13,
+ 15,
+ 0,
+ 2,
+ 4,
+ 6,
+ 8,
+ 10,
+ 12,
+ 14,
+ 1,
+ 3,
+ 5,
+ 7,
+ 9,
+ 11,
+ 13,
+ 15);
+ const __m128i shuffle_separate128 =
+ _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+
+ while (stage > 4) {
+ frame_ptr = frame;
+ temp_ptr = temp;
+
+ // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
+ for (branch = 0; branch < num_branches; ++branch) {
+ for (bit = 0; bit < frame_half; bit += 32) {
+ if ((frame_half - bit) <
+ 32) // if only 16 bits remaining in frame, not 32
+ {
+ r_temp2 = _mm_load_si128((__m128i*)temp_ptr);
+ temp_ptr += 16;
+ r_temp3 = _mm_load_si128((__m128i*)temp_ptr);
+ temp_ptr += 16;
+
+ shifted2 = _mm_srli_si128(r_temp2, 1);
+ shifted2 = _mm_and_si128(shifted2, mask_stage0);
+ r_temp2 = _mm_xor_si128(shifted2, r_temp2);
+ r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);
+
+ shifted2 = _mm_srli_si128(r_temp3, 1);
+ shifted2 = _mm_and_si128(shifted2, mask_stage0);
+ r_temp3 = _mm_xor_si128(shifted2, r_temp3);
+ r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
+
+ r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
+ _mm_store_si128((__m128i*)frame_ptr, r_frame2);
+
+ r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
+ _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
+ frame_ptr += 16;
+ break;
+ }
+ r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
+ temp_ptr += 32;
+ r_temp1 = _mm256_load_si256((__m256i*)temp_ptr);
+ temp_ptr += 32;
+
+ shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes
+ shifted = _mm256_and_si256(shifted, mask_stage1);
+ r_temp0 = _mm256_xor_si256(shifted, r_temp0);
+ r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);
+
+ shifted = _mm256_srli_si256(r_temp1, 1);
+ shifted = _mm256_and_si256(shifted, mask_stage1);
+ r_temp1 = _mm256_xor_si256(shifted, r_temp1);
+ r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
+
+ r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
+ r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
+ r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
+ r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
+
+ _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
+
+ _mm256_store_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
+ frame_ptr += 32;
+ }
+
+ frame_ptr += frame_half;
+ }
+ memcpy(temp, frame, sizeof(unsigned char) * frame_size);
+
+ num_branches = num_branches << 1;
+ frame_half = frame_half >> 1;
+ stage--;
}
-
- frame_ptr += frame_half;
- }
- memcpy(temp, frame, sizeof(unsigned char) * frame_size);
-
- num_branches = num_branches << 1;
- frame_half = frame_half >> 1;
- stage--;
}
- }
-
- // This last part requires at least 32-bit frames.
- // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC!
- // reset pointers to correct positions.
- frame_ptr = frame;
- temp_ptr = temp;
+ // This last part requires at least 32-bit frames.
+ // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC!
- // prefetch first chunk.
- __VOLK_PREFETCH(temp_ptr);
+ // reset pointers to correct positions.
+ frame_ptr = frame;
+ temp_ptr = temp;
- const __m256i shuffle_stage4 = _mm256_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
- 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
- const __m256i mask_stage4 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
- 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
- const __m256i mask_stage3 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
- 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
- const __m256i mask_stage2 = _mm256_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
- 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);
-
- for(branch = 0; branch < num_branches/2; ++branch){
- r_temp0 = _mm256_load_si256((__m256i*) temp_ptr);
-
- // prefetch next chunk
- temp_ptr += 32;
+ // prefetch first chunk.
__VOLK_PREFETCH(temp_ptr);
- // shuffle once for bit-reversal.
- r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
-
- shifted = _mm256_srli_si256(r_temp0, 8); //128 bit lanes
- shifted = _mm256_and_si256(shifted, mask_stage4);
- r_frame0 = _mm256_xor_si256(shifted, r_temp0);
-
- shifted = _mm256_srli_si256(r_frame0, 4);
- shifted = _mm256_and_si256(shifted, mask_stage3);
- r_frame0 = _mm256_xor_si256(shifted, r_frame0);
-
- shifted = _mm256_srli_si256(r_frame0, 2);
- shifted = _mm256_and_si256(shifted, mask_stage2);
- r_frame0 = _mm256_xor_si256(shifted, r_frame0);
-
- shifted = _mm256_srli_si256(r_frame0, 1);
- shifted = _mm256_and_si256(shifted, mask_stage1);
- r_frame0 = _mm256_xor_si256(shifted, r_frame0);
-
- // store result of chunk.
- _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
- frame_ptr += 32;
- }
+ const __m256i shuffle_stage4 = _mm256_setr_epi8(0,
+ 8,
+ 4,
+ 12,
+ 2,
+ 10,
+ 6,
+ 14,
+ 1,
+ 9,
+ 5,
+ 13,
+ 3,
+ 11,
+ 7,
+ 15,
+ 0,
+ 8,
+ 4,
+ 12,
+ 2,
+ 10,
+ 6,
+ 14,
+ 1,
+ 9,
+ 5,
+ 13,
+ 3,
+ 11,
+ 7,
+ 15);
+ const __m256i mask_stage4 = _mm256_set_epi8(0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF);
+ const __m256i mask_stage3 = _mm256_set_epi8(0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF);
+ const __m256i mask_stage2 = _mm256_set_epi8(0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF);
+
+ for (branch = 0; branch < num_branches / 2; ++branch) {
+ r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
+
+ // prefetch next chunk
+ temp_ptr += 32;
+ __VOLK_PREFETCH(temp_ptr);
+
+ // shuffle once for bit-reversal.
+ r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
+
+ shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes
+ shifted = _mm256_and_si256(shifted, mask_stage4);
+ r_frame0 = _mm256_xor_si256(shifted, r_temp0);
+
+ shifted = _mm256_srli_si256(r_frame0, 4);
+ shifted = _mm256_and_si256(shifted, mask_stage3);
+ r_frame0 = _mm256_xor_si256(shifted, r_frame0);
+
+ shifted = _mm256_srli_si256(r_frame0, 2);
+ shifted = _mm256_and_si256(shifted, mask_stage2);
+ r_frame0 = _mm256_xor_si256(shifted, r_frame0);
+
+ shifted = _mm256_srli_si256(r_frame0, 1);
+ shifted = _mm256_and_si256(shifted, mask_stage1);
+ r_frame0 = _mm256_xor_si256(shifted, r_frame0);
+
+ // store result of chunk.
+ _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
+ frame_ptr += 32;
+ }
}
#endif /* LV_HAVE_AVX2 */
-
#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */
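For orientation (editorial note, not part of the patch): the SSSE3/AVX2 loops above vectorize one stage of the polar encode butterfly, 16 or 32 bytes at a time. The scalar sketch below only mirrors what the shift/XOR/shuffle sequences compute; the helper name is hypothetical.

static inline void encodepolar_stage_sketch(unsigned char* frame_ptr,
                                            const unsigned char* temp_ptr,
                                            unsigned int num_branches,
                                            unsigned int frame_half)
{
    /* lower half of each branch: XOR of adjacent input pairs (u ^ v),
       upper half of each branch: the second element of each pair (v) */
    unsigned int branch, bit;
    for (branch = 0; branch < num_branches; ++branch) {
        for (bit = 0; bit < frame_half; ++bit) {
            frame_ptr[bit] = temp_ptr[2 * bit] ^ temp_ptr[2 * bit + 1];
            frame_ptr[bit + frame_half] = temp_ptr[2 * bit + 1];
        }
        frame_ptr += 2 * frame_half;
        temp_ptr += 2 * frame_half;
    }
}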
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_8u_x3_encodepolar_8u(unsigned char* frame, const unsigned char* frozen_bit_mask, const unsigned char* frozen_bits,
- * const unsigned char* info_bits, unsigned int frame_size, unsigned int info_bit_size)
- * \endcode
+ * void volk_8u_x3_encodepolar_8u(unsigned char* frame,
+ *                                const unsigned char* frozen_bit_mask,
+ *                                const unsigned char* frozen_bits,
+ *                                const unsigned char* info_bits,
+ *                                unsigned int frame_size,
+ *                                unsigned int info_bit_size)
+ * \endcode
*
* \b Inputs
* \li frame: buffer for encoded frame
* unsigned char* frozen_bit_mask = get_frozen_bit_mask(frame_size, num_frozen_bits);
*
* // set elements to desired values. Typically all zero.
- * unsigned char* frozen_bits = (unsigned char) volk_malloc(sizeof(unsigned char) * num_frozen_bits, volk_get_alignment());
+ * unsigned char* frozen_bits = (unsigned char*)volk_malloc(
+ *     sizeof(unsigned char) * num_frozen_bits, volk_get_alignment());
*
- * unsigned char* frame = (unsigned char) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment());
- * unsigned char* temp = (unsigned char) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment());
+ * unsigned char* frame = (unsigned char*)volk_malloc(
+ *     sizeof(unsigned char) * frame_size, volk_get_alignment());
+ * unsigned char* temp = (unsigned char*)volk_malloc(
+ *     sizeof(unsigned char) * frame_size, volk_get_alignment());
*
* unsigned char* info_bits = get_info_bits_to_encode(num_info_bits);
*
- * volk_8u_x3_encodepolar_8u_x2_generic(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ * volk_8u_x3_encodepolar_8u_x2_generic(
+ *     frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
*
* volk_free(frozen_bit_mask);
* volk_free(frozen_bits);
#include <stdio.h>
#include <volk/volk_8u_x2_encodeframepolar_8u.h>
-static inline void
-interleave_frozen_and_info_bits(unsigned char* target, const unsigned char* frozen_bit_mask,
- const unsigned char* frozen_bits, const unsigned char* info_bits,
- const unsigned int frame_size)
+static inline void interleave_frozen_and_info_bits(unsigned char* target,
+ const unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits,
+ const unsigned char* info_bits,
+ const unsigned int frame_size)
{
- unsigned int bit;
- for(bit = 0; bit < frame_size; ++bit){
- *target++ = *frozen_bit_mask++ ? *frozen_bits++ : *info_bits++;
- }
+ unsigned int bit;
+ for (bit = 0; bit < frame_size; ++bit) {
+ *target++ = *frozen_bit_mask++ ? *frozen_bits++ : *info_bits++;
+ }
}
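A quick illustration (editorial, not part of the patch) of the mask semantics used by these kernels: a nonzero mask byte selects the next frozen bit, a zero byte selects the next info bit. The concrete values below are hypothetical.

/* Example: with
 *   frozen_bit_mask = { 0xFF, 0x00, 0xFF, 0x00 },
 *   frozen_bits     = { 0, 0 },
 *   info_bits       = { 1, 1 },
 * interleave_frozen_and_info_bits() writes target = { 0, 1, 0, 1 }. */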
#ifdef LV_HAVE_GENERIC
static inline void
-volk_8u_x3_encodepolar_8u_x2_generic(unsigned char* frame, unsigned char* temp, const unsigned char* frozen_bit_mask,
- const unsigned char* frozen_bits, const unsigned char* info_bits,
+volk_8u_x3_encodepolar_8u_x2_generic(unsigned char* frame,
+ unsigned char* temp,
+ const unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits,
+ const unsigned char* info_bits,
unsigned int frame_size)
{
- // interleave
- interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
- volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
+ // interleave
+ interleave_frozen_and_info_bits(
+ temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
}
#endif /* LV_HAVE_GENERIC */
#include <tmmintrin.h>
static inline void
-volk_8u_x3_encodepolar_8u_x2_u_ssse3(unsigned char* frame, unsigned char* temp,
- const unsigned char* frozen_bit_mask,
- const unsigned char* frozen_bits, const unsigned char* info_bits,
- unsigned int frame_size)
+volk_8u_x3_encodepolar_8u_x2_u_ssse3(unsigned char* frame,
+ unsigned char* temp,
+ const unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits,
+ const unsigned char* info_bits,
+ unsigned int frame_size)
{
- // interleave
- interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
- volk_8u_x2_encodeframepolar_8u_u_ssse3(frame, temp, frame_size);
+ // interleave
+ interleave_frozen_and_info_bits(
+ temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ volk_8u_x2_encodeframepolar_8u_u_ssse3(frame, temp, frame_size);
}
#endif /* LV_HAVE_SSSE3 */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void
-volk_8u_x3_encodepolar_8u_x2_u_avx2(unsigned char* frame, unsigned char* temp,
- const unsigned char* frozen_bit_mask,
- const unsigned char* frozen_bits, const unsigned char* info_bits,
- unsigned int frame_size)
+volk_8u_x3_encodepolar_8u_x2_u_avx2(unsigned char* frame,
+ unsigned char* temp,
+ const unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits,
+ const unsigned char* info_bits,
+ unsigned int frame_size)
{
- interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
- volk_8u_x2_encodeframepolar_8u_u_avx2(frame, temp, frame_size);
+ interleave_frozen_and_info_bits(
+ temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ volk_8u_x2_encodeframepolar_8u_u_avx2(frame, temp, frame_size);
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSSE3
#include <tmmintrin.h>
static inline void
-volk_8u_x3_encodepolar_8u_x2_a_ssse3(unsigned char* frame, unsigned char* temp,
- const unsigned char* frozen_bit_mask,
- const unsigned char* frozen_bits, const unsigned char* info_bits,
- unsigned int frame_size)
+volk_8u_x3_encodepolar_8u_x2_a_ssse3(unsigned char* frame,
+ unsigned char* temp,
+ const unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits,
+ const unsigned char* info_bits,
+ unsigned int frame_size)
{
- interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
- volk_8u_x2_encodeframepolar_8u_a_ssse3(frame, temp, frame_size);
+ interleave_frozen_and_info_bits(
+ temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ volk_8u_x2_encodeframepolar_8u_a_ssse3(frame, temp, frame_size);
}
#endif /* LV_HAVE_SSSE3 */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void
-volk_8u_x3_encodepolar_8u_x2_a_avx2(unsigned char* frame, unsigned char* temp,
- const unsigned char* frozen_bit_mask,
- const unsigned char* frozen_bits, const unsigned char* info_bits,
- unsigned int frame_size)
+volk_8u_x3_encodepolar_8u_x2_a_avx2(unsigned char* frame,
+ unsigned char* temp,
+ const unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits,
+ const unsigned char* info_bits,
+ unsigned int frame_size)
{
- interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
- volk_8u_x2_encodeframepolar_8u_a_avx2(frame, temp, frame_size);
+ interleave_frozen_and_info_bits(
+ temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ volk_8u_x2_encodeframepolar_8u_a_avx2(frame, temp, frame_size);
}
#endif /* LV_HAVE_AVX2 */
#include <volk/volk.h>
#include <volk/volk_8u_x3_encodepolar_8u_x2.h>
-static inline unsigned int
-next_lower_power_of_two(const unsigned int val)
+static inline unsigned int next_lower_power_of_two(const unsigned int val)
{
- // algorithm found and adopted from: http://acius2.blogspot.de/2007/11/calculating-next-power-of-2.html
- unsigned int res = val;
- res = (res >> 1) | res;
- res = (res >> 2) | res;
- res = (res >> 4) | res;
- res = (res >> 8) | res;
- res = (res >> 16) | res;
- res += 1;
- return res >> 1;
+ // algorithm found and adopted from:
+ // http://acius2.blogspot.de/2007/11/calculating-next-power-of-2.html
+ unsigned int res = val;
+ res = (res >> 1) | res;
+ res = (res >> 2) | res;
+ res = (res >> 4) | res;
+ res = (res >> 8) | res;
+ res = (res >> 16) | res;
+ res += 1;
+ return res >> 1;
}
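A worked example of the shift/or cascade above (editorial, not part of the patch): the cascade smears the most significant set bit into every lower position, so adding one yields the next power of two above that bit and the final right shift yields the largest power of two not exceeding val.

/* val = 100 = 0b1100100
 * after (res >> 1) | res ... (res >> 16) | res: 0b1111111 = 127
 * 127 + 1 = 128, 128 >> 1 = 64, so next_lower_power_of_two(100) == 64.
 * Powers of two map to themselves: next_lower_power_of_two(64) == 64. */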
-static inline void
-adjust_frozen_mask(unsigned char* mask, const unsigned int frame_size)
+static inline void adjust_frozen_mask(unsigned char* mask, const unsigned int frame_size)
{
- // just like the rest of the puppet this function exists for test purposes only.
- unsigned int i;
- for(i = 0; i < frame_size; ++i){
- *mask = (*mask & 0x80) ? 0xFF : 0x00;
- mask++;
- }
+ // just like the rest of the puppet this function exists for test purposes only.
+ unsigned int i;
+ for (i = 0; i < frame_size; ++i) {
+ *mask = (*mask & 0x80) ? 0xFF : 0x00;
+ mask++;
+ }
}
#ifdef LV_HAVE_GENERIC
static inline void
-volk_8u_x3_encodepolarpuppet_8u_generic(unsigned char* frame, unsigned char* frozen_bit_mask,
- const unsigned char* frozen_bits, const unsigned char* info_bits,
- unsigned int frame_size)
+volk_8u_x3_encodepolarpuppet_8u_generic(unsigned char* frame,
+ unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits,
+ const unsigned char* info_bits,
+ unsigned int frame_size)
{
- frame_size = next_lower_power_of_two(frame_size);
- unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment());
- adjust_frozen_mask(frozen_bit_mask, frame_size);
- volk_8u_x3_encodepolar_8u_x2_generic(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
- volk_free(temp);
+ frame_size = next_lower_power_of_two(frame_size);
+ unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size,
+ volk_get_alignment());
+ adjust_frozen_mask(frozen_bit_mask, frame_size);
+ volk_8u_x3_encodepolar_8u_x2_generic(
+ frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ volk_free(temp);
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSSE3
static inline void
-volk_8u_x3_encodepolarpuppet_8u_u_ssse3(unsigned char* frame, unsigned char* frozen_bit_mask,
- const unsigned char* frozen_bits, const unsigned char* info_bits,
- unsigned int frame_size)
+volk_8u_x3_encodepolarpuppet_8u_u_ssse3(unsigned char* frame,
+ unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits,
+ const unsigned char* info_bits,
+ unsigned int frame_size)
{
- frame_size = next_lower_power_of_two(frame_size);
- unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment());
- adjust_frozen_mask(frozen_bit_mask, frame_size);
- volk_8u_x3_encodepolar_8u_x2_u_ssse3(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
- volk_free(temp);
+ frame_size = next_lower_power_of_two(frame_size);
+ unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size,
+ volk_get_alignment());
+ adjust_frozen_mask(frozen_bit_mask, frame_size);
+ volk_8u_x3_encodepolar_8u_x2_u_ssse3(
+ frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ volk_free(temp);
}
#endif /* LV_HAVE_SSSE3 */
#ifdef LV_HAVE_AVX2
static inline void
-volk_8u_x3_encodepolarpuppet_8u_u_avx2(unsigned char* frame, unsigned char* frozen_bit_mask,
- const unsigned char* frozen_bits, const unsigned char* info_bits,
- unsigned int frame_size)
+volk_8u_x3_encodepolarpuppet_8u_u_avx2(unsigned char* frame,
+ unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits,
+ const unsigned char* info_bits,
+ unsigned int frame_size)
{
- frame_size = next_lower_power_of_two(frame_size);
- unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment());
- adjust_frozen_mask(frozen_bit_mask, frame_size);
- volk_8u_x3_encodepolar_8u_x2_u_avx2(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
- volk_free(temp);
+ frame_size = next_lower_power_of_two(frame_size);
+ unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size,
+ volk_get_alignment());
+ adjust_frozen_mask(frozen_bit_mask, frame_size);
+ volk_8u_x3_encodepolar_8u_x2_u_avx2(
+ frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ volk_free(temp);
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSSE3
static inline void
-volk_8u_x3_encodepolarpuppet_8u_a_ssse3(unsigned char* frame, unsigned char* frozen_bit_mask,
- const unsigned char* frozen_bits, const unsigned char* info_bits,
- unsigned int frame_size)
+volk_8u_x3_encodepolarpuppet_8u_a_ssse3(unsigned char* frame,
+ unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits,
+ const unsigned char* info_bits,
+ unsigned int frame_size)
{
- frame_size = next_lower_power_of_two(frame_size);
- unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment());
- adjust_frozen_mask(frozen_bit_mask, frame_size);
- volk_8u_x3_encodepolar_8u_x2_a_ssse3(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
- volk_free(temp);
+ frame_size = next_lower_power_of_two(frame_size);
+ unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size,
+ volk_get_alignment());
+ adjust_frozen_mask(frozen_bit_mask, frame_size);
+ volk_8u_x3_encodepolar_8u_x2_a_ssse3(
+ frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ volk_free(temp);
}
#endif /* LV_HAVE_SSSE3 */
#ifdef LV_HAVE_AVX2
static inline void
-volk_8u_x3_encodepolarpuppet_8u_a_avx2(unsigned char* frame, unsigned char* frozen_bit_mask,
- const unsigned char* frozen_bits, const unsigned char* info_bits,
- unsigned int frame_size)
+volk_8u_x3_encodepolarpuppet_8u_a_avx2(unsigned char* frame,
+ unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits,
+ const unsigned char* info_bits,
+ unsigned int frame_size)
{
- frame_size = next_lower_power_of_two(frame_size);
- unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment());
- adjust_frozen_mask(frozen_bit_mask, frame_size);
- volk_8u_x3_encodepolar_8u_x2_a_avx2(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
- volk_free(temp);
+ frame_size = next_lower_power_of_two(frame_size);
+ unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size,
+ volk_get_alignment());
+ adjust_frozen_mask(frozen_bit_mask, frame_size);
+ volk_8u_x3_encodepolar_8u_x2_a_avx2(
+ frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ volk_free(temp);
}
#endif /* LV_HAVE_AVX2 */
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_8u_x4_conv_k7_r2_8u(unsigned char* Y, unsigned char* X, unsigned char* syms, unsigned char* dec, unsigned int framebits, unsigned int excess, unsigned char* Branchtab)
- * \endcode
+ * void volk_8u_x4_conv_k7_r2_8u(unsigned char* Y,
+ *                               unsigned char* X,
+ *                               unsigned char* syms,
+ *                               unsigned char* dec,
+ *                               unsigned int framebits,
+ *                               unsigned int excess,
+ *                               unsigned char* Branchtab)
+ * \endcode
*
* \b Inputs
* \li X: <FIXME>
#define INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
typedef union {
- unsigned char/*DECISIONTYPE*/ t[64/*NUMSTATES*//8/*DECISIONTYPE_BITSIZE*/];
- unsigned int w[64/*NUMSTATES*//32];
- unsigned short s[64/*NUMSTATES*//16];
- unsigned char c[64/*NUMSTATES*//8];
+ unsigned char /*DECISIONTYPE*/ t[64 /*NUMSTATES*/ / 8 /*DECISIONTYPE_BITSIZE*/];
+ unsigned int w[64 /*NUMSTATES*/ / 32];
+ unsigned short s[64 /*NUMSTATES*/ / 16];
+ unsigned char c[64 /*NUMSTATES*/ / 8];
#ifdef _MSC_VER
} decision_t;
#else
-} decision_t __attribute__ ((aligned (16)));
+} decision_t __attribute__((aligned(16)));
#endif
-static inline void
-renormalize(unsigned char* X, unsigned char threshold)
+static inline void renormalize(unsigned char* X, unsigned char threshold)
{
- int NUMSTATES = 64;
- int i;
-
- unsigned char min=X[0];
- //if(min > threshold) {
- for(i=0;i<NUMSTATES;i++)
- if (min>X[i])
- min=X[i];
- for(i=0;i<NUMSTATES;i++)
- X[i]-=min;
- //}
+ int NUMSTATES = 64;
+ int i;
+
+ unsigned char min = X[0];
+ // if(min > threshold) {
+ for (i = 0; i < NUMSTATES; i++)
+ if (min > X[i])
+ min = X[i];
+ for (i = 0; i < NUMSTATES; i++)
+ X[i] -= min;
+ //}
}
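In short (editorial note, not part of the patch): renormalize() subtracts the smallest of the 64 path metrics so the unsigned 8-bit metrics do not saturate; the relative differences between paths, which are all the Viterbi recursion needs, are preserved.

/* e.g. X = { 250, 240, 245, ... }  ->  min = 240  ->  X = { 10, 0, 5, ... } */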
-//helper BFLY for GENERIC version
-static inline void
-BFLY(int i, int s, unsigned char * syms, unsigned char *Y,
- unsigned char *X, decision_t * d, unsigned char* Branchtab)
+// helper BFLY for GENERIC version
+static inline void BFLY(int i,
+ int s,
+ unsigned char* syms,
+ unsigned char* Y,
+ unsigned char* X,
+ decision_t* d,
+ unsigned char* Branchtab)
{
- int j, decision0, decision1;
- unsigned char metric,m0,m1,m2,m3;
+ int j, decision0, decision1;
+ unsigned char metric, m0, m1, m2, m3;
- int NUMSTATES = 64;
- int RATE = 2;
- int METRICSHIFT = 1;
- int PRECISIONSHIFT = 2;
+ int NUMSTATES = 64;
+ int RATE = 2;
+ int METRICSHIFT = 1;
+ int PRECISIONSHIFT = 2;
- metric =0;
- for(j=0;j<RATE;j++)
- metric += (Branchtab[i+j*NUMSTATES/2] ^ syms[s*RATE+j])>>METRICSHIFT;
- metric=metric>>PRECISIONSHIFT;
+ metric = 0;
+ for (j = 0; j < RATE; j++)
+ metric += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]) >> METRICSHIFT;
+ metric = metric >> PRECISIONSHIFT;
- unsigned char max = ((RATE*((256 -1)>>METRICSHIFT))>>PRECISIONSHIFT);
+ unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT);
- m0 = X[i] + metric;
- m1 = X[i+NUMSTATES/2] + (max - metric);
- m2 = X[i] + (max - metric);
- m3 = X[i+NUMSTATES/2] + metric;
+ m0 = X[i] + metric;
+ m1 = X[i + NUMSTATES / 2] + (max - metric);
+ m2 = X[i] + (max - metric);
+ m3 = X[i + NUMSTATES / 2] + metric;
- decision0 = (signed int)(m0-m1) > 0;
- decision1 = (signed int)(m2-m3) > 0;
+ decision0 = (signed int)(m0 - m1) > 0;
+ decision1 = (signed int)(m2 - m3) > 0;
- Y[2*i] = decision0 ? m1 : m0;
- Y[2*i+1] = decision1 ? m3 : m2;
+ Y[2 * i] = decision0 ? m1 : m0;
+ Y[2 * i + 1] = decision1 ? m3 : m2;
- d->w[i/(sizeof(unsigned int)*8/2)+s*(sizeof(decision_t)/sizeof(unsigned int))] |=
- (decision0|decision1<<1) << ((2*i)&(sizeof(unsigned int)*8-1));
+ d->w[i / (sizeof(unsigned int) * 8 / 2) +
+ s * (sizeof(decision_t) / sizeof(unsigned int))] |=
+ (decision0 | decision1 << 1) << ((2 * i) & (sizeof(unsigned int) * 8 - 1));
}
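Summarized as equations (editorial, not part of the patch), the butterfly above is a standard add-compare-select step for the state pair (i, i + NUMSTATES/2):

/* metric  = (sum over j of (Branchtab[i + j*NUMSTATES/2] ^ syms[s*RATE + j]) >> METRICSHIFT)
 *           >> PRECISIONSHIFT
 * max     = (RATE * (255 >> METRICSHIFT)) >> PRECISIONSHIFT
 * Y[2i]   = min(X[i] + metric,         X[i + NUMSTATES/2] + (max - metric))
 * Y[2i+1] = min(X[i] + (max - metric), X[i + NUMSTATES/2] + metric)
 * The two comparison outcomes are packed as decision bits 2i and 2i+1 of
 * decision word s in d->w for the later traceback. */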
#include <immintrin.h>
#include <stdio.h>
-static inline void
-volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y, unsigned char* X,
- unsigned char* syms, unsigned char* dec,
- unsigned int framebits, unsigned int excess,
- unsigned char* Branchtab)
+static inline void volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y,
+ unsigned char* X,
+ unsigned char* syms,
+ unsigned char* dec,
+ unsigned int framebits,
+ unsigned int excess,
+ unsigned char* Branchtab)
{
- unsigned int i9;
- for(i9 = 0; i9 < ((framebits + excess)>>1); i9++) {
- unsigned char a75, a81;
- int a73, a92;
- int s20, s21;
- unsigned char *a80, *b6;
- int *a110, *a91, *a93;
- __m256i *a112, *a71, *a72, *a77, *a83, *a95;
- __m256i a86, a87;
- __m256i a76, a78, a79, a82, a84, a85, a88, a89
- , a90, d10, d9, m23, m24, m25
- , m26, s18, s19, s22
- , s23, s24, s25, t13, t14, t15;
- a71 = ((__m256i *) X);
- s18 = *(a71);
- a72 = (a71 + 1);
- s19 = *(a72);
- s22 = _mm256_permute2x128_si256(s18,s19,0x20);
- s19 = _mm256_permute2x128_si256(s18,s19,0x31);
- s18 = s22;
- a73 = (4 * i9);
- b6 = (syms + a73);
- a75 = *(b6);
- a76 = _mm256_set1_epi8(a75);
- a77 = ((__m256i *) Branchtab);
- a78 = *(a77);
- a79 = _mm256_xor_si256(a76, a78);
- a80 = (b6 + 1);
- a81 = *(a80);
- a82 = _mm256_set1_epi8(a81);
- a83 = (a77 + 1);
- a84 = *(a83);
- a85 = _mm256_xor_si256(a82, a84);
- t13 = _mm256_avg_epu8(a79,a85);
- a86 = ((__m256i ) t13);
- a87 = _mm256_srli_epi16(a86, 2);
- a88 = ((__m256i ) a87);
- t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63));
- t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14);
- m23 = _mm256_adds_epu8(s18, t14);
- m24 = _mm256_adds_epu8(s19, t15);
- m25 = _mm256_adds_epu8(s18, t15);
- m26 = _mm256_adds_epu8(s19, t14);
- a89 = _mm256_min_epu8(m24, m23);
- d9 = _mm256_cmpeq_epi8(a89, m24);
- a90 = _mm256_min_epu8(m26, m25);
- d10 = _mm256_cmpeq_epi8(a90, m26);
- s22 = _mm256_unpacklo_epi8(d9,d10);
- s23 = _mm256_unpackhi_epi8(d9,d10);
- s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20));
- a91 = ((int *) dec);
- a92 = (4 * i9);
- a93 = (a91 + a92);
- *(a93) = s20;
- s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31));
- a110 = (a93 + 1);
- *(a110) = s21;
- s22 = _mm256_unpacklo_epi8(a89, a90);
- s23 = _mm256_unpackhi_epi8(a89, a90);
- a95 = ((__m256i *) Y);
- s24 = _mm256_permute2x128_si256(s22, s23, 0x20);
- *(a95) = s24;
- s23 = _mm256_permute2x128_si256(s22, s23, 0x31);
- a112 = (a95 + 1);
- *(a112) = s23;
- if ((((unsigned char *) Y)[0]>210)) {
- __m256i m5, m6;
- m5 = ((__m256i *) Y)[0];
- m5 = _mm256_min_epu8(m5, ((__m256i *) Y)[1]);
- __m256i m7;
- m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5);
- m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 32)), ((__m256i ) m7)));
- m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 16)), ((__m256i ) m7)));
- m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 8)), ((__m256i ) m7)));
- m7 = _mm256_unpacklo_epi8(m7, m7);
- m7 = _mm256_shufflelo_epi16(m7, 0);
- m6 = _mm256_unpacklo_epi64(m7, m7);
- m6 = _mm256_permute2x128_si256(m6, m6, 0); //copy lower half of m6 to upper half, since above ops operate on 128 bit lanes
- ((__m256i *) Y)[0] = _mm256_subs_epu8(((__m256i *) Y)[0], m6);
- ((__m256i *) Y)[1] = _mm256_subs_epu8(((__m256i *) Y)[1], m6);
+ unsigned int i9;
+ for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
+ unsigned char a75, a81;
+ int a73, a92;
+ int s20, s21;
+ unsigned char *a80, *b6;
+ int *a110, *a91, *a93;
+ __m256i *a112, *a71, *a72, *a77, *a83, *a95;
+ __m256i a86, a87;
+ __m256i a76, a78, a79, a82, a84, a85, a88, a89, a90, d10, d9, m23, m24, m25, m26,
+ s18, s19, s22, s23, s24, s25, t13, t14, t15;
+ a71 = ((__m256i*)X);
+ s18 = *(a71);
+ a72 = (a71 + 1);
+ s19 = *(a72);
+ s22 = _mm256_permute2x128_si256(s18, s19, 0x20);
+ s19 = _mm256_permute2x128_si256(s18, s19, 0x31);
+ s18 = s22;
+ a73 = (4 * i9);
+ b6 = (syms + a73);
+ a75 = *(b6);
+ a76 = _mm256_set1_epi8(a75);
+ a77 = ((__m256i*)Branchtab);
+ a78 = *(a77);
+ a79 = _mm256_xor_si256(a76, a78);
+ a80 = (b6 + 1);
+ a81 = *(a80);
+ a82 = _mm256_set1_epi8(a81);
+ a83 = (a77 + 1);
+ a84 = *(a83);
+ a85 = _mm256_xor_si256(a82, a84);
+ t13 = _mm256_avg_epu8(a79, a85);
+ a86 = ((__m256i)t13);
+ a87 = _mm256_srli_epi16(a86, 2);
+ a88 = ((__m256i)a87);
+ t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63));
+ t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14);
+ m23 = _mm256_adds_epu8(s18, t14);
+ m24 = _mm256_adds_epu8(s19, t15);
+ m25 = _mm256_adds_epu8(s18, t15);
+ m26 = _mm256_adds_epu8(s19, t14);
+ a89 = _mm256_min_epu8(m24, m23);
+ d9 = _mm256_cmpeq_epi8(a89, m24);
+ a90 = _mm256_min_epu8(m26, m25);
+ d10 = _mm256_cmpeq_epi8(a90, m26);
+ s22 = _mm256_unpacklo_epi8(d9, d10);
+ s23 = _mm256_unpackhi_epi8(d9, d10);
+ s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20));
+ a91 = ((int*)dec);
+ a92 = (4 * i9);
+ a93 = (a91 + a92);
+ *(a93) = s20;
+ s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31));
+ a110 = (a93 + 1);
+ *(a110) = s21;
+ s22 = _mm256_unpacklo_epi8(a89, a90);
+ s23 = _mm256_unpackhi_epi8(a89, a90);
+ a95 = ((__m256i*)Y);
+ s24 = _mm256_permute2x128_si256(s22, s23, 0x20);
+ *(a95) = s24;
+ s23 = _mm256_permute2x128_si256(s22, s23, 0x31);
+ a112 = (a95 + 1);
+ *(a112) = s23;
+ if ((((unsigned char*)Y)[0] > 210)) {
+ __m256i m5, m6;
+ m5 = ((__m256i*)Y)[0];
+ m5 = _mm256_min_epu8(m5, ((__m256i*)Y)[1]);
+ __m256i m7;
+ m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5);
+ m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 32)),
+ ((__m256i)m7)));
+ m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 16)),
+ ((__m256i)m7)));
+ m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 8)),
+ ((__m256i)m7)));
+ m7 = _mm256_unpacklo_epi8(m7, m7);
+ m7 = _mm256_shufflelo_epi16(m7, 0);
+ m6 = _mm256_unpacklo_epi64(m7, m7);
+ m6 = _mm256_permute2x128_si256(
+ m6, m6, 0); // copy lower half of m6 to upper half, since above ops
+ // operate on 128 bit lanes
+ ((__m256i*)Y)[0] = _mm256_subs_epu8(((__m256i*)Y)[0], m6);
+ ((__m256i*)Y)[1] = _mm256_subs_epu8(((__m256i*)Y)[1], m6);
+ }
+ unsigned char a188, a194;
+ int a205;
+ int s48, s54;
+ unsigned char *a187, *a193;
+ int *a204, *a206, *a223, *b16;
+ __m256i *a184, *a185, *a190, *a196, *a208, *a225;
+ __m256i a199, a200;
+ __m256i a189, a191, a192, a195, a197, a198, a201, a202, a203, d17, d18, m39, m40,
+ m41, m42, s46, s47, s50, s51, t25, t26, t27;
+ a184 = ((__m256i*)Y);
+ s46 = *(a184);
+ a185 = (a184 + 1);
+ s47 = *(a185);
+ s50 = _mm256_permute2x128_si256(s46, s47, 0x20);
+ s47 = _mm256_permute2x128_si256(s46, s47, 0x31);
+ s46 = s50;
+ a187 = (b6 + 2);
+ a188 = *(a187);
+ a189 = _mm256_set1_epi8(a188);
+ a190 = ((__m256i*)Branchtab);
+ a191 = *(a190);
+ a192 = _mm256_xor_si256(a189, a191);
+ a193 = (b6 + 3);
+ a194 = *(a193);
+ a195 = _mm256_set1_epi8(a194);
+ a196 = (a190 + 1);
+ a197 = *(a196);
+ a198 = _mm256_xor_si256(a195, a197);
+ t25 = _mm256_avg_epu8(a192, a198);
+ a199 = ((__m256i)t25);
+ a200 = _mm256_srli_epi16(a199, 2);
+ a201 = ((__m256i)a200);
+ t26 = _mm256_and_si256(a201, _mm256_set1_epi8(63));
+ t27 = _mm256_subs_epu8(_mm256_set1_epi8(63), t26);
+ m39 = _mm256_adds_epu8(s46, t26);
+ m40 = _mm256_adds_epu8(s47, t27);
+ m41 = _mm256_adds_epu8(s46, t27);
+ m42 = _mm256_adds_epu8(s47, t26);
+ a202 = _mm256_min_epu8(m40, m39);
+ d17 = _mm256_cmpeq_epi8(a202, m40);
+ a203 = _mm256_min_epu8(m42, m41);
+ d18 = _mm256_cmpeq_epi8(a203, m42);
+ s24 = _mm256_unpacklo_epi8(d17, d18);
+ s25 = _mm256_unpackhi_epi8(d17, d18);
+ s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
+ a204 = ((int*)dec);
+ a205 = (4 * i9);
+ b16 = (a204 + a205);
+ a206 = (b16 + 2);
+ *(a206) = s48;
+ s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
+ a223 = (b16 + 3);
+ *(a223) = s54;
+ s50 = _mm256_unpacklo_epi8(a202, a203);
+ s51 = _mm256_unpackhi_epi8(a202, a203);
+ s25 = _mm256_permute2x128_si256(s50, s51, 0x20);
+ s51 = _mm256_permute2x128_si256(s50, s51, 0x31);
+ a208 = ((__m256i*)X);
+ *(a208) = s25;
+ a225 = (a208 + 1);
+ *(a225) = s51;
+
+ if ((((unsigned char*)X)[0] > 210)) {
+ __m256i m12, m13;
+ m12 = ((__m256i*)X)[0];
+ m12 = _mm256_min_epu8(m12, ((__m256i*)X)[1]);
+ __m256i m14;
+ m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12);
+ m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 32)),
+ ((__m256i)m14)));
+ m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 16)),
+ ((__m256i)m14)));
+ m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 8)),
+ ((__m256i)m14)));
+ m14 = _mm256_unpacklo_epi8(m14, m14);
+ m14 = _mm256_shufflelo_epi16(m14, 0);
+ m13 = _mm256_unpacklo_epi64(m14, m14);
+ m13 = _mm256_permute2x128_si256(m13, m13, 0);
+ ((__m256i*)X)[0] = _mm256_subs_epu8(((__m256i*)X)[0], m13);
+ ((__m256i*)X)[1] = _mm256_subs_epu8(((__m256i*)X)[1], m13);
+ }
}
- unsigned char a188, a194;
- int a205;
- int s48, s54;
- unsigned char *a187, *a193;
- int *a204, *a206, *a223, *b16;
- __m256i *a184, *a185, *a190, *a196, *a208, *a225;
- __m256i a199, a200;
- __m256i a189, a191, a192, a195, a197, a198, a201
- , a202, a203, d17, d18, m39, m40, m41
- , m42, s46, s47, s50
- , s51, t25, t26, t27;
- a184 = ((__m256i *) Y);
- s46 = *(a184);
- a185 = (a184 + 1);
- s47 = *(a185);
- s50 = _mm256_permute2x128_si256(s46,s47,0x20);
- s47 = _mm256_permute2x128_si256(s46,s47,0x31);
- s46 = s50;
- a187 = (b6 + 2);
- a188 = *(a187);
- a189 = _mm256_set1_epi8(a188);
- a190 = ((__m256i *) Branchtab);
- a191 = *(a190);
- a192 = _mm256_xor_si256(a189, a191);
- a193 = (b6 + 3);
- a194 = *(a193);
- a195 = _mm256_set1_epi8(a194);
- a196 = (a190 + 1);
- a197 = *(a196);
- a198 = _mm256_xor_si256(a195, a197);
- t25 = _mm256_avg_epu8(a192,a198);
- a199 = ((__m256i ) t25);
- a200 = _mm256_srli_epi16(a199, 2);
- a201 = ((__m256i ) a200);
- t26 = _mm256_and_si256(a201, _mm256_set1_epi8(63));
- t27 = _mm256_subs_epu8(_mm256_set1_epi8(63), t26);
- m39 = _mm256_adds_epu8(s46, t26);
- m40 = _mm256_adds_epu8(s47, t27);
- m41 = _mm256_adds_epu8(s46, t27);
- m42 = _mm256_adds_epu8(s47, t26);
- a202 = _mm256_min_epu8(m40, m39);
- d17 = _mm256_cmpeq_epi8(a202, m40);
- a203 = _mm256_min_epu8(m42, m41);
- d18 = _mm256_cmpeq_epi8(a203, m42);
- s24 = _mm256_unpacklo_epi8(d17,d18);
- s25 = _mm256_unpackhi_epi8(d17,d18);
- s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
- a204 = ((int *) dec);
- a205 = (4 * i9);
- b16 = (a204 + a205);
- a206 = (b16 + 2);
- *(a206) = s48;
- s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
- a223 = (b16 + 3);
- *(a223) = s54;
- s50 = _mm256_unpacklo_epi8(a202, a203);
- s51 = _mm256_unpackhi_epi8(a202, a203);
- s25 = _mm256_permute2x128_si256(s50, s51, 0x20);
- s51 = _mm256_permute2x128_si256(s50, s51, 0x31);
- a208 = ((__m256i *) X);
- *(a208) = s25;
- a225 = (a208 + 1);
- *(a225) = s51;
-
- if ((((unsigned char *) X)[0]>210)) {
- __m256i m12, m13;
- m12 = ((__m256i *) X)[0];
- m12 = _mm256_min_epu8(m12, ((__m256i *) X)[1]);
- __m256i m14;
- m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12);
- m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 32)), ((__m256i ) m14)));
- m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 16)), ((__m256i ) m14)));
- m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 8)), ((__m256i ) m14)));
- m14 = _mm256_unpacklo_epi8(m14, m14);
- m14 = _mm256_shufflelo_epi16(m14, 0);
- m13 = _mm256_unpacklo_epi64(m14, m14);
- m13 = _mm256_permute2x128_si256(m13, m13, 0);
- ((__m256i *) X)[0] = _mm256_subs_epu8(((__m256i *) X)[0], m13);
- ((__m256i *) X)[1] = _mm256_subs_epu8(((__m256i *) X)[1], m13);
- }
- }
-
- renormalize(X, 210);
- unsigned int j;
- for(j=0; j < (framebits + excess) % 2; ++j) {
- int i;
- for(i=0;i<64/2;i++){
- BFLY(i, (((framebits+excess) >> 1) << 1) + j , syms, Y, X, (decision_t *)dec, Branchtab);
+ renormalize(X, 210);
+
+ unsigned int j;
+ for (j = 0; j < (framebits + excess) % 2; ++j) {
+ int i;
+ for (i = 0; i < 64 / 2; i++) {
+ BFLY(i,
+ (((framebits + excess) >> 1) << 1) + j,
+ syms,
+ Y,
+ X,
+ (decision_t*)dec,
+ Branchtab);
+ }
+
+ renormalize(Y, 210);
}
-
- renormalize(Y, 210);
-
- }
- /*skip*/
+ /*skip*/
}
#endif /*LV_HAVE_AVX2*/
#if LV_HAVE_SSE3
-#include <pmmintrin.h>
#include <emmintrin.h>
-#include <xmmintrin.h>
#include <mmintrin.h>
+#include <pmmintrin.h>
#include <stdio.h>
+#include <xmmintrin.h>
-static inline void
-volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y, unsigned char* X,
- unsigned char* syms, unsigned char* dec,
- unsigned int framebits, unsigned int excess,
- unsigned char* Branchtab)
+static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y,
+ unsigned char* X,
+ unsigned char* syms,
+ unsigned char* dec,
+ unsigned int framebits,
+ unsigned int excess,
+ unsigned char* Branchtab)
{
- unsigned int i9;
- for(i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
- unsigned char a75, a81;
- int a73, a92;
- short int s20, s21, s26, s27;
- unsigned char *a74, *a80, *b6;
- short int *a110, *a111, *a91, *a93, *a94;
- __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83
- , *a95, *a96, *a97, *a98, *a99;
- __m128i a105, a106, a86, a87;
- __m128i a100, a101, a103, a104, a107, a108, a109
- , a76, a78, a79, a82, a84, a85, a88, a89
- , a90, d10, d11, d12, d9, m23, m24, m25
- , m26, m27, m28, m29, m30, s18, s19, s22
- , s23, s24, s25, s28, s29, t13, t14, t15
- , t16, t17, t18;
- a71 = ((__m128i *) X);
- s18 = *(a71);
- a72 = (a71 + 2);
- s19 = *(a72);
- a73 = (4 * i9);
- a74 = (syms + a73);
- a75 = *(a74);
- a76 = _mm_set1_epi8(a75);
- a77 = ((__m128i *) Branchtab);
- a78 = *(a77);
- a79 = _mm_xor_si128(a76, a78);
- b6 = (a73 + syms);
- a80 = (b6 + 1);
- a81 = *(a80);
- a82 = _mm_set1_epi8(a81);
- a83 = (a77 + 2);
- a84 = *(a83);
- a85 = _mm_xor_si128(a82, a84);
- t13 = _mm_avg_epu8(a79,a85);
- a86 = ((__m128i ) t13);
- a87 = _mm_srli_epi16(a86, 2);
- a88 = ((__m128i ) a87);
- t14 = _mm_and_si128(a88, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63
- , 63, 63, 63, 63, 63, 63, 63, 63
- , 63));
- t15 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63
- , 63, 63, 63, 63, 63, 63, 63, 63
- , 63), t14);
- m23 = _mm_adds_epu8(s18, t14);
- m24 = _mm_adds_epu8(s19, t15);
- m25 = _mm_adds_epu8(s18, t15);
- m26 = _mm_adds_epu8(s19, t14);
- a89 = _mm_min_epu8(m24, m23);
- d9 = _mm_cmpeq_epi8(a89, m24);
- a90 = _mm_min_epu8(m26, m25);
- d10 = _mm_cmpeq_epi8(a90, m26);
- s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9,d10));
- a91 = ((short int *) dec);
- a92 = (8 * i9);
- a93 = (a91 + a92);
- *(a93) = s20;
- s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9,d10));
- a94 = (a93 + 1);
- *(a94) = s21;
- s22 = _mm_unpacklo_epi8(a89, a90);
- s23 = _mm_unpackhi_epi8(a89, a90);
- a95 = ((__m128i *) Y);
- *(a95) = s22;
- a96 = (a95 + 1);
- *(a96) = s23;
- a97 = (a71 + 1);
- s24 = *(a97);
- a98 = (a71 + 3);
- s25 = *(a98);
- a99 = (a77 + 1);
- a100 = *(a99);
- a101 = _mm_xor_si128(a76, a100);
- a102 = (a77 + 3);
- a103 = *(a102);
- a104 = _mm_xor_si128(a82, a103);
- t16 = _mm_avg_epu8(a101,a104);
- a105 = ((__m128i ) t16);
- a106 = _mm_srli_epi16(a105, 2);
- a107 = ((__m128i ) a106);
- t17 = _mm_and_si128(a107, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63
- , 63, 63, 63, 63, 63, 63, 63, 63
- , 63));
- t18 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63
- , 63, 63, 63, 63, 63, 63, 63, 63
- , 63), t17);
- m27 = _mm_adds_epu8(s24, t17);
- m28 = _mm_adds_epu8(s25, t18);
- m29 = _mm_adds_epu8(s24, t18);
- m30 = _mm_adds_epu8(s25, t17);
- a108 = _mm_min_epu8(m28, m27);
- d11 = _mm_cmpeq_epi8(a108, m28);
- a109 = _mm_min_epu8(m30, m29);
- d12 = _mm_cmpeq_epi8(a109, m30);
- s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11,d12));
- a110 = (a93 + 2);
- *(a110) = s26;
- s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11,d12));
- a111 = (a93 + 3);
- *(a111) = s27;
- s28 = _mm_unpacklo_epi8(a108, a109);
- s29 = _mm_unpackhi_epi8(a108, a109);
- a112 = (a95 + 2);
- *(a112) = s28;
- a113 = (a95 + 3);
- *(a113) = s29;
- if ((((unsigned char *) Y)[0]>210)) {
- __m128i m5, m6;
- m5 = ((__m128i *) Y)[0];
- m5 = _mm_min_epu8(m5, ((__m128i *) Y)[1]);
- m5 = _mm_min_epu8(m5, ((__m128i *) Y)[2]);
- m5 = _mm_min_epu8(m5, ((__m128i *) Y)[3]);
- __m128i m7;
- m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
- m7 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 32)), ((__m128i ) m7)));
- m7 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 16)), ((__m128i ) m7)));
- m7 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 8)), ((__m128i ) m7)));
- m7 = _mm_unpacklo_epi8(m7, m7);
- m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
- m6 = _mm_unpacklo_epi64(m7, m7);
- ((__m128i *) Y)[0] = _mm_subs_epu8(((__m128i *) Y)[0], m6);
- ((__m128i *) Y)[1] = _mm_subs_epu8(((__m128i *) Y)[1], m6);
- ((__m128i *) Y)[2] = _mm_subs_epu8(((__m128i *) Y)[2], m6);
- ((__m128i *) Y)[3] = _mm_subs_epu8(((__m128i *) Y)[3], m6);
- }
- unsigned char a188, a194;
- int a186, a205;
- short int s48, s49, s54, s55;
- unsigned char *a187, *a193, *b15;
- short int *a204, *a206, *a207, *a223, *a224, *b16;
- __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210
- , *a211, *a212, *a215, *a225, *a226;
- __m128i a199, a200, a218, a219;
- __m128i a189, a191, a192, a195, a197, a198, a201
- , a202, a203, a213, a214, a216, a217, a220, a221
- , a222, d17, d18, d19, d20, m39, m40, m41
- , m42, m43, m44, m45, m46, s46, s47, s50
- , s51, s52, s53, s56, s57, t25, t26, t27
- , t28, t29, t30;
- a184 = ((__m128i *) Y);
- s46 = *(a184);
- a185 = (a184 + 2);
- s47 = *(a185);
- a186 = (4 * i9);
- b15 = (a186 + syms);
- a187 = (b15 + 2);
- a188 = *(a187);
- a189 = _mm_set1_epi8(a188);
- a190 = ((__m128i *) Branchtab);
- a191 = *(a190);
- a192 = _mm_xor_si128(a189, a191);
- a193 = (b15 + 3);
- a194 = *(a193);
- a195 = _mm_set1_epi8(a194);
- a196 = (a190 + 2);
- a197 = *(a196);
- a198 = _mm_xor_si128(a195, a197);
- t25 = _mm_avg_epu8(a192,a198);
- a199 = ((__m128i ) t25);
- a200 = _mm_srli_epi16(a199, 2);
- a201 = ((__m128i ) a200);
- t26 = _mm_and_si128(a201, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63
- , 63, 63, 63, 63, 63, 63, 63, 63
- , 63));
- t27 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63
- , 63, 63, 63, 63, 63, 63, 63, 63
- , 63), t26);
- m39 = _mm_adds_epu8(s46, t26);
- m40 = _mm_adds_epu8(s47, t27);
- m41 = _mm_adds_epu8(s46, t27);
- m42 = _mm_adds_epu8(s47, t26);
- a202 = _mm_min_epu8(m40, m39);
- d17 = _mm_cmpeq_epi8(a202, m40);
- a203 = _mm_min_epu8(m42, m41);
- d18 = _mm_cmpeq_epi8(a203, m42);
- s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17,d18));
- a204 = ((short int *) dec);
- a205 = (8 * i9);
- b16 = (a204 + a205);
- a206 = (b16 + 4);
- *(a206) = s48;
- s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17,d18));
- a207 = (b16 + 5);
- *(a207) = s49;
- s50 = _mm_unpacklo_epi8(a202, a203);
- s51 = _mm_unpackhi_epi8(a202, a203);
- a208 = ((__m128i *) X);
- *(a208) = s50;
- a209 = (a208 + 1);
- *(a209) = s51;
- a210 = (a184 + 1);
- s52 = *(a210);
- a211 = (a184 + 3);
- s53 = *(a211);
- a212 = (a190 + 1);
- a213 = *(a212);
- a214 = _mm_xor_si128(a189, a213);
- a215 = (a190 + 3);
- a216 = *(a215);
- a217 = _mm_xor_si128(a195, a216);
- t28 = _mm_avg_epu8(a214,a217);
- a218 = ((__m128i ) t28);
- a219 = _mm_srli_epi16(a218, 2);
- a220 = ((__m128i ) a219);
- t29 = _mm_and_si128(a220, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63
- , 63, 63, 63, 63, 63, 63, 63, 63
- , 63));
- t30 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63
- , 63, 63, 63, 63, 63, 63, 63, 63
- , 63), t29);
- m43 = _mm_adds_epu8(s52, t29);
- m44 = _mm_adds_epu8(s53, t30);
- m45 = _mm_adds_epu8(s52, t30);
- m46 = _mm_adds_epu8(s53, t29);
- a221 = _mm_min_epu8(m44, m43);
- d19 = _mm_cmpeq_epi8(a221, m44);
- a222 = _mm_min_epu8(m46, m45);
- d20 = _mm_cmpeq_epi8(a222, m46);
- s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19,d20));
- a223 = (b16 + 6);
- *(a223) = s54;
- s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19,d20));
- a224 = (b16 + 7);
- *(a224) = s55;
- s56 = _mm_unpacklo_epi8(a221, a222);
- s57 = _mm_unpackhi_epi8(a221, a222);
- a225 = (a208 + 2);
- *(a225) = s56;
- a226 = (a208 + 3);
- *(a226) = s57;
- if ((((unsigned char *) X)[0]>210)) {
- __m128i m12, m13;
- m12 = ((__m128i *) X)[0];
- m12 = _mm_min_epu8(m12, ((__m128i *) X)[1]);
- m12 = _mm_min_epu8(m12, ((__m128i *) X)[2]);
- m12 = _mm_min_epu8(m12, ((__m128i *) X)[3]);
- __m128i m14;
- m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
- m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 32)), ((__m128i ) m14)));
- m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 16)), ((__m128i ) m14)));
- m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 8)), ((__m128i ) m14)));
- m14 = _mm_unpacklo_epi8(m14, m14);
- m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
- m13 = _mm_unpacklo_epi64(m14, m14);
- ((__m128i *) X)[0] = _mm_subs_epu8(((__m128i *) X)[0], m13);
- ((__m128i *) X)[1] = _mm_subs_epu8(((__m128i *) X)[1], m13);
- ((__m128i *) X)[2] = _mm_subs_epu8(((__m128i *) X)[2], m13);
- ((__m128i *) X)[3] = _mm_subs_epu8(((__m128i *) X)[3], m13);
+ unsigned int i9;
+ for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
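+        /* Each iteration advances two trellis stages: X -> Y on the first symbol
+           pair, then Y -> X on the second, renormalizing each metric set as needed. */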
+ unsigned char a75, a81;
+ int a73, a92;
+ short int s20, s21, s26, s27;
+ unsigned char *a74, *a80, *b6;
+ short int *a110, *a111, *a91, *a93, *a94;
+ __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98, *a99;
+ __m128i a105, a106, a86, a87;
+ __m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85,
+ a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18,
+ s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18;
+ a71 = ((__m128i*)X);
+ s18 = *(a71);
+ a72 = (a71 + 2);
+ s19 = *(a72);
+ a73 = (4 * i9);
+ a74 = (syms + a73);
+ a75 = *(a74);
+ a76 = _mm_set1_epi8(a75);
+ a77 = ((__m128i*)Branchtab);
+ a78 = *(a77);
+ a79 = _mm_xor_si128(a76, a78);
+ b6 = (a73 + syms);
+ a80 = (b6 + 1);
+ a81 = *(a80);
+ a82 = _mm_set1_epi8(a81);
+ a83 = (a77 + 2);
+ a84 = *(a83);
+ a85 = _mm_xor_si128(a82, a84);
+ t13 = _mm_avg_epu8(a79, a85);
+ a86 = ((__m128i)t13);
+ a87 = _mm_srli_epi16(a86, 2);
+ a88 = ((__m128i)a87);
+ t14 = _mm_and_si128(
+ a88,
+ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
+ t15 = _mm_subs_epu8(
+ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
+ t14);
+ m23 = _mm_adds_epu8(s18, t14);
+ m24 = _mm_adds_epu8(s19, t15);
+ m25 = _mm_adds_epu8(s18, t15);
+ m26 = _mm_adds_epu8(s19, t14);
+ a89 = _mm_min_epu8(m24, m23);
+ d9 = _mm_cmpeq_epi8(a89, m24);
+ a90 = _mm_min_epu8(m26, m25);
+ d10 = _mm_cmpeq_epi8(a90, m26);
+ s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
+ a91 = ((short int*)dec);
+ a92 = (8 * i9);
+ a93 = (a91 + a92);
+ *(a93) = s20;
+ s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
+ a94 = (a93 + 1);
+ *(a94) = s21;
+ s22 = _mm_unpacklo_epi8(a89, a90);
+ s23 = _mm_unpackhi_epi8(a89, a90);
+ a95 = ((__m128i*)Y);
+ *(a95) = s22;
+ a96 = (a95 + 1);
+ *(a96) = s23;
+ a97 = (a71 + 1);
+ s24 = *(a97);
+ a98 = (a71 + 3);
+ s25 = *(a98);
+ a99 = (a77 + 1);
+ a100 = *(a99);
+ a101 = _mm_xor_si128(a76, a100);
+ a102 = (a77 + 3);
+ a103 = *(a102);
+ a104 = _mm_xor_si128(a82, a103);
+ t16 = _mm_avg_epu8(a101, a104);
+ a105 = ((__m128i)t16);
+ a106 = _mm_srli_epi16(a105, 2);
+ a107 = ((__m128i)a106);
+ t17 = _mm_and_si128(
+ a107,
+ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
+ t18 = _mm_subs_epu8(
+ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
+ t17);
+ m27 = _mm_adds_epu8(s24, t17);
+ m28 = _mm_adds_epu8(s25, t18);
+ m29 = _mm_adds_epu8(s24, t18);
+ m30 = _mm_adds_epu8(s25, t17);
+ a108 = _mm_min_epu8(m28, m27);
+ d11 = _mm_cmpeq_epi8(a108, m28);
+ a109 = _mm_min_epu8(m30, m29);
+ d12 = _mm_cmpeq_epi8(a109, m30);
+ s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
+ a110 = (a93 + 2);
+ *(a110) = s26;
+ s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
+ a111 = (a93 + 3);
+ *(a111) = s27;
+ s28 = _mm_unpacklo_epi8(a108, a109);
+ s29 = _mm_unpackhi_epi8(a108, a109);
+ a112 = (a95 + 2);
+ *(a112) = s28;
+ a113 = (a95 + 3);
+ *(a113) = s29;
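+        /* Renormalize: when state 0's metric exceeds 210, subtract the minimum of
+           all 64 Y metrics so the 8-bit path metrics do not saturate. */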
+ if ((((unsigned char*)Y)[0] > 210)) {
+ __m128i m5, m6;
+ m5 = ((__m128i*)Y)[0];
+ m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
+ m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
+ m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
+ __m128i m7;
+ m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
+ m7 =
+ ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7)));
+ m7 =
+ ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7)));
+ m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7)));
+ m7 = _mm_unpacklo_epi8(m7, m7);
+ m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
+ m6 = _mm_unpacklo_epi64(m7, m7);
+ ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
+ ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
+ ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
+ ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);
+ }
+ unsigned char a188, a194;
+ int a186, a205;
+ short int s48, s49, s54, s55;
+ unsigned char *a187, *a193, *b15;
+ short int *a204, *a206, *a207, *a223, *a224, *b16;
+ __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215,
+ *a225, *a226;
+ __m128i a199, a200, a218, a219;
+ __m128i a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216,
+ a217, a220, a221, a222, d17, d18, d19, d20, m39, m40, m41, m42, m43, m44, m45,
+ m46, s46, s47, s50, s51, s52, s53, s56, s57, t25, t26, t27, t28, t29, t30;
+ a184 = ((__m128i*)Y);
+ s46 = *(a184);
+ a185 = (a184 + 2);
+ s47 = *(a185);
+ a186 = (4 * i9);
+ b15 = (a186 + syms);
+ a187 = (b15 + 2);
+ a188 = *(a187);
+ a189 = _mm_set1_epi8(a188);
+ a190 = ((__m128i*)Branchtab);
+ a191 = *(a190);
+ a192 = _mm_xor_si128(a189, a191);
+ a193 = (b15 + 3);
+ a194 = *(a193);
+ a195 = _mm_set1_epi8(a194);
+ a196 = (a190 + 2);
+ a197 = *(a196);
+ a198 = _mm_xor_si128(a195, a197);
+ t25 = _mm_avg_epu8(a192, a198);
+ a199 = ((__m128i)t25);
+ a200 = _mm_srli_epi16(a199, 2);
+ a201 = ((__m128i)a200);
+ t26 = _mm_and_si128(
+ a201,
+ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
+ t27 = _mm_subs_epu8(
+ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
+ t26);
+ m39 = _mm_adds_epu8(s46, t26);
+ m40 = _mm_adds_epu8(s47, t27);
+ m41 = _mm_adds_epu8(s46, t27);
+ m42 = _mm_adds_epu8(s47, t26);
+ a202 = _mm_min_epu8(m40, m39);
+ d17 = _mm_cmpeq_epi8(a202, m40);
+ a203 = _mm_min_epu8(m42, m41);
+ d18 = _mm_cmpeq_epi8(a203, m42);
+ s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18));
+ a204 = ((short int*)dec);
+ a205 = (8 * i9);
+ b16 = (a204 + a205);
+ a206 = (b16 + 4);
+ *(a206) = s48;
+ s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18));
+ a207 = (b16 + 5);
+ *(a207) = s49;
+ s50 = _mm_unpacklo_epi8(a202, a203);
+ s51 = _mm_unpackhi_epi8(a202, a203);
+ a208 = ((__m128i*)X);
+ *(a208) = s50;
+ a209 = (a208 + 1);
+ *(a209) = s51;
+ a210 = (a184 + 1);
+ s52 = *(a210);
+ a211 = (a184 + 3);
+ s53 = *(a211);
+ a212 = (a190 + 1);
+ a213 = *(a212);
+ a214 = _mm_xor_si128(a189, a213);
+ a215 = (a190 + 3);
+ a216 = *(a215);
+ a217 = _mm_xor_si128(a195, a216);
+ t28 = _mm_avg_epu8(a214, a217);
+ a218 = ((__m128i)t28);
+ a219 = _mm_srli_epi16(a218, 2);
+ a220 = ((__m128i)a219);
+ t29 = _mm_and_si128(
+ a220,
+ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
+ t30 = _mm_subs_epu8(
+ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
+ t29);
+ m43 = _mm_adds_epu8(s52, t29);
+ m44 = _mm_adds_epu8(s53, t30);
+ m45 = _mm_adds_epu8(s52, t30);
+ m46 = _mm_adds_epu8(s53, t29);
+ a221 = _mm_min_epu8(m44, m43);
+ d19 = _mm_cmpeq_epi8(a221, m44);
+ a222 = _mm_min_epu8(m46, m45);
+ d20 = _mm_cmpeq_epi8(a222, m46);
+ s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20));
+ a223 = (b16 + 6);
+ *(a223) = s54;
+ s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19, d20));
+ a224 = (b16 + 7);
+ *(a224) = s55;
+ s56 = _mm_unpacklo_epi8(a221, a222);
+ s57 = _mm_unpackhi_epi8(a221, a222);
+ a225 = (a208 + 2);
+ *(a225) = s56;
+ a226 = (a208 + 3);
+ *(a226) = s57;
+ if ((((unsigned char*)X)[0] > 210)) {
+ __m128i m12, m13;
+ m12 = ((__m128i*)X)[0];
+ m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]);
+ m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]);
+ m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]);
+ __m128i m14;
+ m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
+ m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)),
+ ((__m128i)m14)));
+ m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)),
+ ((__m128i)m14)));
+ m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)),
+ ((__m128i)m14)));
+ m14 = _mm_unpacklo_epi8(m14, m14);
+ m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
+ m13 = _mm_unpacklo_epi64(m14, m14);
+ ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13);
+ ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13);
+ ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13);
+ ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13);
+ }
}
- }
-
- renormalize(X, 210);
- /*int ch;
- for(ch = 0; ch < 64; ch++) {
- printf("%d,", X[ch]);
- }
- printf("\n");*/
-
- unsigned int j;
- for(j=0; j < (framebits + excess) % 2; ++j) {
- int i;
- for(i=0;i<64/2;i++){
- BFLY(i, (((framebits+excess) >> 1) << 1) + j , syms, Y, X, (decision_t *)dec, Branchtab);
- }
+ renormalize(X, 210);
-
- renormalize(Y, 210);
-
- /*printf("\n");
+ /*int ch;
for(ch = 0; ch < 64; ch++) {
- printf("%d,", Y[ch]);
+ printf("%d,", X[ch]);
}
printf("\n");*/
- }
- /*skip*/
+ unsigned int j;
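+    /* If (framebits + excess) is odd, one trellis stage remains; process it with
+       the scalar butterfly macro. */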
+ for (j = 0; j < (framebits + excess) % 2; ++j) {
+ int i;
+ for (i = 0; i < 64 / 2; i++) {
+ BFLY(i,
+ (((framebits + excess) >> 1) << 1) + j,
+ syms,
+ Y,
+ X,
+ (decision_t*)dec,
+ Branchtab);
+ }
+
+
+ renormalize(Y, 210);
+
+ /*printf("\n");
+ for(ch = 0; ch < 64; ch++) {
+ printf("%d,", Y[ch]);
+ }
+ printf("\n");*/
+ }
+ /*skip*/
}
#endif /*LV_HAVE_SSE3*/
#if LV_HAVE_GENERIC
-static inline void
-volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y, unsigned char* X,
- unsigned char* syms, unsigned char* dec,
- unsigned int framebits, unsigned int excess,
- unsigned char* Branchtab)
+static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y,
+ unsigned char* X,
+ unsigned char* syms,
+ unsigned char* dec,
+ unsigned int framebits,
+ unsigned int excess,
+ unsigned char* Branchtab)
{
- int nbits = framebits + excess;
- int NUMSTATES = 64;
- int RENORMALIZE_THRESHOLD = 210;
-
- int s,i;
- for (s=0;s<nbits;s++){
- void *tmp;
- for(i=0;i<NUMSTATES/2;i++){
- BFLY(i, s, syms, Y, X, (decision_t *)dec, Branchtab);
+ int nbits = framebits + excess;
+ int NUMSTATES = 64;
+ int RENORMALIZE_THRESHOLD = 210;
+
+ int s, i;
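+    /* One trellis stage per pass: run all 32 butterflies, renormalize, then swap
+       the old and new metric buffers. */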
+ for (s = 0; s < nbits; s++) {
+ void* tmp;
+ for (i = 0; i < NUMSTATES / 2; i++) {
+ BFLY(i, s, syms, Y, X, (decision_t*)dec, Branchtab);
+ }
+
+ renormalize(Y, RENORMALIZE_THRESHOLD);
+
+ /// Swap pointers to old and new metrics
+ tmp = (void*)X;
+ X = Y;
+ Y = (unsigned char*)tmp;
}
-
- renormalize(Y, RENORMALIZE_THRESHOLD);
-
- /// Swap pointers to old and new metrics
- tmp = (void *)X;
- X = Y;
- Y = (unsigned char*)tmp;
- }
}
#endif /* LV_HAVE_GENERIC */
// for puppets we need to get all the func_variants for the puppet and just
// keep track of the actual function name to write to results
-#define VOLK_INIT_PUPP(func, puppet_master_func, test_params)\
- volk_test_case_t(func##_get_func_desc(), (void(*)())func##_manual, std::string(#func),\
- std::string(#puppet_master_func), test_params)
+#define VOLK_INIT_PUPP(func, puppet_master_func, test_params) \
+ volk_test_case_t(func##_get_func_desc(), \
+ (void (*)())func##_manual, \
+ std::string(#func), \
+ std::string(#puppet_master_func), \
+ test_params)
-#define VOLK_INIT_TEST(func, test_params)\
- volk_test_case_t(func##_get_func_desc(), (void(*)())func##_manual, std::string(#func),\
- test_params)
+#define VOLK_INIT_TEST(func, test_params) \
+ volk_test_case_t(func##_get_func_desc(), \
+ (void (*)())func##_manual, \
+ std::string(#func), \
+ test_params)
#define QA(test) test_cases.push_back(test);
std::vector<volk_test_case_t> init_test_list(volk_test_params_t test_params)
test_params_rotator.set_tol(1e-3);
std::vector<volk_test_case_t> test_cases;
- QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params))
+ QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params))
QA(VOLK_INIT_PUPP(volk_16u_byteswappuppet_16u, volk_16u_byteswap, test_params))
QA(VOLK_INIT_PUPP(volk_32u_byteswappuppet_32u, volk_32u_byteswap, test_params))
- QA(VOLK_INIT_PUPP(volk_32u_popcntpuppet_32u, volk_32u_popcnt_32u, test_params))
+ QA(VOLK_INIT_PUPP(volk_32u_popcntpuppet_32u, volk_32u_popcnt_32u, test_params))
QA(VOLK_INIT_PUPP(volk_64u_byteswappuppet_64u, volk_64u_byteswap, test_params))
- QA(VOLK_INIT_PUPP(volk_32fc_s32fc_rotatorpuppet_32fc, volk_32fc_s32fc_x2_rotator_32fc, test_params_rotator))
- QA(VOLK_INIT_PUPP(volk_8u_conv_k7_r2puppet_8u, volk_8u_x4_conv_k7_r2_8u, test_params.make_tol(0)))
- QA(VOLK_INIT_PUPP(volk_32f_x2_fm_detectpuppet_32f, volk_32f_s32f_32f_fm_detect_32f, test_params))
- QA(VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_real_32f, test_params))
- QA(VOLK_INIT_TEST(volk_16ic_deinterleave_real_8i, test_params))
- QA(VOLK_INIT_TEST(volk_16ic_deinterleave_16i_x2, test_params))
- QA(VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_32f_x2, test_params))
- QA(VOLK_INIT_TEST(volk_16ic_deinterleave_real_16i, test_params))
- QA(VOLK_INIT_TEST(volk_16ic_magnitude_16i, test_params))
- QA(VOLK_INIT_TEST(volk_16ic_s32f_magnitude_32f, test_params))
- QA(VOLK_INIT_TEST(volk_16ic_convert_32fc, test_params))
- QA(VOLK_INIT_TEST(volk_16ic_x2_multiply_16ic, test_params))
- QA(VOLK_INIT_TEST(volk_16ic_x2_dot_prod_16ic, test_params))
- QA(VOLK_INIT_TEST(volk_16i_s32f_convert_32f, test_params))
- QA(VOLK_INIT_TEST(volk_16i_convert_8i, test_params))
- QA(VOLK_INIT_TEST(volk_16i_32fc_dot_prod_32fc, test_params_inacc))
- QA(VOLK_INIT_TEST(volk_32f_accumulator_s32f, test_params_inacc))
- QA(VOLK_INIT_TEST(volk_32f_x2_add_32f, test_params))
- QA(VOLK_INIT_TEST(volk_32f_index_max_16u, test_params))
- QA(VOLK_INIT_TEST(volk_32f_index_max_32u, test_params))
- QA(VOLK_INIT_TEST(volk_32fc_32f_multiply_32fc, test_params))
- QA(VOLK_INIT_TEST(volk_32fc_32f_add_32fc, test_params))
- QA(VOLK_INIT_TEST(volk_32f_log2_32f, test_params.make_absolute(1e-5)))
- QA(VOLK_INIT_TEST(volk_32f_expfast_32f, test_params_inacc_tenth))
- QA(VOLK_INIT_TEST(volk_32f_x2_pow_32f, test_params_inacc))
- QA(VOLK_INIT_TEST(volk_32f_sin_32f, test_params_inacc))
- QA(VOLK_INIT_TEST(volk_32f_cos_32f, test_params_inacc))
- QA(VOLK_INIT_TEST(volk_32f_tan_32f, test_params_inacc))
- QA(VOLK_INIT_TEST(volk_32f_atan_32f, test_params_inacc))
- QA(VOLK_INIT_TEST(volk_32f_asin_32f, test_params_inacc))
- QA(VOLK_INIT_TEST(volk_32f_acos_32f, test_params_inacc))
- QA(VOLK_INIT_TEST(volk_32fc_s32f_power_32fc, test_params_power))
- QA(VOLK_INIT_TEST(volk_32f_s32f_calc_spectral_noise_floor_32f, test_params_inacc))
- QA(VOLK_INIT_TEST(volk_32fc_s32f_atan2_32f, test_params))
- QA(VOLK_INIT_TEST(volk_32fc_x2_conjugate_dot_prod_32fc, test_params_inacc_tenth))
- QA(VOLK_INIT_TEST(volk_32fc_deinterleave_32f_x2, test_params))
- QA(VOLK_INIT_TEST(volk_32fc_deinterleave_64f_x2, test_params))
- QA(VOLK_INIT_TEST(volk_32fc_s32f_deinterleave_real_16i, test_params))
- QA(VOLK_INIT_TEST(volk_32fc_deinterleave_imag_32f, test_params))
- QA(VOLK_INIT_TEST(volk_32fc_deinterleave_real_32f, test_params))
- QA(VOLK_INIT_TEST(volk_32fc_deinterleave_real_64f, test_params))
- QA(VOLK_INIT_TEST(volk_32fc_x2_dot_prod_32fc, test_params_inacc))
- QA(VOLK_INIT_TEST(volk_32fc_32f_dot_prod_32fc, test_params_inacc))
- QA(VOLK_INIT_TEST(volk_32fc_index_max_16u, test_params))
- QA(VOLK_INIT_TEST(volk_32fc_index_max_32u, test_params))
- QA(VOLK_INIT_TEST(volk_32fc_s32f_magnitude_16i, test_params))
- QA(VOLK_INIT_TEST(volk_32fc_magnitude_32f, test_params_inacc_tenth))
- QA(VOLK_INIT_TEST(volk_32fc_magnitude_squared_32f, test_params))
- QA(VOLK_INIT_TEST(volk_32fc_x2_add_32fc, test_params))
- QA(VOLK_INIT_TEST(volk_32fc_x2_multiply_32fc, test_params))
- QA(VOLK_INIT_TEST(volk_32fc_x2_multiply_conjugate_32fc, test_params))
- QA(VOLK_INIT_TEST(volk_32fc_x2_divide_32fc, test_params))
- QA(VOLK_INIT_TEST(volk_32fc_conjugate_32fc, test_params))
- QA(VOLK_INIT_TEST(volk_32f_s32f_convert_16i, test_params))
- QA(VOLK_INIT_TEST(volk_32f_s32f_convert_32i, test_params))
- QA(VOLK_INIT_TEST(volk_32f_convert_64f, test_params))
- QA(VOLK_INIT_TEST(volk_32f_s32f_convert_8i, test_params))
- QA(VOLK_INIT_TEST(volk_32fc_convert_16ic, test_params))
- QA(VOLK_INIT_TEST(volk_32fc_s32f_power_spectrum_32f, test_params))
- QA(VOLK_INIT_TEST(volk_32fc_x2_square_dist_32f, test_params))
- QA(VOLK_INIT_TEST(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, test_params))
- QA(VOLK_INIT_TEST(volk_32f_x2_divide_32f, test_params))
- QA(VOLK_INIT_TEST(volk_32f_x2_dot_prod_32f, test_params_inacc))
- QA(VOLK_INIT_TEST(volk_32f_x2_s32f_interleave_16ic, test_params))
- QA(VOLK_INIT_TEST(volk_32f_x2_interleave_32fc, test_params))
- QA(VOLK_INIT_TEST(volk_32f_x2_max_32f, test_params))
- QA(VOLK_INIT_TEST(volk_32f_x2_min_32f, test_params))
- QA(VOLK_INIT_TEST(volk_32f_x2_multiply_32f, test_params))
- QA(VOLK_INIT_TEST(volk_32f_64f_multiply_64f, test_params))
- QA(VOLK_INIT_TEST(volk_32f_64f_add_64f, test_params))
- QA(VOLK_INIT_TEST(volk_32f_s32f_normalize, test_params))
- QA(VOLK_INIT_TEST(volk_32f_s32f_power_32f, test_params))
- QA(VOLK_INIT_TEST(volk_32f_sqrt_32f, test_params_inacc))
- QA(VOLK_INIT_TEST(volk_32f_s32f_stddev_32f, test_params_inacc))
- QA(VOLK_INIT_TEST(volk_32f_stddev_and_mean_32f_x2, test_params_inacc))
- QA(VOLK_INIT_TEST(volk_32f_x2_subtract_32f, test_params))
- QA(VOLK_INIT_TEST(volk_32f_x3_sum_of_poly_32f, test_params_inacc))
- QA(VOLK_INIT_TEST(volk_32i_x2_and_32i, test_params))
- QA(VOLK_INIT_TEST(volk_32i_s32f_convert_32f, test_params))
- QA(VOLK_INIT_TEST(volk_32i_x2_or_32i, test_params))
- QA(VOLK_INIT_TEST(volk_32f_x2_dot_prod_16i, test_params))
- QA(VOLK_INIT_TEST(volk_64f_convert_32f, test_params))
- QA(VOLK_INIT_TEST(volk_64f_x2_max_64f, test_params))
- QA(VOLK_INIT_TEST(volk_64f_x2_min_64f, test_params))
- QA(VOLK_INIT_TEST(volk_64f_x2_multiply_64f, test_params))
- QA(VOLK_INIT_TEST(volk_64f_x2_add_64f, test_params))
- QA(VOLK_INIT_TEST(volk_8ic_deinterleave_16i_x2, test_params))
- QA(VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_32f_x2, test_params))
- QA(VOLK_INIT_TEST(volk_8ic_deinterleave_real_16i, test_params))
- QA(VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_real_32f, test_params))
- QA(VOLK_INIT_TEST(volk_8ic_deinterleave_real_8i, test_params))
- QA(VOLK_INIT_TEST(volk_8ic_x2_multiply_conjugate_16ic, test_params))
- QA(VOLK_INIT_TEST(volk_8ic_x2_s32f_multiply_conjugate_32fc, test_params))
- QA(VOLK_INIT_TEST(volk_8i_convert_16i, test_params))
- QA(VOLK_INIT_TEST(volk_8i_s32f_convert_32f, test_params))
- QA(VOLK_INIT_TEST(volk_32fc_s32fc_multiply_32fc, test_params))
- QA(VOLK_INIT_TEST(volk_32f_s32f_multiply_32f, test_params))
- QA(VOLK_INIT_TEST(volk_32f_binary_slicer_32i, test_params))
- QA(VOLK_INIT_TEST(volk_32f_binary_slicer_8i, test_params))
- QA(VOLK_INIT_TEST(volk_32u_reverse_32u, test_params))
- QA(VOLK_INIT_TEST(volk_32f_tanh_32f, test_params_inacc))
- QA(VOLK_INIT_TEST(volk_32f_s32f_mod_rangepuppet_32f, test_params))
+ QA(VOLK_INIT_PUPP(volk_32fc_s32fc_rotatorpuppet_32fc,
+ volk_32fc_s32fc_x2_rotator_32fc,
+ test_params_rotator))
+ QA(VOLK_INIT_PUPP(
+ volk_8u_conv_k7_r2puppet_8u, volk_8u_x4_conv_k7_r2_8u, test_params.make_tol(0)))
+ QA(VOLK_INIT_PUPP(
+ volk_32f_x2_fm_detectpuppet_32f, volk_32f_s32f_32f_fm_detect_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_real_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_16ic_deinterleave_real_8i, test_params))
+ QA(VOLK_INIT_TEST(volk_16ic_deinterleave_16i_x2, test_params))
+ QA(VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_32f_x2, test_params))
+ QA(VOLK_INIT_TEST(volk_16ic_deinterleave_real_16i, test_params))
+ QA(VOLK_INIT_TEST(volk_16ic_magnitude_16i, test_params))
+ QA(VOLK_INIT_TEST(volk_16ic_s32f_magnitude_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_16ic_convert_32fc, test_params))
+ QA(VOLK_INIT_TEST(volk_16ic_x2_multiply_16ic, test_params))
+ QA(VOLK_INIT_TEST(volk_16ic_x2_dot_prod_16ic, test_params))
+ QA(VOLK_INIT_TEST(volk_16i_s32f_convert_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_16i_convert_8i, test_params))
+ QA(VOLK_INIT_TEST(volk_16i_32fc_dot_prod_32fc, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32f_accumulator_s32f, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32f_x2_add_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_index_max_16u, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_index_max_32u, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_32f_multiply_32fc, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_32f_add_32fc, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_log2_32f, test_params.make_absolute(1e-5)))
+ QA(VOLK_INIT_TEST(volk_32f_expfast_32f, test_params_inacc_tenth))
+ QA(VOLK_INIT_TEST(volk_32f_x2_pow_32f, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32f_sin_32f, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32f_cos_32f, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32f_tan_32f, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32f_atan_32f, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32f_asin_32f, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32f_acos_32f, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32fc_s32f_power_32fc, test_params_power))
+ QA(VOLK_INIT_TEST(volk_32f_s32f_calc_spectral_noise_floor_32f, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32fc_s32f_atan2_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_x2_conjugate_dot_prod_32fc, test_params_inacc_tenth))
+ QA(VOLK_INIT_TEST(volk_32fc_deinterleave_32f_x2, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_deinterleave_64f_x2, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_s32f_deinterleave_real_16i, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_deinterleave_imag_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_deinterleave_real_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_deinterleave_real_64f, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_x2_dot_prod_32fc, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32fc_32f_dot_prod_32fc, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32fc_index_max_16u, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_index_max_32u, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_s32f_magnitude_16i, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_magnitude_32f, test_params_inacc_tenth))
+ QA(VOLK_INIT_TEST(volk_32fc_magnitude_squared_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_x2_add_32fc, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_x2_multiply_32fc, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_x2_multiply_conjugate_32fc, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_x2_divide_32fc, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_conjugate_32fc, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_s32f_convert_16i, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_s32f_convert_32i, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_convert_64f, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_s32f_convert_8i, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_convert_16ic, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_s32f_power_spectrum_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_x2_square_dist_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_x2_divide_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_x2_dot_prod_32f, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32f_x2_s32f_interleave_16ic, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_x2_interleave_32fc, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_x2_max_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_x2_min_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_x2_multiply_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_64f_multiply_64f, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_64f_add_64f, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_s32f_normalize, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_s32f_power_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_sqrt_32f, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32f_s32f_stddev_32f, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32f_stddev_and_mean_32f_x2, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32f_x2_subtract_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_x3_sum_of_poly_32f, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32i_x2_and_32i, test_params))
+ QA(VOLK_INIT_TEST(volk_32i_s32f_convert_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32i_x2_or_32i, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_x2_dot_prod_16i, test_params))
+ QA(VOLK_INIT_TEST(volk_64f_convert_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_64f_x2_max_64f, test_params))
+ QA(VOLK_INIT_TEST(volk_64f_x2_min_64f, test_params))
+ QA(VOLK_INIT_TEST(volk_64f_x2_multiply_64f, test_params))
+ QA(VOLK_INIT_TEST(volk_64f_x2_add_64f, test_params))
+ QA(VOLK_INIT_TEST(volk_8ic_deinterleave_16i_x2, test_params))
+ QA(VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_32f_x2, test_params))
+ QA(VOLK_INIT_TEST(volk_8ic_deinterleave_real_16i, test_params))
+ QA(VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_real_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_8ic_deinterleave_real_8i, test_params))
+ QA(VOLK_INIT_TEST(volk_8ic_x2_multiply_conjugate_16ic, test_params))
+ QA(VOLK_INIT_TEST(volk_8ic_x2_s32f_multiply_conjugate_32fc, test_params))
+ QA(VOLK_INIT_TEST(volk_8i_convert_16i, test_params))
+ QA(VOLK_INIT_TEST(volk_8i_s32f_convert_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_s32fc_multiply_32fc, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_s32f_multiply_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_binary_slicer_32i, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_binary_slicer_8i, test_params))
+ QA(VOLK_INIT_TEST(volk_32u_reverse_32u, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_tanh_32f, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32f_s32f_mod_rangepuppet_32f, test_params))
QA(VOLK_INIT_TEST(volk_32fc_x2_s32fc_multiply_conjugate_add_32fc, test_params))
- QA(VOLK_INIT_PUPP(volk_8u_x3_encodepolarpuppet_8u, volk_8u_x3_encodepolar_8u_x2, test_params))
- QA(VOLK_INIT_PUPP(volk_32f_8u_polarbutterflypuppet_32f, volk_32f_8u_polarbutterfly_32f, test_params))
- QA(VOLK_INIT_TEST(volk_32f_exp_32f, test_params))
-
+ QA(VOLK_INIT_PUPP(
+ volk_8u_x3_encodepolarpuppet_8u, volk_8u_x3_encodepolar_8u_x2, test_params))
+ QA(VOLK_INIT_PUPP(volk_32f_8u_polarbutterflypuppet_32f,
+ volk_32f_8u_polarbutterfly_32f,
+                      test_params))
+    QA(VOLK_INIT_TEST(volk_32f_exp_32f, test_params))
// no one uses these, so don't test them
- //VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex);
- //VOLK_PROFILE(volk_16i_branch_4_state_8, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex);
- //VOLK_PROFILE(volk_16i_max_star_16i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
- //VOLK_PROFILE(volk_16i_max_star_horizontal_16i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
- //VOLK_PROFILE(volk_16i_permute_and_scalar_add, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
- //VOLK_PROFILE(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
+ // VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results,
+ // benchmark_mode, kernel_regex); VOLK_PROFILE(volk_16i_branch_4_state_8, 1e-4, 2046,
+ // 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_16i_max_star_16i,
+ // 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
+ // VOLK_PROFILE(volk_16i_max_star_horizontal_16i, 0, 0, 204602, 10000, &results,
+ // benchmark_mode, kernel_regex); VOLK_PROFILE(volk_16i_permute_and_scalar_add, 1e-4,
+ // 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
+ // VOLK_PROFILE(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 10000, &results,
+ // benchmark_mode, kernel_regex);
// we need a puppet for this one
//(VOLK_INIT_TEST(volk_32fc_s32f_x2_power_spectral_density_32f, test_params))
-#include <volk/volk.h>
#include "qa_utils.h"
+#include <volk/volk.h>
-#include <volk/volk.h> // for volk_func_desc_t
-#include <volk/volk_malloc.h> // for volk_free, volk_m...
+#include <volk/volk.h> // for volk_func_desc_t
+#include <volk/volk_malloc.h> // for volk_free, volk_m...
-#include <assert.h> // for assert
-#include <stdint.h> // for uint16_t, uint64_t
-#include <sys/time.h> // for CLOCKS_PER_SEC
-#include <sys/types.h> // for int16_t, int32_t
+#include <assert.h> // for assert
+#include <stdint.h> // for uint16_t, uint64_t
+#include <sys/time.h> // for CLOCKS_PER_SEC
+#include <sys/types.h> // for int16_t, int32_t
#include <chrono>
-#include <cmath> // for sqrt, fabs, abs
-#include <cstring> // for memcpy, memset
-#include <ctime> // for clock
-#include <fstream> // for operator<<, basic...
-#include <iostream> // for cout, cerr
-#include <limits> // for numeric_limits
-#include <map> // for map, map<>::mappe...
+#include <cmath> // for sqrt, fabs, abs
+#include <cstring> // for memcpy, memset
+#include <ctime> // for clock
+#include <fstream> // for operator<<, basic...
+#include <iostream> // for cout, cerr
+#include <limits> // for numeric_limits
+#include <map> // for map, map<>::mappe...
#include <random>
-#include <vector> // for vector, _Bit_refe...
+#include <vector> // for vector, _Bit_refe...
template <typename T>
-void random_floats(void *buf, unsigned int n, std::default_random_engine& rnd_engine)
+void random_floats(void* buf, unsigned int n, std::default_random_engine& rnd_engine)
{
- T *array = static_cast<T*>(buf);
+ T* array = static_cast<T*>(buf);
std::uniform_real_distribution<T> uniform_dist(T(-1), T(1));
- for(unsigned int i = 0; i < n; i++) {
+ for (unsigned int i = 0; i < n; i++) {
array[i] = uniform_dist(rnd_engine);
}
}
-void load_random_data(void *data, volk_type_t type, unsigned int n) {
+void load_random_data(void* data, volk_type_t type, unsigned int n)
+{
std::random_device rnd_device;
std::default_random_engine rnd_engine(rnd_device());
- if(type.is_complex) n *= 2;
- if(type.is_float) {
- if(type.size == 8) {
+ if (type.is_complex)
+ n *= 2;
+ if (type.is_float) {
+ if (type.size == 8) {
random_floats<double>(data, n, rnd_engine);
} else {
- random_floats<float> (data, n, rnd_engine);
+ random_floats<float>(data, n, rnd_engine);
}
} else {
- float int_max = float(uint64_t(2) << (type.size*8));
- if(type.is_signed) int_max /= 2.0;
+ float int_max = float(uint64_t(2) << (type.size * 8));
+ if (type.is_signed)
+ int_max /= 2.0;
std::uniform_real_distribution<float> uniform_dist(-int_max, int_max);
- for(unsigned int i=0; i<n; i++) {
+ for (unsigned int i = 0; i < n; i++) {
float scaled_rand = uniform_dist(rnd_engine);
- //man i really don't know how to do this in a more clever way, you have to cast down at some point
- switch(type.size) {
+ // man i really don't know how to do this in a more clever way, you have to
+ // cast down at some point
+ switch (type.size) {
case 8:
- if(type.is_signed) ((int64_t *)data)[i] = (int64_t) scaled_rand;
- else ((uint64_t *)data)[i] = (uint64_t) scaled_rand;
- break;
+ if (type.is_signed)
+ ((int64_t*)data)[i] = (int64_t)scaled_rand;
+ else
+ ((uint64_t*)data)[i] = (uint64_t)scaled_rand;
+ break;
case 4:
- if(type.is_signed) ((int32_t *)data)[i] = (int32_t) scaled_rand;
- else ((uint32_t *)data)[i] = (uint32_t) scaled_rand;
- break;
+ if (type.is_signed)
+ ((int32_t*)data)[i] = (int32_t)scaled_rand;
+ else
+ ((uint32_t*)data)[i] = (uint32_t)scaled_rand;
+ break;
case 2:
- if(type.is_signed) ((int16_t *)data)[i] = (int16_t)((int16_t) scaled_rand % 8);
- else ((uint16_t *)data)[i] = (uint16_t) ((int16_t) scaled_rand % 8);
- break;
+ if (type.is_signed)
+ ((int16_t*)data)[i] = (int16_t)((int16_t)scaled_rand % 8);
+ else
+ ((uint16_t*)data)[i] = (uint16_t)((int16_t)scaled_rand % 8);
+ break;
case 1:
- if(type.is_signed) ((int8_t *)data)[i] = (int8_t) scaled_rand;
- else ((uint8_t *)data)[i] = (uint8_t) scaled_rand;
- break;
+ if (type.is_signed)
+ ((int8_t*)data)[i] = (int8_t)scaled_rand;
+ else
+ ((uint8_t*)data)[i] = (uint8_t)scaled_rand;
+ break;
default:
- throw "load_random_data: no support for data size > 8 or < 1"; //no shenanigans here
+ throw "load_random_data: no support for data size > 8 or < 1"; // no
+ // shenanigans
+ // here
}
}
}
}
-static std::vector<std::string> get_arch_list(volk_func_desc_t desc) {
+static std::vector<std::string> get_arch_list(volk_func_desc_t desc)
+{
std::vector<std::string> archlist;
- for(size_t i = 0; i < desc.n_impls; i++) {
+ for (size_t i = 0; i < desc.n_impls; i++) {
archlist.push_back(std::string(desc.impl_names[i]));
}
    return archlist;
}
-volk_type_t volk_type_from_string(std::string name) {
+volk_type_t volk_type_from_string(std::string name)
+{
volk_type_t type;
type.is_float = false;
type.is_scalar = false;
type.size = 0;
type.str = name;
- if(name.size() < 2) {
+ if (name.size() < 2) {
throw std::string("name too short to be a datatype");
}
- //is it a scalar?
- if(name[0] == 's') {
+ // is it a scalar?
+ if (name[0] == 's') {
type.is_scalar = true;
- name = name.substr(1, name.size()-1);
+ name = name.substr(1, name.size() - 1);
}
- //get the data size
+ // get the data size
size_t last_size_pos = name.find_last_of("0123456789");
- if(last_size_pos == std::string::npos) {
+ if (last_size_pos == std::string::npos) {
throw std::string("no size spec in type ").append(name);
}
- //will throw if malformed
- int size = volk_lexical_cast<int>(name.substr(0, last_size_pos+1));
+ // will throw if malformed
+ int size = volk_lexical_cast<int>(name.substr(0, last_size_pos + 1));
assert(((size % 8) == 0) && (size <= 64) && (size != 0));
- type.size = size/8; //in bytes
+ type.size = size / 8; // in bytes
- for(size_t i=last_size_pos+1; i < name.size(); i++) {
+ for (size_t i = last_size_pos + 1; i < name.size(); i++) {
switch (name[i]) {
case 'f':
type.is_float = true;
return type;
}
-std::vector<std::string> split_signature(const std::string &protokernel_signature) {
+std::vector<std::string> split_signature(const std::string& protokernel_signature)
+{
std::vector<std::string> signature_tokens;
std::string token;
for (unsigned int loc = 0; loc < protokernel_signature.size(); ++loc) {
return signature_tokens;
}
-static void get_signatures_from_name(std::vector<volk_type_t> &inputsig,
- std::vector<volk_type_t> &outputsig,
- std::string name) {
+static void get_signatures_from_name(std::vector<volk_type_t>& inputsig,
+ std::vector<volk_type_t>& outputsig,
+ std::string name)
+{
std::vector<std::string> toked = split_signature(name);
assert(toked[0] == "volk");
toked.erase(toked.begin());
- //ok. we're assuming a string in the form
+ // ok. we're assuming a string in the form
//(sig)_(multiplier-opt)_..._(name)_(sig)_(multiplier-opt)_..._(alignment)
enum { SIDE_INPUT, SIDE_NAME, SIDE_OUTPUT } side = SIDE_INPUT;
std::string token = toked[token_index];
try {
type = volk_type_from_string(token);
- if(side == SIDE_NAME) side = SIDE_OUTPUT; //if this is the first one after the name...
-
- if(side == SIDE_INPUT) inputsig.push_back(type);
- else outputsig.push_back(type);
- } catch (...){
- if(token[0] == 'x' && (token.size() > 1) && (token[1] > '0' && token[1] < '9')) { //it's a multiplier
- if(side == SIDE_INPUT) assert(inputsig.size() > 0);
- else assert(outputsig.size() > 0);
- int multiplier = volk_lexical_cast<int>(token.substr(1, token.size()-1)); //will throw if invalid
- for(int i=1; i<multiplier; i++) {
- if(side == SIDE_INPUT) inputsig.push_back(inputsig.back());
- else outputsig.push_back(outputsig.back());
+ if (side == SIDE_NAME)
+ side = SIDE_OUTPUT; // if this is the first one after the name...
+
+ if (side == SIDE_INPUT)
+ inputsig.push_back(type);
+ else
+ outputsig.push_back(type);
+ } catch (...) {
+ if (token[0] == 'x' && (token.size() > 1) &&
+ (token[1] > '0' && token[1] < '9')) { // it's a multiplier
+ if (side == SIDE_INPUT)
+ assert(inputsig.size() > 0);
+ else
+ assert(outputsig.size() > 0);
+ int multiplier = volk_lexical_cast<int>(
+ token.substr(1, token.size() - 1)); // will throw if invalid
+ for (int i = 1; i < multiplier; i++) {
+ if (side == SIDE_INPUT)
+ inputsig.push_back(inputsig.back());
+ else
+ outputsig.push_back(outputsig.back());
}
- }
- else if(side == SIDE_INPUT) { //it's the function name, at least it better be
+ } else if (side ==
+ SIDE_INPUT) { // it's the function name, at least it better be
side = SIDE_NAME;
fn_name.append("_");
fn_name.append(token);
- }
- else if(side == SIDE_OUTPUT) {
- if(token != toked.back()) throw; //the last token in the name is the alignment
+ } else if (side == SIDE_OUTPUT) {
+ if (token != toked.back())
+ throw; // the last token in the name is the alignment
}
}
}
- //we don't need an output signature (some fn's operate on the input data, "in place"), but we do need at least one input!
+ // we don't need an output signature (some fn's operate on the input data, "in
+ // place"), but we do need at least one input!
assert(inputsig.size() != 0);
-
}
-inline void run_cast_test1(volk_fn_1arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
- while(iter--) func(buffs[0], vlen, arch.c_str());
+inline void run_cast_test1(volk_fn_1arg func,
+ std::vector<void*>& buffs,
+ unsigned int vlen,
+ unsigned int iter,
+ std::string arch)
+{
+ while (iter--)
+ func(buffs[0], vlen, arch.c_str());
}
-inline void run_cast_test2(volk_fn_2arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
- while(iter--) func(buffs[0], buffs[1], vlen, arch.c_str());
+inline void run_cast_test2(volk_fn_2arg func,
+ std::vector<void*>& buffs,
+ unsigned int vlen,
+ unsigned int iter,
+ std::string arch)
+{
+ while (iter--)
+ func(buffs[0], buffs[1], vlen, arch.c_str());
}
-inline void run_cast_test3(volk_fn_3arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
- while(iter--) func(buffs[0], buffs[1], buffs[2], vlen, arch.c_str());
+inline void run_cast_test3(volk_fn_3arg func,
+ std::vector<void*>& buffs,
+ unsigned int vlen,
+ unsigned int iter,
+ std::string arch)
+{
+ while (iter--)
+ func(buffs[0], buffs[1], buffs[2], vlen, arch.c_str());
}
-inline void run_cast_test4(volk_fn_4arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
- while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], vlen, arch.c_str());
+inline void run_cast_test4(volk_fn_4arg func,
+ std::vector<void*>& buffs,
+ unsigned int vlen,
+ unsigned int iter,
+ std::string arch)
+{
+ while (iter--)
+ func(buffs[0], buffs[1], buffs[2], buffs[3], vlen, arch.c_str());
}
-inline void run_cast_test1_s32f(volk_fn_1arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
- while(iter--) func(buffs[0], scalar, vlen, arch.c_str());
+inline void run_cast_test1_s32f(volk_fn_1arg_s32f func,
+ std::vector<void*>& buffs,
+ float scalar,
+ unsigned int vlen,
+ unsigned int iter,
+ std::string arch)
+{
+ while (iter--)
+ func(buffs[0], scalar, vlen, arch.c_str());
}
-inline void run_cast_test2_s32f(volk_fn_2arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
- while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
+inline void run_cast_test2_s32f(volk_fn_2arg_s32f func,
+ std::vector<void*>& buffs,
+ float scalar,
+ unsigned int vlen,
+ unsigned int iter,
+ std::string arch)
+{
+ while (iter--)
+ func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
}
-inline void run_cast_test3_s32f(volk_fn_3arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
- while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
+inline void run_cast_test3_s32f(volk_fn_3arg_s32f func,
+ std::vector<void*>& buffs,
+ float scalar,
+ unsigned int vlen,
+ unsigned int iter,
+ std::string arch)
+{
+ while (iter--)
+ func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
}
-inline void run_cast_test1_s32fc(volk_fn_1arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
- while(iter--) func(buffs[0], scalar, vlen, arch.c_str());
+inline void run_cast_test1_s32fc(volk_fn_1arg_s32fc func,
+ std::vector<void*>& buffs,
+ lv_32fc_t scalar,
+ unsigned int vlen,
+ unsigned int iter,
+ std::string arch)
+{
+ while (iter--)
+ func(buffs[0], scalar, vlen, arch.c_str());
}
-inline void run_cast_test2_s32fc(volk_fn_2arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
- while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
+inline void run_cast_test2_s32fc(volk_fn_2arg_s32fc func,
+ std::vector<void*>& buffs,
+ lv_32fc_t scalar,
+ unsigned int vlen,
+ unsigned int iter,
+ std::string arch)
+{
+ while (iter--)
+ func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
}
-inline void run_cast_test3_s32fc(volk_fn_3arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
- while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
+inline void run_cast_test3_s32fc(volk_fn_3arg_s32fc func,
+ std::vector<void*>& buffs,
+ lv_32fc_t scalar,
+ unsigned int vlen,
+ unsigned int iter,
+ std::string arch)
+{
+ while (iter--)
+ func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
}
template <class t>
-bool fcompare(t *in1, t *in2, unsigned int vlen, float tol, bool absolute_mode) {
+bool fcompare(t* in1, t* in2, unsigned int vlen, float tol, bool absolute_mode)
+{
bool fail = false;
int print_max_errs = 10;
- for(unsigned int i=0; i<vlen; i++) {
+ for (unsigned int i = 0; i < vlen; i++) {
if (absolute_mode) {
- if (fabs(((t *)(in1))[i] - ((t *)(in2))[i]) > tol) {
- fail=true;
- if(print_max_errs-- > 0) {
- std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]);
+ if (fabs(((t*)(in1))[i] - ((t*)(in2))[i]) > tol) {
+ fail = true;
+ if (print_max_errs-- > 0) {
+ std::cout << "offset " << i << " in1: " << t(((t*)(in1))[i])
+ << " in2: " << t(((t*)(in2))[i]);
std::cout << " tolerance was: " << tol << std::endl;
}
}
} else {
// for very small numbers we'll see round off errors due to limited
// precision. So a special test case...
- if(fabs(((t *)(in1))[i]) < 1e-30) {
- if( fabs( ((t *)(in2))[i] ) > tol )
- {
- fail=true;
- if(print_max_errs-- > 0) {
- std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]);
+ if (fabs(((t*)(in1))[i]) < 1e-30) {
+ if (fabs(((t*)(in2))[i]) > tol) {
+ fail = true;
+ if (print_max_errs-- > 0) {
+ std::cout << "offset " << i << " in1: " << t(((t*)(in1))[i])
+ << " in2: " << t(((t*)(in2))[i]);
std::cout << " tolerance was: " << tol << std::endl;
}
}
}
// the primary test is the percent different greater than given tol
- else if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/fabs(((t *)in1)[i]) > tol) {
- fail=true;
- if(print_max_errs-- > 0) {
- std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]);
+ else if (fabs(((t*)(in1))[i] - ((t*)(in2))[i]) / fabs(((t*)in1)[i]) > tol) {
+ fail = true;
+ if (print_max_errs-- > 0) {
+ std::cout << "offset " << i << " in1: " << t(((t*)(in1))[i])
+ << " in2: " << t(((t*)(in2))[i]);
std::cout << " tolerance was: " << tol << std::endl;
}
}
}
template <class t>
-bool ccompare(t *in1, t *in2, unsigned int vlen, float tol, bool absolute_mode) {
+bool ccompare(t* in1, t* in2, unsigned int vlen, float tol, bool absolute_mode)
+{
if (absolute_mode) {
- std::cout << "ccompare does not support absolute mode" << std::endl;
- return true;
+ std::cout << "ccompare does not support absolute mode" << std::endl;
+ return true;
}
bool fail = false;
int print_max_errs = 10;
- for(unsigned int i=0; i<2*vlen; i+=2) {
- if (std::isnan(in1[i]) || std::isnan(in1[i+1]) || std::isnan(in2[i]) || std::isnan(in2[i+1])
- || std::isinf(in1[i]) || std::isinf(in1[i+1]) || std::isinf(in2[i]) || std::isinf(in2[i+1])) {
- fail=true;
- if(print_max_errs-- > 0) {
- std::cout << "offset " << i/2 << " in1: " << in1[i] << " + " << in1[i+1] << "j in2: " << in2[i] << " + " << in2[i+1] << "j";
+ for (unsigned int i = 0; i < 2 * vlen; i += 2) {
+ if (std::isnan(in1[i]) || std::isnan(in1[i + 1]) || std::isnan(in2[i]) ||
+ std::isnan(in2[i + 1]) || std::isinf(in1[i]) || std::isinf(in1[i + 1]) ||
+ std::isinf(in2[i]) || std::isinf(in2[i + 1])) {
+ fail = true;
+ if (print_max_errs-- > 0) {
+ std::cout << "offset " << i / 2 << " in1: " << in1[i] << " + "
+ << in1[i + 1] << "j in2: " << in2[i] << " + " << in2[i + 1]
+ << "j";
std::cout << " tolerance was: " << tol << std::endl;
}
}
- t diff[2] = { in1[i] - in2[i], in1[i+1] - in2[i+1] };
- t err = std::sqrt(diff[0] * diff[0] + diff[1] * diff[1]);
- t norm = std::sqrt(in1[i] * in1[i] + in1[i+1] * in1[i+1]);
+ t diff[2] = { in1[i] - in2[i], in1[i + 1] - in2[i + 1] };
+ t err = std::sqrt(diff[0] * diff[0] + diff[1] * diff[1]);
+ t norm = std::sqrt(in1[i] * in1[i] + in1[i + 1] * in1[i + 1]);
// for very small numbers we'll see round off errors due to limited
// precision. So a special test case...
if (norm < 1e-30) {
- if (err > tol)
- {
- fail=true;
- if(print_max_errs-- > 0) {
- std::cout << "offset " << i/2 << " in1: " << in1[i] << " + " << in1[i+1] << "j in2: " << in2[i] << " + " << in2[i+1] << "j";
+ if (err > tol) {
+ fail = true;
+ if (print_max_errs-- > 0) {
+ std::cout << "offset " << i / 2 << " in1: " << in1[i] << " + "
+ << in1[i + 1] << "j in2: " << in2[i] << " + " << in2[i + 1]
+ << "j";
std::cout << " tolerance was: " << tol << std::endl;
}
}
}
// the primary test is the percent different greater than given tol
- else if((err / norm) > tol) {
- fail=true;
- if(print_max_errs-- > 0) {
- std::cout << "offset " << i/2 << " in1: " << in1[i] << " + " << in1[i+1] << "j in2: " << in2[i] << " + " << in2[i+1] << "j";
+ else if ((err / norm) > tol) {
+ fail = true;
+ if (print_max_errs-- > 0) {
+ std::cout << "offset " << i / 2 << " in1: " << in1[i] << " + "
+ << in1[i + 1] << "j in2: " << in2[i] << " + " << in2[i + 1]
+ << "j";
std::cout << " tolerance was: " << tol << std::endl;
}
}
}
template <class t>
-bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol, bool absolute_mode) {
+bool icompare(t* in1, t* in2, unsigned int vlen, unsigned int tol, bool absolute_mode)
+{
if (absolute_mode) {
- std::cout << "icompare does not support absolute mode" << std::endl;
- return true;
+ std::cout << "icompare does not support absolute mode" << std::endl;
+ return true;
}
bool fail = false;
int print_max_errs = 10;
- for(unsigned int i=0; i<vlen; i++) {
- if(((unsigned int)abs(int(((t *)(in1))[i]) - int(((t *)(in2))[i]))) > tol) {
- fail=true;
- if(print_max_errs-- > 0) {
- std::cout << "offset " << i << " in1: " << static_cast<int>(t(((t *)(in1))[i])) << " in2: " << static_cast<int>(t(((t *)(in2))[i]));
+ for (unsigned int i = 0; i < vlen; i++) {
+ if (((unsigned int)abs(int(((t*)(in1))[i]) - int(((t*)(in2))[i]))) > tol) {
+ fail = true;
+ if (print_max_errs-- > 0) {
+ std::cout << "offset " << i
+ << " in1: " << static_cast<int>(t(((t*)(in1))[i]))
+ << " in2: " << static_cast<int>(t(((t*)(in2))[i]));
std::cout << " tolerance was: " << tol << std::endl;
}
}
return fail;
}
-class volk_qa_aligned_mem_pool{
+class volk_qa_aligned_mem_pool
+{
public:
- void *get_new(size_t size){
+ void* get_new(size_t size)
+ {
size_t alignment = volk_get_alignment();
void* ptr = volk_malloc(size, alignment);
memset(ptr, 0x00, size);
_mems.push_back(ptr);
return ptr;
}
- ~volk_qa_aligned_mem_pool() {
- for(unsigned int ii = 0; ii < _mems.size(); ++ii) {
+ ~volk_qa_aligned_mem_pool()
+ {
+ for (unsigned int ii = 0; ii < _mems.size(); ++ii) {
volk_free(_mems[ii]);
}
}
-private: std::vector<void * > _mems;
+
+private:
+ std::vector<void*> _mems;
};
bool run_volk_tests(volk_func_desc_t desc,
void (*manual_func)(),
std::string name,
volk_test_params_t test_params,
- std::vector<volk_test_results_t> *results,
- std::string puppet_master_name
-)
+ std::vector<volk_test_results_t>* results,
+ std::string puppet_master_name)
{
- return run_volk_tests(desc, manual_func, name, test_params.tol(), test_params.scalar(),
- test_params.vlen(), test_params.iter(), results, puppet_master_name,
- test_params.absolute_mode(), test_params.benchmark_mode());
+ return run_volk_tests(desc,
+ manual_func,
+ name,
+ test_params.tol(),
+ test_params.scalar(),
+ test_params.vlen(),
+ test_params.iter(),
+ results,
+ puppet_master_name,
+ test_params.absolute_mode(),
+ test_params.benchmark_mode());
}
bool run_volk_tests(volk_func_desc_t desc,
lv_32fc_t scalar,
unsigned int vlen,
unsigned int iter,
- std::vector<volk_test_results_t> *results,
+ std::vector<volk_test_results_t>* results,
std::string puppet_master_name,
bool absolute_mode,
- bool benchmark_mode
-) {
+ bool benchmark_mode)
+{
// Initialize this entry in results vector
results->push_back(volk_test_results_t());
results->back().name = name;
results->back().vlen = vlen;
results->back().iter = iter;
- std::cout << "RUN_VOLK_TESTS: " << name << "(" << vlen << "," << iter << ")" << std::endl;
+ std::cout << "RUN_VOLK_TESTS: " << name << "(" << vlen << "," << iter << ")"
+ << std::endl;
// vlen_twiddle will increase vlen for malloc and data generation
// but kernels will still be called with the user provided vlen.
const float tol_f = tol;
const unsigned int tol_i = static_cast<const unsigned int>(tol);
- //first let's get a list of available architectures for the test
+ // first let's get a list of available architectures for the test
std::vector<std::string> arch_list = get_arch_list(desc);
- if((!benchmark_mode) && (arch_list.size() < 2)) {
+ if ((!benchmark_mode) && (arch_list.size() < 2)) {
std::cout << "no architectures to test" << std::endl;
return false;
}
- //something that can hang onto memory and cleanup when this function exits
+ // something that can hang onto memory and cleanup when this function exits
volk_qa_aligned_mem_pool mem_pool;
- //now we have to get a function signature by parsing the name
+ // now we have to get a function signature by parsing the name
std::vector<volk_type_t> inputsig, outputsig;
try {
get_signatures_from_name(inputsig, outputsig, name);
- }
- catch (std::exception &error) {
- std::cerr << "Error: unable to get function signature from kernel name" << std::endl;
+ } catch (std::exception& error) {
+ std::cerr << "Error: unable to get function signature from kernel name"
+ << std::endl;
std::cerr << " - " << name << std::endl;
return false;
}
- //pull the input scalars into their own vector
+ // pull the input scalars into their own vector
std::vector<volk_type_t> inputsc;
- for(size_t i=0; i<inputsig.size(); i++) {
- if(inputsig[i].is_scalar) {
+ for (size_t i = 0; i < inputsig.size(); i++) {
+ if (inputsig[i].is_scalar) {
inputsc.push_back(inputsig[i]);
inputsig.erase(inputsig.begin() + i);
i -= 1;
}
}
- std::vector<void *> inbuffs;
- for (unsigned int inputsig_index = 0; inputsig_index < inputsig.size(); ++ inputsig_index) {
+ std::vector<void*> inbuffs;
+ for (unsigned int inputsig_index = 0; inputsig_index < inputsig.size();
+ ++inputsig_index) {
volk_type_t sig = inputsig[inputsig_index];
- if(!sig.is_scalar) //we don't make buffers for scalars
- inbuffs.push_back(mem_pool.get_new(vlen*sig.size*(sig.is_complex ? 2 : 1)));
+ if (!sig.is_scalar) // we don't make buffers for scalars
+ inbuffs.push_back(
+ mem_pool.get_new(vlen * sig.size * (sig.is_complex ? 2 : 1)));
}
- for(size_t i=0; i<inbuffs.size(); i++) {
+ for (size_t i = 0; i < inbuffs.size(); i++) {
load_random_data(inbuffs[i], inputsig[i], vlen);
}
- //ok let's make a vector of vector of void buffers, which holds the input/output vectors for each arch
- std::vector<std::vector<void *> > test_data;
- for(size_t i=0; i<arch_list.size(); i++) {
- std::vector<void *> arch_buffs;
- for(size_t j=0; j<outputsig.size(); j++) {
- arch_buffs.push_back(mem_pool.get_new(vlen*outputsig[j].size*(outputsig[j].is_complex ? 2 : 1)));
+ // ok let's make a vector of vector of void buffers, which holds the input/output
+ // vectors for each arch
+ std::vector<std::vector<void*>> test_data;
+ for (size_t i = 0; i < arch_list.size(); i++) {
+ std::vector<void*> arch_buffs;
+ for (size_t j = 0; j < outputsig.size(); j++) {
+ arch_buffs.push_back(mem_pool.get_new(vlen * outputsig[j].size *
+ (outputsig[j].is_complex ? 2 : 1)));
}
- for(size_t j=0; j<inputsig.size(); j++) {
- void *arch_inbuff = mem_pool.get_new(vlen*inputsig[j].size*(inputsig[j].is_complex ? 2 : 1));
- memcpy(arch_inbuff, inbuffs[j], vlen * inputsig[j].size * (inputsig[j].is_complex ? 2 : 1));
+ for (size_t j = 0; j < inputsig.size(); j++) {
+ void* arch_inbuff = mem_pool.get_new(vlen * inputsig[j].size *
+ (inputsig[j].is_complex ? 2 : 1));
+ memcpy(arch_inbuff,
+ inbuffs[j],
+ vlen * inputsig[j].size * (inputsig[j].is_complex ? 2 : 1));
arch_buffs.push_back(arch_inbuff);
}
test_data.push_back(arch_buffs);
both_sigs.insert(both_sigs.end(), outputsig.begin(), outputsig.end());
both_sigs.insert(both_sigs.end(), inputsig.begin(), inputsig.end());
- //now run the test
+ // now run the test
vlen = vlen - vlen_twiddle;
std::chrono::time_point<std::chrono::system_clock> start, end;
std::vector<double> profile_times;
- for(size_t i = 0; i < arch_list.size(); i++) {
+ for (size_t i = 0; i < arch_list.size(); i++) {
start = std::chrono::system_clock::now();
- switch(both_sigs.size()) {
- case 1:
- if(inputsc.size() == 0) {
- run_cast_test1((volk_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
- } else if(inputsc.size() == 1 && inputsc[0].is_float) {
- if(inputsc[0].is_complex) {
- run_cast_test1_s32fc((volk_fn_1arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
- } else {
- run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
- }
- } else throw "unsupported 1 arg function >1 scalars";
- break;
- case 2:
- if(inputsc.size() == 0) {
- run_cast_test2((volk_fn_2arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
- } else if(inputsc.size() == 1 && inputsc[0].is_float) {
- if(inputsc[0].is_complex) {
- run_cast_test2_s32fc((volk_fn_2arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
- } else {
- run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
- }
- } else throw "unsupported 2 arg function >1 scalars";
- break;
- case 3:
- if(inputsc.size() == 0) {
- run_cast_test3((volk_fn_3arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
- } else if(inputsc.size() == 1 && inputsc[0].is_float) {
- if(inputsc[0].is_complex) {
- run_cast_test3_s32fc((volk_fn_3arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
- } else {
- run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
- }
- } else throw "unsupported 3 arg function >1 scalars";
- break;
- case 4:
- run_cast_test4((volk_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
- break;
- default:
- throw "no function handler for this signature";
- break;
+ switch (both_sigs.size()) {
+ case 1:
+ if (inputsc.size() == 0) {
+ run_cast_test1(
+ (volk_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+ } else if (inputsc.size() == 1 && inputsc[0].is_float) {
+ if (inputsc[0].is_complex) {
+ run_cast_test1_s32fc((volk_fn_1arg_s32fc)(manual_func),
+ test_data[i],
+ scalar,
+ vlen,
+ iter,
+ arch_list[i]);
+ } else {
+ run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func),
+ test_data[i],
+ scalar.real(),
+ vlen,
+ iter,
+ arch_list[i]);
+ }
+ } else
+ throw "unsupported 1 arg function >1 scalars";
+ break;
+ case 2:
+ if (inputsc.size() == 0) {
+ run_cast_test2(
+ (volk_fn_2arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+ } else if (inputsc.size() == 1 && inputsc[0].is_float) {
+ if (inputsc[0].is_complex) {
+ run_cast_test2_s32fc((volk_fn_2arg_s32fc)(manual_func),
+ test_data[i],
+ scalar,
+ vlen,
+ iter,
+ arch_list[i]);
+ } else {
+ run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func),
+ test_data[i],
+ scalar.real(),
+ vlen,
+ iter,
+ arch_list[i]);
+ }
+ } else
+ throw "unsupported 2 arg function >1 scalars";
+ break;
+ case 3:
+ if (inputsc.size() == 0) {
+ run_cast_test3(
+ (volk_fn_3arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+ } else if (inputsc.size() == 1 && inputsc[0].is_float) {
+ if (inputsc[0].is_complex) {
+ run_cast_test3_s32fc((volk_fn_3arg_s32fc)(manual_func),
+ test_data[i],
+ scalar,
+ vlen,
+ iter,
+ arch_list[i]);
+ } else {
+ run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func),
+ test_data[i],
+ scalar.real(),
+ vlen,
+ iter,
+ arch_list[i]);
+ }
+ } else
+ throw "unsupported 3 arg function >1 scalars";
+ break;
+ case 4:
+ run_cast_test4(
+ (volk_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+ break;
+ default:
+ throw "no function handler for this signature";
+ break;
}
end = std::chrono::system_clock::now();
profile_times.push_back(arch_time);
}
- //and now compare each output to the generic output
- //first we have to know which output is the generic one, they aren't in order...
- size_t generic_offset=0;
- for(size_t i=0; i<arch_list.size(); i++) {
+ // and now compare each output to the generic output
+ // first we have to know which output is the generic one, they aren't in order...
+ size_t generic_offset = 0;
+ for (size_t i = 0; i < arch_list.size(); i++) {
if (arch_list[i] == "generic") {
generic_offset = i;
}
bool fail;
bool fail_global = false;
std::vector<bool> arch_results;
- for(size_t i=0; i<arch_list.size(); i++) {
+ for (size_t i = 0; i < arch_list.size(); i++) {
fail = false;
- if(i != generic_offset) {
- for(size_t j=0; j<both_sigs.size(); j++) {
- if(both_sigs[j].is_float) {
- if(both_sigs[j].size == 8) {
+ if (i != generic_offset) {
+ for (size_t j = 0; j < both_sigs.size(); j++) {
+ if (both_sigs[j].is_float) {
+ if (both_sigs[j].size == 8) {
if (both_sigs[j].is_complex) {
- fail = ccompare((double *) test_data[generic_offset][j], (double *) test_data[i][j], vlen, tol_f, absolute_mode);
+ fail = ccompare((double*)test_data[generic_offset][j],
+ (double*)test_data[i][j],
+ vlen,
+ tol_f,
+ absolute_mode);
} else {
- fail = fcompare((double *) test_data[generic_offset][j], (double *) test_data[i][j], vlen, tol_f, absolute_mode);
+ fail = fcompare((double*)test_data[generic_offset][j],
+ (double*)test_data[i][j],
+ vlen,
+ tol_f,
+ absolute_mode);
}
} else {
if (both_sigs[j].is_complex) {
- fail = ccompare((float *) test_data[generic_offset][j], (float *) test_data[i][j], vlen, tol_f, absolute_mode);
+ fail = ccompare((float*)test_data[generic_offset][j],
+ (float*)test_data[i][j],
+ vlen,
+ tol_f,
+ absolute_mode);
} else {
- fail = fcompare((float *) test_data[generic_offset][j], (float *) test_data[i][j], vlen, tol_f, absolute_mode);
+ fail = fcompare((float*)test_data[generic_offset][j],
+ (float*)test_data[i][j],
+ vlen,
+ tol_f,
+ absolute_mode);
}
}
} else {
- //i could replace this whole switch statement with a memcmp if i wasn't interested in printing the outputs where they differ
- switch(both_sigs[j].size) {
+ // i could replace this whole switch statement with a memcmp if i
+ // wasn't interested in printing the outputs where they differ
+ switch (both_sigs[j].size) {
case 8:
- if(both_sigs[j].is_signed) {
- fail = icompare((int64_t *) test_data[generic_offset][j], (int64_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i, absolute_mode);
+ if (both_sigs[j].is_signed) {
+ fail = icompare((int64_t*)test_data[generic_offset][j],
+ (int64_t*)test_data[i][j],
+ vlen * (both_sigs[j].is_complex ? 2 : 1),
+ tol_i,
+ absolute_mode);
} else {
- fail = icompare((uint64_t *) test_data[generic_offset][j], (uint64_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i, absolute_mode);
+ fail = icompare((uint64_t*)test_data[generic_offset][j],
+ (uint64_t*)test_data[i][j],
+ vlen * (both_sigs[j].is_complex ? 2 : 1),
+ tol_i,
+ absolute_mode);
}
break;
case 4:
- if(both_sigs[j].is_complex) {
- if(both_sigs[j].is_signed) {
- fail = icompare((int16_t *) test_data[generic_offset][j], (int16_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i, absolute_mode);
+ if (both_sigs[j].is_complex) {
+ if (both_sigs[j].is_signed) {
+ fail = icompare((int16_t*)test_data[generic_offset][j],
+ (int16_t*)test_data[i][j],
+ vlen * (both_sigs[j].is_complex ? 2 : 1),
+ tol_i,
+ absolute_mode);
} else {
- fail = icompare((uint16_t *) test_data[generic_offset][j], (uint16_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i, absolute_mode);
+ fail = icompare((uint16_t*)test_data[generic_offset][j],
+ (uint16_t*)test_data[i][j],
+ vlen * (both_sigs[j].is_complex ? 2 : 1),
+ tol_i,
+ absolute_mode);
}
- }
- else {
+ } else {
if (both_sigs[j].is_signed) {
- fail = icompare((int32_t *) test_data[generic_offset][j], (int32_t *) test_data[i][j],
- vlen * (both_sigs[j].is_complex ? 2 : 1), tol_i, absolute_mode);
+ fail = icompare((int32_t*)test_data[generic_offset][j],
+ (int32_t*)test_data[i][j],
+ vlen * (both_sigs[j].is_complex ? 2 : 1),
+ tol_i,
+ absolute_mode);
} else {
- fail = icompare((uint32_t *) test_data[generic_offset][j], (uint32_t *) test_data[i][j],
- vlen * (both_sigs[j].is_complex ? 2 : 1), tol_i, absolute_mode);
+ fail = icompare((uint32_t*)test_data[generic_offset][j],
+ (uint32_t*)test_data[i][j],
+ vlen * (both_sigs[j].is_complex ? 2 : 1),
+ tol_i,
+ absolute_mode);
}
}
break;
case 2:
- if(both_sigs[j].is_signed) {
- fail = icompare((int16_t *) test_data[generic_offset][j], (int16_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i, absolute_mode);
+ if (both_sigs[j].is_signed) {
+ fail = icompare((int16_t*)test_data[generic_offset][j],
+ (int16_t*)test_data[i][j],
+ vlen * (both_sigs[j].is_complex ? 2 : 1),
+ tol_i,
+ absolute_mode);
} else {
- fail = icompare((uint16_t *) test_data[generic_offset][j], (uint16_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i, absolute_mode);
+ fail = icompare((uint16_t*)test_data[generic_offset][j],
+ (uint16_t*)test_data[i][j],
+ vlen * (both_sigs[j].is_complex ? 2 : 1),
+ tol_i,
+ absolute_mode);
}
break;
case 1:
- if(both_sigs[j].is_signed) {
- fail = icompare((int8_t *) test_data[generic_offset][j], (int8_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i, absolute_mode);
+ if (both_sigs[j].is_signed) {
+ fail = icompare((int8_t*)test_data[generic_offset][j],
+ (int8_t*)test_data[i][j],
+ vlen * (both_sigs[j].is_complex ? 2 : 1),
+ tol_i,
+ absolute_mode);
} else {
- fail = icompare((uint8_t *) test_data[generic_offset][j], (uint8_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i, absolute_mode);
+ fail = icompare((uint8_t*)test_data[generic_offset][j],
+ (uint8_t*)test_data[i][j],
+ vlen * (both_sigs[j].is_complex ? 2 : 1),
+ tol_i,
+ absolute_mode);
}
break;
default:
- fail=1;
+ fail = 1;
}
}
- if(fail) {
- volk_test_time_t *result = &results->back().results[arch_list[i]];
+ if (fail) {
+ volk_test_time_t* result = &results->back().results[arch_list[i]];
result->pass = false;
fail_global = true;
std::cout << name << ": fail on arch " << arch_list[i] << std::endl;
double best_time_u = std::numeric_limits<double>::max();
std::string best_arch_a = "generic";
std::string best_arch_u = "generic";
- for(size_t i=0; i < arch_list.size(); i++)
- {
- if((profile_times[i] < best_time_u) && arch_results[i] && desc.impl_alignment[i] == 0)
- {
+ for (size_t i = 0; i < arch_list.size(); i++) {
+ if ((profile_times[i] < best_time_u) && arch_results[i] &&
+ desc.impl_alignment[i] == 0) {
best_time_u = profile_times[i];
best_arch_u = arch_list[i];
}
- if((profile_times[i] < best_time_a) && arch_results[i])
- {
+ if ((profile_times[i] < best_time_a) && arch_results[i]) {
best_time_a = profile_times[i];
best_arch_a = arch_list[i];
}
std::cout << "Best aligned arch: " << best_arch_a << std::endl;
std::cout << "Best unaligned arch: " << best_arch_u << std::endl;
- if(puppet_master_name == "NULL") {
+ if (puppet_master_name == "NULL") {
results->back().config_name = name;
} else {
results->back().config_name = puppet_master_name;
#ifndef VOLK_QA_UTILS_H
#define VOLK_QA_UTILS_H
-#include <stdbool.h> // for bool, false
-#include <volk/volk.h> // for volk_func_desc_t
-#include <cstdlib> // for NULL
-#include <map> // for map
-#include <string> // for string, basic_string
-#include <vector> // for vector
+#include <stdbool.h> // for bool, false
+#include <volk/volk.h> // for volk_func_desc_t
+#include <cstdlib> // for NULL
+#include <map> // for map
+#include <string> // for string, basic_string
+#include <vector> // for vector
-#include "volk/volk_complex.h" // for lv_32fc_t
+#include "volk/volk_complex.h" // for lv_32fc_t
/************************************************
* VOLK QA type definitions *
std::string str;
};
-class volk_test_time_t {
- public:
- std::string name;
- double time;
- std::string units;
- bool pass;
+class volk_test_time_t
+{
+public:
+ std::string name;
+ double time;
+ std::string units;
+ bool pass;
};
-class volk_test_results_t {
- public:
- std::string name;
- std::string config_name;
- unsigned int vlen;
- unsigned int iter;
- std::map<std::string, volk_test_time_t> results;
- std::string best_arch_a;
- std::string best_arch_u;
+class volk_test_results_t
+{
+public:
+ std::string name;
+ std::string config_name;
+ unsigned int vlen;
+ unsigned int iter;
+ std::map<std::string, volk_test_time_t> results;
+ std::string best_arch_a;
+ std::string best_arch_u;
};
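+// Note (editorial): 'results' maps an architecture name to its timing/pass
+// record; print_qa_xml() in testqa.cc walks this map to emit one <testcase>
+// entry per architecture.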
-class volk_test_params_t {
- private:
- float _tol;
- lv_32fc_t _scalar;
- unsigned int _vlen;
- unsigned int _iter;
- bool _benchmark_mode;
- bool _absolute_mode;
- std::string _kernel_regex;
- public:
- // ctor
- volk_test_params_t(float tol, lv_32fc_t scalar, unsigned int vlen, unsigned int iter,
- bool benchmark_mode, std::string kernel_regex) :
- _tol(tol), _scalar(scalar), _vlen(vlen), _iter(iter),
- _benchmark_mode(benchmark_mode), _absolute_mode(false), _kernel_regex(kernel_regex) {};
- // setters
- void set_tol(float tol) {_tol=tol;};
- void set_scalar(lv_32fc_t scalar) {_scalar=scalar;};
- void set_vlen(unsigned int vlen) {_vlen=vlen;};
- void set_iter(unsigned int iter) {_iter=iter;};
- void set_benchmark(bool benchmark) {_benchmark_mode=benchmark;};
- void set_regex(std::string regex) {_kernel_regex=regex;};
- // getters
- float tol() {return _tol;};
- lv_32fc_t scalar() {return _scalar;};
- unsigned int vlen() {return _vlen;};
- unsigned int iter() {return _iter;};
- bool benchmark_mode() {return _benchmark_mode;};
- bool absolute_mode() {return _absolute_mode;};
- std::string kernel_regex() {return _kernel_regex;};
- volk_test_params_t make_absolute(float tol) {
- volk_test_params_t t(*this);
- t._tol = tol;
- t._absolute_mode = true;
- return t;
- }
- volk_test_params_t make_tol(float tol) {
- volk_test_params_t t(*this);
- t._tol = tol;
- return t;
- }
+class volk_test_params_t
+{
+private:
+ float _tol;
+ lv_32fc_t _scalar;
+ unsigned int _vlen;
+ unsigned int _iter;
+ bool _benchmark_mode;
+ bool _absolute_mode;
+ std::string _kernel_regex;
+
+public:
+ // ctor
+ volk_test_params_t(float tol,
+ lv_32fc_t scalar,
+ unsigned int vlen,
+ unsigned int iter,
+ bool benchmark_mode,
+ std::string kernel_regex)
+ : _tol(tol),
+ _scalar(scalar),
+ _vlen(vlen),
+ _iter(iter),
+ _benchmark_mode(benchmark_mode),
+ _absolute_mode(false),
+ _kernel_regex(kernel_regex){};
+ // setters
+ void set_tol(float tol) { _tol = tol; };
+ void set_scalar(lv_32fc_t scalar) { _scalar = scalar; };
+ void set_vlen(unsigned int vlen) { _vlen = vlen; };
+ void set_iter(unsigned int iter) { _iter = iter; };
+ void set_benchmark(bool benchmark) { _benchmark_mode = benchmark; };
+ void set_regex(std::string regex) { _kernel_regex = regex; };
+ // getters
+ float tol() { return _tol; };
+ lv_32fc_t scalar() { return _scalar; };
+ unsigned int vlen() { return _vlen; };
+ unsigned int iter() { return _iter; };
+ bool benchmark_mode() { return _benchmark_mode; };
+ bool absolute_mode() { return _absolute_mode; };
+ std::string kernel_regex() { return _kernel_regex; };
+ volk_test_params_t make_absolute(float tol)
+ {
+ volk_test_params_t t(*this);
+ t._tol = tol;
+ t._absolute_mode = true;
+ return t;
+ }
+ volk_test_params_t make_tol(float tol)
+ {
+ volk_test_params_t t(*this);
+ t._tol = tol;
+ return t;
+ }
};
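+// Usage sketch (editorial illustration): make_tol() and make_absolute() return
+// modified copies, so per-kernel parameter sets can be derived from a shared
+// default without mutating it, e.g.
+//   volk_test_params_t loose_params = test_params.make_tol(1e-2f);
+//   volk_test_params_t abs_params = test_params.make_absolute(1e-4f);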
-class volk_test_case_t {
- private:
- volk_func_desc_t _desc;
- void(*_kernel_ptr)();
- std::string _name;
- volk_test_params_t _test_parameters;
- std::string _puppet_master_name;
- public:
- volk_func_desc_t desc() {return _desc;};
- void (*kernel_ptr()) () {return _kernel_ptr;};
- std::string name() {return _name;};
- std::string puppet_master_name() {return _puppet_master_name;};
- volk_test_params_t test_parameters() {return _test_parameters;};
- // normal ctor
- volk_test_case_t(volk_func_desc_t desc, void(*kernel_ptr)(), std::string name,
- volk_test_params_t test_parameters) :
- _desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters),
- _puppet_master_name("NULL")
- {};
- // ctor for puppets
- volk_test_case_t(volk_func_desc_t desc, void(*kernel_ptr)(), std::string name,
- std::string puppet_master_name, volk_test_params_t test_parameters) :
- _desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters),
- _puppet_master_name(puppet_master_name)
- {};
+class volk_test_case_t
+{
+private:
+ volk_func_desc_t _desc;
+ void (*_kernel_ptr)();
+ std::string _name;
+ volk_test_params_t _test_parameters;
+ std::string _puppet_master_name;
+
+public:
+ volk_func_desc_t desc() { return _desc; };
+ void (*kernel_ptr())() { return _kernel_ptr; };
+ std::string name() { return _name; };
+ std::string puppet_master_name() { return _puppet_master_name; };
+ volk_test_params_t test_parameters() { return _test_parameters; };
+ // normal ctor
+ volk_test_case_t(volk_func_desc_t desc,
+ void (*kernel_ptr)(),
+ std::string name,
+ volk_test_params_t test_parameters)
+ : _desc(desc),
+ _kernel_ptr(kernel_ptr),
+ _name(name),
+ _test_parameters(test_parameters),
+ _puppet_master_name("NULL"){};
+ // ctor for puppets
+ volk_test_case_t(volk_func_desc_t desc,
+ void (*kernel_ptr)(),
+ std::string name,
+ std::string puppet_master_name,
+ volk_test_params_t test_parameters)
+ : _desc(desc),
+ _kernel_ptr(kernel_ptr),
+ _name(name),
+ _test_parameters(test_parameters),
+ _puppet_master_name(puppet_master_name){};
};
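+// Construction sketch (editorial illustration; kernel names are hypothetical):
+// the four-argument ctor registers an ordinary kernel, the five-argument ctor a
+// puppet kernel whose results are reported under its master's name, e.g.
+//   volk_test_case_t plain(desc, fn, "volk_foo", params);
+//   volk_test_case_t puppet(desc, fn, "volk_foo_puppet", "volk_foo", params);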
/************************************************
volk_type_t volk_type_from_string(std::string);
float uniform(void);
-void random_floats(float *buf, unsigned n);
+void random_floats(float* buf, unsigned n);
-bool run_volk_tests(
- volk_func_desc_t,
- void(*)(),
- std::string,
- volk_test_params_t,
- std::vector<volk_test_results_t> *results = NULL,
- std::string puppet_master_name = "NULL"
- );
+bool run_volk_tests(volk_func_desc_t,
+ void (*)(),
+ std::string,
+ volk_test_params_t,
+ std::vector<volk_test_results_t>* results = NULL,
+ std::string puppet_master_name = "NULL");
-bool run_volk_tests(
- volk_func_desc_t,
- void(*)(),
- std::string,
- float,
- lv_32fc_t,
- unsigned int,
- unsigned int,
- std::vector<volk_test_results_t> *results = NULL,
- std::string puppet_master_name = "NULL",
- bool absolute_mode = false,
- bool benchmark_mode = false
-);
+bool run_volk_tests(volk_func_desc_t,
+ void (*)(),
+ std::string,
+ float,
+ lv_32fc_t,
+ unsigned int,
+ unsigned int,
+ std::vector<volk_test_results_t>* results = NULL,
+ std::string puppet_master_name = "NULL",
+ bool absolute_mode = false,
+ bool benchmark_mode = false);
-#define VOLK_PROFILE(func, test_params, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), test_params, results, "NULL")
-#define VOLK_PUPPET_PROFILE(func, puppet_master_func, test_params, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), test_params, results, std::string(#puppet_master_func))
-typedef void (*volk_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place
-typedef void (*volk_fn_2arg)(void *, void *, unsigned int, const char*);
-typedef void (*volk_fn_3arg)(void *, void *, void *, unsigned int, const char*);
-typedef void (*volk_fn_4arg)(void *, void *, void *, void *, unsigned int, const char*);
-typedef void (*volk_fn_1arg_s32f)(void *, float, unsigned int, const char*); //one input vector, one scalar float input
-typedef void (*volk_fn_2arg_s32f)(void *, void *, float, unsigned int, const char*);
-typedef void (*volk_fn_3arg_s32f)(void *, void *, void *, float, unsigned int, const char*);
-typedef void (*volk_fn_1arg_s32fc)(void *, lv_32fc_t, unsigned int, const char*); //one input vector, one scalar float input
-typedef void (*volk_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char*);
-typedef void (*volk_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char*);
+#define VOLK_PROFILE(func, test_params, results) \
+ run_volk_tests(func##_get_func_desc(), \
+ (void (*)())func##_manual, \
+ std::string(#func), \
+ test_params, \
+ results, \
+ "NULL")
+#define VOLK_PUPPET_PROFILE(func, puppet_master_func, test_params, results) \
+ run_volk_tests(func##_get_func_desc(), \
+ (void (*)())func##_manual, \
+ std::string(#func), \
+ test_params, \
+ results, \
+ std::string(#puppet_master_func))
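+// Expansion sketch (editorial illustration): for a kernel such as
+// volk_32f_x2_add_32f, VOLK_PROFILE(volk_32f_x2_add_32f, test_params, &results)
+// expands to
+//   run_volk_tests(volk_32f_x2_add_32f_get_func_desc(),
+//                  (void (*)())volk_32f_x2_add_32f_manual,
+//                  std::string("volk_32f_x2_add_32f"),
+//                  test_params, &results, "NULL")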
+typedef void (*volk_fn_1arg)(void*,
+ unsigned int,
+ const char*); // one input, operate in place
+typedef void (*volk_fn_2arg)(void*, void*, unsigned int, const char*);
+typedef void (*volk_fn_3arg)(void*, void*, void*, unsigned int, const char*);
+typedef void (*volk_fn_4arg)(void*, void*, void*, void*, unsigned int, const char*);
+typedef void (*volk_fn_1arg_s32f)(
+ void*, float, unsigned int, const char*); // one input vector, one scalar float input
+typedef void (*volk_fn_2arg_s32f)(void*, void*, float, unsigned int, const char*);
+typedef void (*volk_fn_3arg_s32f)(void*, void*, void*, float, unsigned int, const char*);
+typedef void (*volk_fn_1arg_s32fc)(
+ void*,
+ lv_32fc_t,
+ unsigned int,
+    const char*); // one input vector, one complex scalar (lv_32fc_t) input
+typedef void (*volk_fn_2arg_s32fc)(void*, void*, lv_32fc_t, unsigned int, const char*);
+typedef void (*volk_fn_3arg_s32fc)(
+ void*, void*, void*, lv_32fc_t, unsigned int, const char*);
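+// Dispatch note (editorial): run_volk_tests() picks one of these signatures from
+// the total number of non-scalar arguments plus the scalar type, e.g. a kernel
+// taking one output and two input vectors (three buffers, no scalar) is called
+// through volk_fn_3arg, and the same shape with a float scalar through
+// volk_fn_3arg_s32f.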
-#endif //VOLK_QA_UTILS_H
+#endif // VOLK_QA_UTILS_H
* Boston, MA 02110-1301, USA.
*/
-#include <stdbool.h> // for bool, false, true
-#include <iostream> // for operator<<, basic_ostream, endl, char...
-#include <fstream> // IWYU pragma: keep
-#include <map> // for map, map<>::iterator, _Rb_tree_iterator
-#include <string> // for string, operator<<
-#include <utility> // for pair
-#include <vector> // for vector
-
+#include <stdbool.h> // for bool, false, true
+#include <fstream> // IWYU pragma: keep
+#include <iostream> // for operator<<, basic_ostream, endl, char...
+#include <map> // for map, map<>::iterator, _Rb_tree_iterator
+#include <string> // for string, operator<<
+#include <utility> // for pair
+#include <vector> // for vector
+
+#include "kernel_tests.h" // for init_test_list
+#include "qa_utils.h" // for volk_test_case_t, volk_test_results_t
+#include "volk/volk_complex.h" // for lv_32fc_t
#include <volk/volk.h>
-#include "kernel_tests.h" // for init_test_list
-#include "qa_utils.h" // for volk_test_case_t, volk_test_results_t
-#include "volk/volk_complex.h" // for lv_32fc_t
void print_qa_xml(std::vector<volk_test_results_t> results, unsigned int nfails);
bool def_benchmark_mode = true;
std::string def_kernel_regex = "";
- volk_test_params_t test_params(def_tol, def_scalar, def_vlen, def_iter,
- def_benchmark_mode, def_kernel_regex);
+ volk_test_params_t test_params(
+ def_tol, def_scalar, def_vlen, def_iter, def_benchmark_mode, def_kernel_regex);
std::vector<volk_test_case_t> test_cases = init_test_list(test_params);
std::vector<volk_test_results_t> results;
- if (argc > 1){
- for(unsigned int ii = 0; ii < test_cases.size(); ++ii){
- if (std::string(argv[1]) == test_cases[ii].name()){
+ if (argc > 1) {
+ for (unsigned int ii = 0; ii < test_cases.size(); ++ii) {
+ if (std::string(argv[1]) == test_cases[ii].name()) {
volk_test_case_t test_case = test_cases[ii];
- if (run_volk_tests(test_case.desc(), test_case.kernel_ptr(),
+ if (run_volk_tests(test_case.desc(),
+ test_case.kernel_ptr(),
test_case.name(),
- test_case.test_parameters(), &results,
+ test_case.test_parameters(),
+ &results,
test_case.puppet_master_name())) {
- return 1;
+ return 1;
} else {
- return 0;
+ return 0;
}
}
}
- std::cerr << "Did not run a test for kernel: " << std::string(argv[1]) << " !" << std::endl;
+ std::cerr << "Did not run a test for kernel: " << std::string(argv[1]) << " !"
+ << std::endl;
return 0;
- }else{
+ } else {
std::vector<std::string> qa_failures;
// Test every kernel reporting failures when they occur
- for(unsigned int ii = 0; ii < test_cases.size(); ++ii) {
+ for (unsigned int ii = 0; ii < test_cases.size(); ++ii) {
bool qa_result = false;
volk_test_case_t test_case = test_cases[ii];
try {
- qa_result = run_volk_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(),
- test_case.test_parameters(), &results, test_case.puppet_master_name());
- }
- catch(...) {
+ qa_result = run_volk_tests(test_case.desc(),
+ test_case.kernel_ptr(),
+ test_case.name(),
+ test_case.test_parameters(),
+ &results,
+ test_case.puppet_master_name());
+ } catch (...) {
// TODO: what exceptions might we need to catch and how do we handle them?
- std::cerr << "Exception found on kernel: " << test_case.name() << std::endl;
+ std::cerr << "Exception found on kernel: " << test_case.name()
+ << std::endl;
qa_result = false;
}
- if(qa_result) {
+ if (qa_result) {
std::cerr << "Failure on " << test_case.name() << std::endl;
qa_failures.push_back(test_case.name());
}
// Summarize QA results
std::cerr << "Kernel QA finished: " << qa_failures.size() << " failures out of "
<< test_cases.size() << " tests." << std::endl;
- if(qa_failures.size() > 0) {
+ if (qa_failures.size() > 0) {
std::cerr << "The following kernels failed QA:" << std::endl;
- for(unsigned int ii = 0; ii < qa_failures.size(); ++ii) {
+ for (unsigned int ii = 0; ii < qa_failures.size(); ++ii) {
std::cerr << " " << qa_failures[ii] << std::endl;
}
qa_ret_val = 1;
qa_file.open(".unittest/kernels.xml");
qa_file << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" << std::endl;
- qa_file << "<testsuites name=\"kernels\" " <<
- "tests=\"" << results.size() << "\" " <<
- "failures=\"" << nfails << "\" id=\"1\">" << std::endl;
+ qa_file << "<testsuites name=\"kernels\" "
+ << "tests=\"" << results.size() << "\" "
+ << "failures=\"" << nfails << "\" id=\"1\">" << std::endl;
// Results are in a vector by kernel. Each element has a result
// map containing time and arch name with test result
- for(unsigned int ii=0; ii < results.size(); ++ii) {
+ for (unsigned int ii = 0; ii < results.size(); ++ii) {
volk_test_results_t result = results[ii];
qa_file << " <testsuite name=\"" << result.name << "\">" << std::endl;
std::map<std::string, volk_test_time_t>::iterator kernel_time_pair;
- for(kernel_time_pair = result.results.begin(); kernel_time_pair != result.results.end(); ++kernel_time_pair) {
+ for (kernel_time_pair = result.results.begin();
+ kernel_time_pair != result.results.end();
+ ++kernel_time_pair) {
volk_test_time_t test_time = kernel_time_pair->second;
- qa_file << " <testcase name=\"" << test_time.name << "\" " <<
- "classname=\"" << result.name << "\" " <<
- "time=\"" << test_time.time << "\">" << std::endl;
- if(!test_time.pass)
- qa_file << " <failure " <<
- "message=\"fail on arch " << test_time.name << "\">" <<
- "</failure>" << std::endl;
+ qa_file << " <testcase name=\"" << test_time.name << "\" "
+ << "classname=\"" << result.name << "\" "
+ << "time=\"" << test_time.time << "\">" << std::endl;
+ if (!test_time.pass)
+ qa_file << " <failure "
+ << "message=\"fail on arch " << test_time.name << "\">"
+ << "</failure>" << std::endl;
qa_file << " </testcase>" << std::endl;
}
qa_file << " </testsuite>" << std::endl;
qa_file << "</testsuites>" << std::endl;
qa_file.close();
-
}
* see: https://en.cppreference.com/w/c/memory/aligned_alloc
*
* MSVC is broken
- * see: https://docs.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=vs-2019
+ * see:
+ * https://docs.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=vs-2019
* This section:
* C11 The Universal CRT implemented the parts of the
* C11 Standard Library that are required by C++17,
* We must work around this problem because MSVC is non-compliant!
*/
-void *volk_malloc(size_t size, size_t alignment)
+
+void* volk_malloc(size_t size, size_t alignment)
{
#if HAVE_POSIX_MEMALIGN
- // quoting posix_memalign() man page:
- // "alignment must be a power of two and a multiple of sizeof(void *)"
- // volk_get_alignment() could return 1 for some machines (e.g. generic_orc)
- if (alignment == 1){
- return malloc(size);
- }
- void *ptr;
- int err = posix_memalign(&ptr, alignment, size);
- if(err != 0) {
- ptr = NULL;
- fprintf(stderr,
- "VOLK: Error allocating memory "
- "(posix_memalign: error %d: %s)\n", err, strerror(err));
- }
+ // quoting posix_memalign() man page:
+ // "alignment must be a power of two and a multiple of sizeof(void *)"
+ // volk_get_alignment() could return 1 for some machines (e.g. generic_orc)
+ if (alignment == 1) {
+ return malloc(size);
+ }
+ void* ptr;
+ int err = posix_memalign(&ptr, alignment, size);
+ if (err != 0) {
+ ptr = NULL;
+ fprintf(stderr,
+ "VOLK: Error allocating memory "
+ "(posix_memalign: error %d: %s)\n",
+ err,
+ strerror(err));
+ }
#elif defined(_MSC_VER)
- void *ptr = _aligned_malloc(size, alignment);
+ void* ptr = _aligned_malloc(size, alignment);
#else
- void *ptr = aligned_alloc(alignment, size);
+ void* ptr = aligned_alloc(alignment, size);
#endif
- if(ptr == NULL) {
- fprintf(stderr, "VOLK: Error allocating memory (aligned_alloc/_aligned_malloc)\n");
- }
- return ptr;
+ if (ptr == NULL) {
+ fprintf(stderr,
+ "VOLK: Error allocating memory (aligned_alloc/_aligned_malloc)\n");
+ }
+ return ptr;
}
-void volk_free(void *ptr)
+void volk_free(void* ptr)
{
#if defined(_MSC_VER)
- _aligned_free(ptr);
+ _aligned_free(ptr);
#else
- free(ptr);
+ free(ptr);
#endif
}
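+/* Usage sketch (editorial illustration): allocate a buffer aligned for the
+ * current machine and release it again with volk_free().
+ *
+ *   float* buf = (float*)volk_malloc(1024 * sizeof(float), volk_get_alignment());
+ *   if (buf != NULL) {
+ *       // ... fill and process buf ...
+ *       volk_free(buf);
+ *   }
+ */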
+#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
-#include <stdbool.h>
#include <string.h>
#if defined(_MSC_VER)
#include <io.h>
#endif
#include <volk/volk_prefs.h>
-void volk_get_config_path(char *path, bool read)
+void volk_get_config_path(char* path, bool read)
{
- if (!path) return;
- const char *suffix = "/.volk/volk_config";
- const char *suffix2 = "/volk/volk_config"; //non-hidden
- char *home = NULL;
+ if (!path)
+ return;
+ const char* suffix = "/.volk/volk_config";
+ const char* suffix2 = "/volk/volk_config"; // non-hidden
+ char* home = NULL;
- //allows config redirection via env variable
+ // allows config redirection via env variable
home = getenv("VOLK_CONFIGPATH");
- if(home!=NULL){
- strncpy(path,home,512);
- strcat(path,suffix2);
- if (!read || access(path, F_OK) != -1){
+ if (home != NULL) {
+ strncpy(path, home, 512);
+ strcat(path, suffix2);
+ if (!read || access(path, F_OK) != -1) {
return;
}
}
- //check for user-local config file
+ // check for user-local config file
home = getenv("HOME");
- if (home != NULL){
+ if (home != NULL) {
strncpy(path, home, 512);
strcat(path, suffix);
- if (!read || (access(path, F_OK) != -1)){
+ if (!read || (access(path, F_OK) != -1)) {
return;
}
}
- //check for config file in APPDATA (Windows)
+ // check for config file in APPDATA (Windows)
home = getenv("APPDATA");
- if (home != NULL){
+ if (home != NULL) {
strncpy(path, home, 512);
strcat(path, suffix);
- if (!read || (access(path, F_OK) != -1)){
+ if (!read || (access(path, F_OK) != -1)) {
return;
}
}
- //check for system-wide config file
- if (access("/etc/volk/volk_config", F_OK) != -1){
+ // check for system-wide config file
+ if (access("/etc/volk/volk_config", F_OK) != -1) {
strncpy(path, "/etc", 512);
strcat(path, suffix2);
- if (!read || (access(path, F_OK) != -1)){
+ if (!read || (access(path, F_OK) != -1)) {
return;
}
}
- //If still no path was found set path[0] to '0' and fall through
+    // If still no path was found, set path[0] to '\0' and fall through
path[0] = 0;
return;
}
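+/* Usage sketch (editorial illustration): 'path' must point to a writable buffer
+ * large enough for the resulting path (the code above copies up to 512 bytes of
+ * the base directory before appending a suffix); path[0] == 0 after the call
+ * means no config file was found.
+ *
+ *   char path[512];
+ *   volk_get_config_path(path, true);
+ *   if (path[0] != 0) {
+ *       // a readable volk_config file exists at 'path'
+ *   }
+ */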
-size_t volk_load_preferences(volk_arch_pref_t **prefs_res)
+size_t volk_load_preferences(volk_arch_pref_t** prefs_res)
{
- FILE *config_file;
+ FILE* config_file;
char path[512], line[512];
size_t n_arch_prefs = 0;
- volk_arch_pref_t *prefs = NULL;
+ volk_arch_pref_t* prefs = NULL;
- //get the config path
+ // get the config path
volk_get_config_path(path, true);
- if (!path[0]) return n_arch_prefs; //no prefs found
+ if (!path[0])
+ return n_arch_prefs; // no prefs found
config_file = fopen(path, "r");
- if(!config_file) return n_arch_prefs; //no prefs found
+ if (!config_file)
+ return n_arch_prefs; // no prefs found
- //reset the file pointer and write the prefs into volk_arch_prefs
- while(fgets(line, sizeof(line), config_file) != NULL)
- {
- void *new_prefs = realloc(prefs, (n_arch_prefs + 1) * sizeof(*prefs));
+ // reset the file pointer and write the prefs into volk_arch_prefs
+ while (fgets(line, sizeof(line), config_file) != NULL) {
+ void* new_prefs = realloc(prefs, (n_arch_prefs + 1) * sizeof(*prefs));
if (!new_prefs) {
- printf ("volk_load_preferences: bad malloc\n");
+            printf("volk_load_preferences: bad realloc\n");
break;
}
- prefs = (volk_arch_pref_t *) new_prefs;
- volk_arch_pref_t *p = prefs + n_arch_prefs;
- if(sscanf(line, "%s %s %s", p->name, p->impl_a, p->impl_u) == 3 && !strncmp(p->name, "volk_", 5))
- {
+ prefs = (volk_arch_pref_t*)new_prefs;
+ volk_arch_pref_t* p = prefs + n_arch_prefs;
+ if (sscanf(line, "%s %s %s", p->name, p->impl_a, p->impl_u) == 3 &&
+ !strncmp(p->name, "volk_", 5)) {
n_arch_prefs++;
}
}
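+    // Format note (editorial): each accepted config line has the form
+    // "volk_<kernel_name> <aligned_impl> <unaligned_impl>"; lines that do not
+    // contain three whitespace-separated tokens with a name starting in
+    // "volk_" are skipped.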
#include <stdlib.h>
#include <string.h>
-#include <volk_rank_archs.h>
#include <volk/volk_prefs.h>
+#include <volk_rank_archs.h>
-int volk_get_index(
- const char *impl_names[], //list of implementations by name
- const size_t n_impls, //number of implementations available
- const char *impl_name //the implementation name to find
-){
+int volk_get_index(const char* impl_names[], // list of implementations by name
+ const size_t n_impls, // number of implementations available
+ const char* impl_name // the implementation name to find
+)
+{
unsigned int i;
for (i = 0; i < n_impls; i++) {
- if(!strncmp(impl_names[i], impl_name, 20)) {
+ if (!strncmp(impl_names[i], impl_name, 20)) {
return i;
}
}
- //TODO return -1;
- //something terrible should happen here
+ // TODO return -1;
+ // something terrible should happen here
fprintf(stderr, "Volk warning: no arch found, returning generic impl\n");
- return volk_get_index(impl_names, n_impls, "generic"); //but we'll fake it for now
+ return volk_get_index(impl_names, n_impls, "generic"); // but we'll fake it for now
}
-int volk_rank_archs(
- const char *kern_name, //name of the kernel to rank
- const char *impl_names[], //list of implementations by name
- const int* impl_deps, //requirement mask per implementation
- const bool* alignment, //alignment status of each implementation
- size_t n_impls, //number of implementations available
- const bool align //if false, filter aligned implementations
+int volk_rank_archs(const char* kern_name, // name of the kernel to rank
+ const char* impl_names[], // list of implementations by name
+ const int* impl_deps, // requirement mask per implementation
+ const bool* alignment, // alignment status of each implementation
+ size_t n_impls, // number of implementations available
+ const bool align // if false, filter aligned implementations
)
{
size_t i;
- static volk_arch_pref_t *volk_arch_prefs;
+ static volk_arch_pref_t* volk_arch_prefs;
static size_t n_arch_prefs = 0;
static int prefs_loaded = 0;
- if(!prefs_loaded) {
+ if (!prefs_loaded) {
n_arch_prefs = volk_load_preferences(&volk_arch_prefs);
prefs_loaded = 1;
}
// If we've defined VOLK_GENERIC to be anything, always return the
// 'generic' kernel. Used in GR's QA code.
- char *gen_env = getenv("VOLK_GENERIC");
- if(gen_env) {
- return volk_get_index(impl_names, n_impls, "generic");
+ char* gen_env = getenv("VOLK_GENERIC");
+ if (gen_env) {
+ return volk_get_index(impl_names, n_impls, "generic");
}
- //now look for the function name in the prefs list
- for(i = 0; i < n_arch_prefs; i++)
- {
- if(!strncmp(kern_name, volk_arch_prefs[i].name, sizeof(volk_arch_prefs[i].name))) //found it
+ // now look for the function name in the prefs list
+ for (i = 0; i < n_arch_prefs; i++) {
+ if (!strncmp(kern_name,
+ volk_arch_prefs[i].name,
+ sizeof(volk_arch_prefs[i].name))) // found it
{
- const char *impl_name = align? volk_arch_prefs[i].impl_a : volk_arch_prefs[i].impl_u;
+ const char* impl_name =
+ align ? volk_arch_prefs[i].impl_a : volk_arch_prefs[i].impl_u;
return volk_get_index(impl_names, n_impls, impl_name);
}
}
- //return the best index with the largest deps
+ // return the best index with the largest deps
size_t best_index_a = 0;
size_t best_index_u = 0;
int best_value_a = -1;
int best_value_u = -1;
- for(i = 0; i < n_impls; i++)
- {
+ for (i = 0; i < n_impls; i++) {
const signed val = impl_deps[i];
- if (alignment[i] && val > best_value_a)
- {
+ if (alignment[i] && val > best_value_a) {
best_index_a = i;
best_value_a = val;
}
- if (!alignment[i] && val > best_value_u)
- {
+ if (!alignment[i] && val > best_value_u) {
best_index_u = i;
best_value_u = val;
}
}
- //when align and we found a best aligned, use it
- if (align && best_value_a != -1) return best_index_a;
+ // when align and we found a best aligned, use it
+ if (align && best_value_a != -1)
+ return best_index_a;
- //otherwise return the best unaligned
+ // otherwise return the best unaligned
return best_index_u;
}
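+/* Selection summary (editorial note): the ranking above resolves in this order:
+ * the VOLK_GENERIC environment variable forces the generic implementation;
+ * otherwise a matching entry in the loaded preferences wins (aligned or
+ * unaligned variant depending on 'align'); otherwise the implementation with
+ * the largest impl_deps value is chosen, preferring an aligned one when 'align'
+ * is set.
+ */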
#ifndef INCLUDED_VOLK_RANK_ARCHS_H
#define INCLUDED_VOLK_RANK_ARCHS_H
-#include <stdlib.h>
#include <stdbool.h>
+#include <stdlib.h>
#ifdef __cplusplus
extern "C" {
#endif
-int volk_get_index(
- const char *impl_names[], //list of implementations by name
- const size_t n_impls, //number of implementations available
- const char *impl_name //the implementation name to find
+int volk_get_index(const char* impl_names[], // list of implementations by name
+ const size_t n_impls, // number of implementations available
+ const char* impl_name // the implementation name to find
);
-int volk_rank_archs(
- const char *kern_name, //name of the kernel to rank
- const char *impl_names[], //list of implementations by name
- const int* impl_deps, //requirement mask per implementation
- const bool* alignment, //alignment status of each implementation
- size_t n_impls, //number of implementations available
- const bool align //if false, filter aligned implementations
+int volk_rank_archs(const char* kern_name, // name of the kernel to rank
+ const char* impl_names[], // list of implementations by name
+ const int* impl_deps, // requirement mask per implementation
+ const bool* alignment, // alignment status of each implementation
+ size_t n_impls, // number of implementations available
+ const bool align // if false, filter aligned implementations
);
#ifdef __cplusplus