--- /dev/null
+*~
+*.pyc
+*.pyo
+build/
--- /dev/null
+#
+# Copyright 2011 Free Software Foundation, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+########################################################################
+# Project setup
+########################################################################
+cmake_minimum_required(VERSION 2.6)
+set(CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE STRING "Choose build type: None Debug Release RelWithDebInfo MinSizeRel")
+project(volk)
+enable_language(CXX)
+enable_language(C)
+enable_testing()
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall")
+add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1)
+
+set(CMAKE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) #allows this to be a sub-project
+set(CMAKE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) #allows this to be a sub-project
+list(INSERT CMAKE_MODULE_PATH 0 ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules) #location for custom "Modules"
+
+include(VolkBuildTypes)
+#select the release build type by default to get optimization flags
+if(NOT CMAKE_BUILD_TYPE)
+ set(CMAKE_BUILD_TYPE "Release")
+ message(STATUS "Build type not specified: defaulting to release.")
+endif()
+VOLK_CHECK_BUILD_TYPE(${CMAKE_BUILD_TYPE})
+set(CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE STRING "")
+message(STATUS "Build type set to ${CMAKE_BUILD_TYPE}.")
+
+set(VERSION_INFO_MAJOR_VERSION 1)
+set(VERSION_INFO_MINOR_VERSION 3)
+set(VERSION_INFO_MAINT_VERSION 1)
+include(VolkVersion) #setup version info
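+
+# (The three components above are presumably combined by VolkVersion into a
+# dotted version string, e.g. "1.3.1", which is what @VERSION@ expands to in
+# the template files configured below.)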
+
+
+########################################################################
+# Environment setup
+########################################################################
+IF(NOT DEFINED BOOST_ROOT AND NOT DEFINED ENV{BOOST_ROOT})
+ SET(BOOST_ROOT ${CMAKE_INSTALL_PREFIX})
+ENDIF()
+
+IF(NOT DEFINED CROSSCOMPILE_MULTILIB)
+ SET(CROSSCOMPILE_MULTILIB "")
+ENDIF()
+SET(CROSSCOMPILE_MULTILIB ${CROSSCOMPILE_MULTILIB} CACHE STRING "Define \"true\" if you have multiple C development libraries installed and want to use them for cross compiling")
+
+
+########################################################################
+# Dependencies setup
+########################################################################
+
+# Python
+include(VolkPython) #sets PYTHON_EXECUTABLE and PYTHON_DASH_B
+VOLK_PYTHON_CHECK_MODULE("python >= 2.5" sys "sys.version.split()[0] >= '2.5'" PYTHON_MIN_VER_FOUND)
+VOLK_PYTHON_CHECK_MODULE("Cheetah >= 2.0.0" Cheetah "Cheetah.Version >= '2.0.0'" CHEETAH_FOUND)
+
+if(NOT PYTHON_MIN_VER_FOUND)
+ message(FATAL_ERROR "Python 2.5 or greater required to build VOLK")
+endif()
+
+# Cheetah
+if(NOT CHEETAH_FOUND)
+ message(FATAL_ERROR "Cheetah templates required to build VOLK")
+endif()
+
+# Boost
+if(MSVC)
+ if (NOT DEFINED BOOST_ALL_DYN_LINK)
+ set(BOOST_ALL_DYN_LINK TRUE)
+ endif()
+ set(BOOST_ALL_DYN_LINK "${BOOST_ALL_DYN_LINK}" CACHE BOOL "boost enable dynamic linking")
+ if(BOOST_ALL_DYN_LINK)
+ add_definitions(-DBOOST_ALL_DYN_LINK) #setup boost auto-linking in msvc
+ else(BOOST_ALL_DYN_LINK)
+ unset(BOOST_REQUIRED_COMPONENTS) #empty components list for static link
+ endif(BOOST_ALL_DYN_LINK)
+endif(MSVC)
+
+include(VolkBoost)
+
+if(NOT Boost_FOUND)
+    message(FATAL_ERROR "VOLK requires Boost to build")
+endif()
+
+# Orc
+option(ENABLE_ORC "Enable Orc" True)
+if(ENABLE_ORC)
+ find_package(ORC)
+else(ENABLE_ORC)
+ message(STATUS "Disabling use of ORC")
+endif(ENABLE_ORC)
+
+########################################################################
+# Setup doxygen
+########################################################################
+find_package(Doxygen)
+if(DOXYGEN_FOUND)
+ configure_file(
+ ${CMAKE_SOURCE_DIR}/Doxyfile.in
+ ${CMAKE_BINARY_DIR}/Doxyfile
+ @ONLY)
+
+ add_custom_target(doc
+ ${DOXYGEN_EXECUTABLE} ${CMAKE_BINARY_DIR}/Doxyfile
+ WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+ COMMENT "Generating documentation with Doxygen" VERBATIM
+ )
+endif(DOXYGEN_FOUND)
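+
+# When Doxygen is found, the "doc" target above can be built from the build
+# tree, e.g. with a Makefile generator:
+#   make doc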
+
+########################################################################
+# Setup the package config file
+########################################################################
+#set variables found in the pc.in file
+set(prefix ${CMAKE_INSTALL_PREFIX})
+set(exec_prefix "\${prefix}")
+set(libdir "\${exec_prefix}/lib${LIB_SUFFIX}")
+set(includedir "\${prefix}/include")
+
+configure_file(
+ ${CMAKE_CURRENT_SOURCE_DIR}/volk.pc.in
+ ${CMAKE_CURRENT_BINARY_DIR}/volk.pc
+@ONLY)
+
+install(
+ FILES ${CMAKE_CURRENT_BINARY_DIR}/volk.pc
+ DESTINATION lib${LIB_SUFFIX}/pkgconfig
+ COMPONENT "volk_devel"
+)
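+
+# Once installed, downstream builds can locate VOLK through pkg-config
+# (assuming the install prefix is on PKG_CONFIG_PATH), e.g.:
+#   cc example.c $(pkg-config --cflags --libs volk)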
+
+########################################################################
+# Install all headers in the include directories
+########################################################################
+set(VOLK_RUNTIME_DIR bin)
+set(VOLK_LIBRARY_DIR lib${LIB_SUFFIX})
+set(VOLK_INCLUDE_DIR include)
+
+install(
+ DIRECTORY ${CMAKE_SOURCE_DIR}/kernels/volk
+ DESTINATION include COMPONENT "volk_devel"
+ FILES_MATCHING PATTERN "*.h"
+)
+
+install(FILES
+ ${CMAKE_SOURCE_DIR}/include/volk/volk_prefs.h
+ ${CMAKE_SOURCE_DIR}/include/volk/volk_complex.h
+ ${CMAKE_SOURCE_DIR}/include/volk/volk_common.h
+ ${CMAKE_SOURCE_DIR}/include/volk/volk_avx_intrinsics.h
+ ${CMAKE_SOURCE_DIR}/include/volk/volk_sse3_intrinsics.h
+ ${CMAKE_SOURCE_DIR}/include/volk/volk_neon_intrinsics.h
+ ${CMAKE_BINARY_DIR}/include/volk/volk.h
+ ${CMAKE_BINARY_DIR}/include/volk/volk_cpu.h
+ ${CMAKE_BINARY_DIR}/include/volk/volk_config_fixed.h
+ ${CMAKE_BINARY_DIR}/include/volk/volk_typedefs.h
+ ${CMAKE_SOURCE_DIR}/include/volk/volk_malloc.h
+ DESTINATION include/volk
+ COMPONENT "volk_devel"
+)
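+
+# Illustrative use of the installed headers from C (a sketch only, relying on
+# the aligned-allocation helpers declared in volk_malloc.h and volk.h):
+#   #include <volk/volk.h>
+#   float *buf = (float*)volk_malloc(n * sizeof(float), volk_get_alignment());
+#   /* ... call a volk_* kernel on buf ... */
+#   volk_free(buf);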
+
+########################################################################
+# On Apple only, set install name and use rpath correctly, if not already set
+########################################################################
+if(APPLE)
+ if(NOT CMAKE_INSTALL_NAME_DIR)
+ set(CMAKE_INSTALL_NAME_DIR
+ ${CMAKE_INSTALL_PREFIX}/${VOLK_LIBRARY_DIR} CACHE
+ PATH "Library Install Name Destination Directory" FORCE)
+ endif(NOT CMAKE_INSTALL_NAME_DIR)
+ if(NOT CMAKE_INSTALL_RPATH)
+ set(CMAKE_INSTALL_RPATH
+ ${CMAKE_INSTALL_PREFIX}/${VOLK_LIBRARY_DIR} CACHE
+ PATH "Library Install RPath" FORCE)
+ endif(NOT CMAKE_INSTALL_RPATH)
+ if(NOT CMAKE_BUILD_WITH_INSTALL_RPATH)
+ set(CMAKE_BUILD_WITH_INSTALL_RPATH ON CACHE
+ BOOL "Do Build Using Library Install RPath" FORCE)
+ endif(NOT CMAKE_BUILD_WITH_INSTALL_RPATH)
+endif(APPLE)
+
+########################################################################
+# Create uninstall target
+########################################################################
+configure_file(
+ ${CMAKE_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in
+ ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake
+@ONLY)
+
+# Only add the target if there isn't one defined already
+if(NOT TARGET uninstall)
+ add_custom_target(uninstall
+ ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake
+ )
+endif()
+
+
+########################################################################
+# Install our CMake modules into $prefix/lib/cmake/volk
+# See "Package Configuration Files" on page:
+# http://www.cmake.org/Wiki/CMake/Tutorials/Packaging
+########################################################################
+
+configure_file(
+ ${CMAKE_SOURCE_DIR}/cmake/Modules/VolkConfig.cmake.in
+ ${CMAKE_BINARY_DIR}/cmake/Modules/VolkConfig.cmake
+@ONLY)
+
+configure_file(
+ ${CMAKE_SOURCE_DIR}/cmake/Modules/VolkConfigVersion.cmake.in
+ ${CMAKE_BINARY_DIR}/cmake/Modules/VolkConfigVersion.cmake
+@ONLY)
+
+
+########################################################################
+# Install cmake search routine for external use
+########################################################################
+
+if(NOT CMAKE_MODULES_DIR)
+ set(CMAKE_MODULES_DIR lib${LIB_SUFFIX}/cmake)
+endif(NOT CMAKE_MODULES_DIR)
+
+install(
+ FILES
+ ${CMAKE_CURRENT_BINARY_DIR}/cmake/Modules/VolkConfig.cmake
+ ${CMAKE_CURRENT_BINARY_DIR}/cmake/Modules/VolkConfigVersion.cmake
+ DESTINATION ${CMAKE_MODULES_DIR}/volk
+ COMPONENT "volk_devel"
+)
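+
+# A hypothetical external CMake project could then pick VOLK up with, e.g.:
+#   find_package(Volk REQUIRED)
+#   include_directories(${VOLK_INCLUDE_DIRS})
+#   target_link_libraries(myapp ${VOLK_LIBRARIES})
+# (the exact variable names depend on what VolkConfig.cmake.in defines)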
+
+########################################################################
+# Option to enable QA testing, on by default
+########################################################################
+OPTION(ENABLE_TESTING "Enable QA testing" ON)
+if(ENABLE_TESTING)
+ message(STATUS "QA Testing is enabled.")
+else()
+ message(STATUS "QA Testing is disabled.")
+endif()
+message(STATUS " Modify using: -DENABLE_TESTING=ON/OFF")
+
+########################################################################
+# Option to enable post-build profiling using volk_profile, off by default
+########################################################################
+OPTION(ENABLE_PROFILING "Launch system profiler after build" OFF)
+if(ENABLE_PROFILING)
+ if(DEFINED VOLK_CONFIGPATH)
+ get_filename_component(VOLK_CONFIGPATH ${VOLK_CONFIGPATH} ABSOLUTE)
+ set(VOLK_CONFIGPATH "${VOLK_CONFIGPATH}/volk")
+ message(STATUS "System profiling is enabled, using path: ${VOLK_CONFIGPATH}")
+ elseif(DEFINED ENV{VOLK_CONFIGPATH})
+ set(VOLK_CONFIGPATH "$ENV{VOLK_CONFIGPATH}/volk")
+ message(STATUS "System profiling is enabled, using env path: $ENV{VOLK_CONFIGPATH}")
+ else()
+ message(STATUS "System profiling is enabled with default paths.")
+ if(DEFINED ENV{HOME})
+ set(VOLK_CONFIGPATH "$ENV{HOME}/.volk")
+ elseif(DEFINED ENV{APPDATA})
+ set(VOLK_CONFIGPATH "$ENV{APPDATA}/.volk")
+ endif()
+ endif()
+else()
+ message(STATUS "System profiling is disabled.")
+endif()
+message(STATUS " Modify using: -DENABLE_PROFILING=ON/OFF")
+
+########################################################################
+# Setup the library
+########################################################################
+add_subdirectory(lib)
+
+########################################################################
+# And the utility apps
+########################################################################
+add_subdirectory(apps)
+add_subdirectory(python/volk_modtool)
+
+########################################################################
+# Print summary
+########################################################################
+message(STATUS "Using install prefix: ${CMAKE_INSTALL_PREFIX}")
--- /dev/null
+ GNU GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users. We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors. You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights. Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received. You must make sure that they, too, receive
+or can get the source code. And you must show them these terms so they
+know their rights.
+
+ Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+ For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software. For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+ Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so. This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software. The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable. Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products. If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+ Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary. To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Use with the GNU Affero General Public License.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+ If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+ <program> Copyright (C) <year> <name of author>
+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+ You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+ The GNU General Public License does not permit incorporating your program
+into proprietary programs. If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License. But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
--- /dev/null
+# Doxyfile 1.8.6
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a double hash (##) is considered a comment and is placed in
+# front of the TAG it is preceding.
+#
+# All text after a single hash (#) is considered a comment and will be ignored.
+# The format is:
+# TAG = value [value, ...]
+# For lists, items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (\" \").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all text
+# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
+# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
+# for the list of possible encodings.
+# The default value is: UTF-8.
+
+DOXYFILE_ENCODING = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
+# double-quotes, unless you are using Doxywizard) that should identify the
+# project for which the documentation is generated. This name is used in the
+# title of most generated pages and in a few other places.
+# The default value is: My Project.
+
+PROJECT_NAME = "Vector Optimized Library of Kernels"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
+# could be handy for archiving the generated documentation or if some version
+# control system is used.
+
+PROJECT_NUMBER = @VERSION@
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give the viewer a
+# quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF = "Architecture-tuned implementations of math kernels"
+
+# With the PROJECT_LOGO tag one can specify a logo or icon that is included in
+# the documentation. The maximum height of the logo should not exceed 55 pixels
+# and the maximum width should not exceed 200 pixels. Doxygen will copy the logo
+# to the output directory.
+
+PROJECT_LOGO =
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
+# into which the generated documentation will be written. If a relative path is
+# entered, it will be relative to the location where doxygen was started. If
+# left blank the current directory will be used.
+
+# TODO: configure this to be a special docs directory. nw tried, but running
+# "make doc" won't create the directory, whereas running doxygen directly will. Why?
+
+OUTPUT_DIRECTORY =
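+
+# (One possible fix for the TODO above, untested here: have the CMake doc
+# target pre-create the directory, e.g. with
+#   file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/docs)
+# before invoking doxygen, and then set OUTPUT_DIRECTORY = @CMAKE_BINARY_DIR@/docs.)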
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 4096 sub-
+# directories (in 2 levels) under the output directory of each output format and
+# will distribute the generated files over these directories. Enabling this
+# option can be useful when feeding doxygen a huge amount of source files, where
+# putting all generated files in the same directory would otherwise cause
+# performance problems for the file system.
+# The default value is: NO.
+
+CREATE_SUBDIRS = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
+# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
+# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
+# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
+# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
+# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
+# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
+# Ukrainian and Vietnamese.
+# The default value is: English.
+
+OUTPUT_LANGUAGE = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES doxygen will include brief member
+# descriptions after the members that are listed in the file and class
+# documentation (similar to Javadoc). Set to NO to disable this.
+# The default value is: YES.
+
+BRIEF_MEMBER_DESC = YES
+
+# If the REPEAT_BRIEF tag is set to YES doxygen will prepend the brief
+# description of a member or function before the detailed description
+#
+# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+# The default value is: YES.
+
+REPEAT_BRIEF = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator that is
+# used to form the text in various listings. Each string in this list, if found
+# as the leading text of the brief description, will be stripped from the text
+# and the result, after processing the whole list, is used as the annotated
+# text. Otherwise, the brief description is used as-is. If left blank, the
+# following values are used ($name is automatically replaced with the name of
+# the entity): The $name class, The $name widget, The $name file, is, provides,
+# specifies, contains, represents, a, an and the.
+
+ABBREVIATE_BRIEF = "The $name class" \
+ "The $name widget" \
+ "The $name file" \
+ is \
+ provides \
+ specifies \
+ contains \
+ represents \
+ a \
+ an \
+ the
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# doxygen will generate a detailed section even if there is only a brief
+# description.
+# The default value is: NO.
+
+ALWAYS_DETAILED_SEC = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+# The default value is: NO.
+
+INLINE_INHERITED_MEMB = NO
+
+# If the FULL_PATH_NAMES tag is set to YES doxygen will prepend the full path
+# before file names in the file list and in the header files. If set to NO the
+# shortest path that makes the file name unique will be used
+# The default value is: YES.
+
+FULL_PATH_NAMES = YES
+
+# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
+# Stripping is only done if one of the specified strings matches the left-hand
+# part of the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the path to
+# strip.
+#
+# Note that you can specify absolute paths here, but also relative paths, which
+# will be relative from the directory where doxygen is started.
+# This tag requires that the tag FULL_PATH_NAMES is set to YES.
+
+STRIP_FROM_PATH =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
+# path mentioned in the documentation of a class, which tells the reader which
+# header file to include in order to use a class. If left blank only the name of
+# the header file containing the class definition is used. Otherwise one should
+# specify the list of include paths that are normally passed to the compiler
+# using the -I flag.
+
+STRIP_FROM_INC_PATH =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
+# less readable) file names. This can be useful if your file system doesn't
+# support long names like on DOS, Mac, or CD-ROM.
+# The default value is: NO.
+
+SHORT_NAMES = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
+# first line (until the first dot) of a Javadoc-style comment as the brief
+# description. If set to NO, the Javadoc-style will behave just like regular Qt-
+# style comments (thus requiring an explicit @brief command for a brief
+# description.)
+# The default value is: NO.
+
+JAVADOC_AUTOBRIEF = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
+# line (until the first dot) of a Qt-style comment as the brief description. If
+# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
+# requiring an explicit \brief command for a brief description.)
+# The default value is: NO.
+
+QT_AUTOBRIEF = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
+# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
+# a brief description. This used to be the default behavior. The new default is
+# to treat a multi-line C++ comment block as a detailed description. Set this
+# tag to YES if you prefer the old behavior instead.
+#
+# Note that setting this tag to YES also means that rational rose comments are
+# not recognized any more.
+# The default value is: NO.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
+# documentation from any documented member that it re-implements.
+# The default value is: YES.
+
+INHERIT_DOCS = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce a
+# new page for each member. If set to NO, the documentation of a member will be
+# part of the file/class/namespace that contains it.
+# The default value is: NO.
+
+SEPARATE_MEMBER_PAGES = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
+# uses this value to replace tabs by spaces in code fragments.
+# Minimum value: 1, maximum value: 16, default value: 4.
+
+TAB_SIZE = 4
+
+# This tag can be used to specify a number of aliases that act as commands in
+# the documentation. An alias has the form:
+# name=value
+# For example adding
+# "sideeffect=@par Side Effects:\n"
+# will allow you to put the command \sideeffect (or @sideeffect) in the
+# documentation, which will result in a user-defined paragraph with heading
+# "Side Effects:". You can put \n's in the value part of an alias to insert
+# newlines.
+
+ALIASES =
+
+# This tag can be used to specify a number of word-keyword mappings (TCL only).
+# A mapping has the form "name=value". For example adding "class=itcl::class"
+# will allow you to use the command class in the itcl::class meaning.
+
+TCL_SUBST =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
+# only. Doxygen will then generate output that is more tailored for C. For
+# instance, some of the names that are used will be different. The list of all
+# members will be omitted, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_FOR_C = YES
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
+# Python sources only. Doxygen will then generate output that is more tailored
+# for that language. For instance, namespaces will be presented as packages,
+# qualified scopes will look different, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_JAVA = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources. Doxygen will then generate output that is tailored for Fortran.
+# The default value is: NO.
+
+OPTIMIZE_FOR_FORTRAN = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for VHDL.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_VHDL = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given
+# extension. Doxygen has a built-in mapping, but you can override or extend it
+# using this tag. The format is ext=language, where ext is a file extension, and
+# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
+# C#, C, C++, D, PHP, Objective-C, Python, Fortran, VHDL. For instance to make
+# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C
+# (default is Fortran), use: inc=Fortran f=C.
+#
+# Note For files without extension you can use no_extension as a placeholder.
+#
+# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
+# the files are not read by doxygen.
+
+EXTENSION_MAPPING =
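+
+# For illustration only (this project keeps the mapping empty): the example
+# from the note above would be written as a setting like
+#   EXTENSION_MAPPING = inc=Fortran f=C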
+
+# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
+# according to the Markdown format, which allows for more readable
+# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you can
+# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
+# case of backward compatibility issues.
+# The default value is: YES.
+
+MARKDOWN_SUPPORT = YES
+
+# When enabled doxygen tries to link words that correspond to documented
+# classes, or namespaces to their corresponding documentation. Such a link can
+# be prevented in individual cases by putting a % sign in front of the word
+# or globally by setting AUTOLINK_SUPPORT to NO.
+# The default value is: YES.
+
+AUTOLINK_SUPPORT = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should set this
+# tag to YES in order to let doxygen match function declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string);
+# versus func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+# The default value is: NO.
+
+BUILTIN_STL_SUPPORT = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+# The default value is: NO.
+
+CPP_CLI_SUPPORT = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
+# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
+# will parse them like normal C++ but will assume all classes use public instead
+# of private inheritance when no explicit protection keyword is present.
+# The default value is: NO.
+
+SIP_SUPPORT = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate
+# getter and setter methods for a property. Setting this option to YES will make
+# doxygen replace the get and set methods by a property in the documentation.
+# This will only work if the methods are indeed getting or setting a simple
+# type. If this is not the case, or you want to show the methods anyway, you
+# should set this option to NO.
+# The default value is: YES.
+
+IDL_PROPERTY_SUPPORT = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+# The default value is: NO.
+
+DISTRIBUTE_GROUP_DOC = NO
+
+# Set the SUBGROUPING tag to YES to allow class member groups of the same type
+# (for instance a group of public functions) to be put as a subgroup of that
+# type (e.g. under the Public Functions section). Set it to NO to prevent
+# subgrouping. Alternatively, this can be done per class using the
+# \nosubgrouping command.
+# The default value is: YES.
+
+SUBGROUPING = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
+# are shown inside the group in which they are included (e.g. using \ingroup)
+# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
+# and RTF).
+#
+# Note that this feature does not work in combination with
+# SEPARATE_MEMBER_PAGES.
+# The default value is: NO.
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
+# with only public data fields or simple typedef fields will be shown inline in
+# the documentation of the scope in which they are defined (i.e. file,
+# namespace, or group documentation), provided this scope is documented. If set
+# to NO, structs, classes, and unions are shown on a separate page (for HTML and
+# Man pages) or section (for LaTeX and RTF).
+# The default value is: NO.
+
+INLINE_SIMPLE_STRUCTS = NO
+
+# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
+# enum is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically be
+# useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+# The default value is: NO.
+
+TYPEDEF_HIDES_STRUCT = NO
+
+# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
+# cache is used to resolve symbols given their name and scope. Since this can be
+# an expensive process and often the same symbol appears multiple times in the
+# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
+# doxygen will become slower. If the cache is too large, memory is wasted. The
+# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
+# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
+# symbols. At the end of a run doxygen will report the cache usage and suggest
+# the optimal cache size from a speed point of view.
+# Minimum value: 0, maximum value: 9, default value: 0.
+
+LOOKUP_CACHE_SIZE = 0
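+
+# Worked example of the formula above: a value of 2 would correspond to a
+# cache of 2^(16+2) = 262144 symbols, at the cost of some extra memory.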
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available. Private
+# class members and static file members will be hidden unless the
+# EXTRACT_PRIVATE and EXTRACT_STATIC tags, respectively, are set to YES.
+# Note: This will also disable the warnings about undocumented members that are
+# normally produced when WARNINGS is set to YES.
+# The default value is: NO.
+
+EXTRACT_ALL = YES
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class will
+# be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIVATE = NO
+
+# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal
+# scope will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PACKAGE = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file will be
+# included in the documentation.
+# The default value is: NO.
+
+EXTRACT_STATIC = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) defined
+# locally in source files will be included in the documentation. If set to NO
+# only classes defined in header files are included. Does not have any effect
+# for Java sources.
+# The default value is: YES.
+
+EXTRACT_LOCAL_CLASSES = YES
+
+# This flag is only useful for Objective-C code. When set to YES local methods,
+# which are defined in the implementation section but not in the interface are
+# included in the documentation. If set to NO only methods in the interface are
+# included.
+# The default value is: NO.
+
+EXTRACT_LOCAL_METHODS = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base name of
+# the file that contains the anonymous namespace. By default anonymous
+# namespaces are hidden.
+# The default value is: NO.
+
+EXTRACT_ANON_NSPACES = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
+# undocumented members inside documented classes or files. If set to NO these
+# members will be included in the various overviews, but no documentation
+# section is generated. This option has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_MEMBERS = YES
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy. If set
+# to NO these classes will be included in the various overviews. This option has
+# no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_CLASSES = YES
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
+# (class|struct|union) declarations. If set to NO these declarations will be
+# included in the documentation.
+# The default value is: NO.
+
+HIDE_FRIEND_COMPOUNDS = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
+# documentation blocks found inside the body of a function. If set to NO these
+# blocks will be appended to the function's detailed documentation block.
+# The default value is: NO.
+
+HIDE_IN_BODY_DOCS = NO
+
+# The INTERNAL_DOCS tag determines if documentation that is typed after a
+# \internal command is included. If the tag is set to NO then the documentation
+# will be excluded. Set it to YES to include the internal documentation.
+# The default value is: NO.
+
+INTERNAL_DOCS = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
+# names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+# The default value is: system dependent.
+
+CASE_SENSE_NAMES = NO
+
+# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
+# their full class and namespace scopes in the documentation. If set to YES the
+# scope will be hidden.
+# The default value is: NO.
+
+HIDE_SCOPE_NAMES = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
+# the files that are included by a file in the documentation of that file.
+# The default value is: YES.
+
+SHOW_INCLUDE_FILES = YES
+
+# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
+# grouped member an include statement to the documentation, telling the reader
+# which file to include in order to use the member.
+# The default value is: NO.
+
+SHOW_GROUPED_MEMB_INC = NO
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
+# files with double quotes in the documentation rather than with sharp brackets.
+# The default value is: NO.
+
+FORCE_LOCAL_INCLUDES = NO
+
+# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
+# documentation for inline members.
+# The default value is: YES.
+
+INLINE_INFO = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
+# (detailed) documentation of file and class members alphabetically by member
+# name. If set to NO the members will appear in declaration order.
+# The default value is: YES.
+
+SORT_MEMBER_DOCS = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
+# descriptions of file, namespace and class members alphabetically by member
+# name. If set to NO the members will appear in declaration order. Note that
+# this will also influence the order of the classes in the class list.
+# The default value is: NO.
+
+SORT_BRIEF_DOCS = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
+# (brief and detailed) documentation of class members so that constructors and
+# destructors are listed first. If set to NO the constructors will appear in the
+# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
+# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
+# member documentation.
+# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
+# detailed member documentation.
+# The default value is: NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
+# of group names into alphabetical order. If set to NO the group names will
+# appear in their defined order.
+# The default value is: NO.
+
+SORT_GROUP_NAMES = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
+# fully-qualified names, including namespaces. If set to NO, the class list will
+# be sorted only by class name, not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the alphabetical
+# list.
+# The default value is: NO.
+
+SORT_BY_SCOPE_NAME = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
+# type resolution of all parameters of a function it will reject a match between
+# the prototype and the implementation of a member function even if there is
+# only one candidate or it is obvious which candidate to choose by doing a
+# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
+# accept a match between prototype and implementation in such cases.
+# The default value is: NO.
+
+STRICT_PROTO_MATCHING = NO
+
+# The GENERATE_TODOLIST tag can be used to enable ( YES) or disable ( NO) the
+# todo list. This list is created by putting \todo commands in the
+# documentation.
+# The default value is: YES.
+
+GENERATE_TODOLIST = YES
+
+# The GENERATE_TESTLIST tag can be used to enable ( YES) or disable ( NO) the
+# test list. This list is created by putting \test commands in the
+# documentation.
+# The default value is: YES.
+
+GENERATE_TESTLIST = YES
+
+# The GENERATE_BUGLIST tag can be used to enable ( YES) or disable ( NO) the bug
+# list. This list is created by putting \bug commands in the documentation.
+# The default value is: YES.
+
+GENERATE_BUGLIST = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable ( YES) or disable ( NO)
+# the deprecated list. This list is created by putting \deprecated commands in
+# the documentation.
+# The default value is: YES.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional documentation
+# sections, marked by \if <section_label> ... \endif and \cond <section_label>
+# ... \endcond blocks.
+
+ENABLED_SECTIONS =
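+
+# For illustration only (the label "internal_docs" is hypothetical): enabling a
+# conditional section would look like
+#   ENABLED_SECTIONS = internal_docs
+# which activates blocks written as \if internal_docs ... \endif.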
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
+# initial value of a variable or macro / define can have for it to appear in the
+# documentation. If the initializer consists of more lines than specified here
+# it will be hidden. Use a value of 0 to hide initializers completely. The
+# appearance of the value of individual variables and macros / defines can be
+# controlled using \showinitializer or \hideinitializer command in the
+# documentation regardless of this setting.
+# Minimum value: 0, maximum value: 10000, default value: 30.
+
+MAX_INITIALIZER_LINES = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
+# the bottom of the documentation of classes and structs. If set to YES the list
+# will mention the files that were used to generate the documentation.
+# The default value is: YES.
+
+SHOW_USED_FILES = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
+# will remove the Files entry from the Quick Index and from the Folder Tree View
+# (if specified).
+# The default value is: YES.
+
+SHOW_FILES = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
+# page. This will remove the Namespaces entry from the Quick Index and from the
+# Folder Tree View (if specified).
+# The default value is: YES.
+
+SHOW_NAMESPACES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output is used
+# as the file version. For an example see the documentation.
+
+FILE_VERSION_FILTER =
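+
+# Hypothetical example (not used here): a git based filter could be set as
+#   FILE_VERSION_FILTER = "git log -n 1 --pretty=format:%h --"
+# doxygen appends the input file name, so the printed hash of the last commit
+# touching that file would be used as its version.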
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option. You can
+# optionally specify a file name after the option, if omitted DoxygenLayout.xml
+# will be used as the name of the layout file.
+#
+# Note that if you run doxygen from a directory containing a file called
+# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
+# tag is left empty.
+
+LAYOUT_FILE = @CMAKE_SOURCE_DIR@/DoxygenLayout.xml
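+
+# Should the referenced DoxygenLayout.xml ever need to be regenerated, the
+# default layout can be produced as described above with
+#   doxygen -l DoxygenLayout.xml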
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
+# the reference definitions. This must be a list of .bib files. The .bib
+# extension is automatically appended if omitted. This requires the bibtex tool
+# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
+# For LaTeX the style of the bibliography can be controlled using
+# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
+# search path. Do not use file names with spaces, bibtex cannot handle them. See
+# also \cite for info how to create references.
+
+CITE_BIB_FILES =
+
+#---------------------------------------------------------------------------
+# Configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated to
+# standard output by doxygen. If QUIET is set to YES this implies that the
+# messages are off.
+# The default value is: NO.
+
+QUIET = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated to standard error ( stderr) by doxygen. If WARNINGS is set to YES
+# this implies that the warnings are on.
+#
+# Tip: Turn warnings on while writing the documentation.
+# The default value is: YES.
+
+WARNINGS = YES
+
+# If the WARN_IF_UNDOCUMENTED tag is set to YES, then doxygen will generate
+# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
+# will automatically be disabled.
+# The default value is: YES.
+
+WARN_IF_UNDOCUMENTED = YES
+
+# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some parameters
+# in a documented function, or documenting parameters that don't exist or using
+# markup commands wrongly.
+# The default value is: YES.
+
+WARN_IF_DOC_ERROR = YES
+
+# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
+# are documented, but have no documentation for their parameters or return
+# value. If set to NO doxygen will only warn about wrong or incomplete parameter
+# documentation, but not about the absence of documentation.
+# The default value is: NO.
+
+WARN_NO_PARAMDOC = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that doxygen
+# can produce. The string should contain the $file, $line, and $text tags, which
+# will be replaced by the file and line number from which the warning originated
+# and the warning text. Optionally the format may contain $version, which will
+# be replaced by the version of the file (if it could be obtained via
+# FILE_VERSION_FILTER)
+# The default value is: $file:$line: $text.
+
+WARN_FORMAT = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning and error
+# messages should be written. If left blank the output is written to standard
+# error (stderr).
+
+WARN_LOGFILE =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag is used to specify the files and/or directories that contain
+# documented source files. You may enter file names like myfile.cpp or
+# directories like /usr/src/myproject. Separate the files or directories with
+# spaces.
+# Note: If this tag is empty the current directory is searched.
+
+INPUT = @CMAKE_SOURCE_DIR@
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
+# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
+# documentation (see: http://www.gnu.org/software/libiconv) for the list of
+# possible encodings.
+# The default value is: UTF-8.
+
+INPUT_ENCODING = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank the
+# following patterns are tested: *.c, *.cc, *.cxx, *.cpp, *.c++, *.java, *.ii,
+# *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, *.hh, *.hxx, *.hpp,
+# *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, *.m, *.markdown,
+# *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf,
+# *.qsf, *.as and *.js.
+
+FILE_PATTERNS = *.c \
+ *.cc \
+ *.cxx \
+ *.cpp \
+ *.c++ \
+ *.java \
+ *.ii \
+ *.ixx \
+ *.ipp \
+ *.i++ \
+ *.inl \
+ *.idl \
+ *.ddl \
+ *.odl \
+ *.h \
+ *.hh \
+ *.hxx \
+ *.hpp \
+ *.h++ \
+ *.cs \
+ *.d \
+ *.php \
+ *.php4 \
+ *.php5 \
+ *.phtml \
+ *.inc \
+ *.m \
+ *.markdown \
+ *.md \
+ *.mm \
+ *.dox \
+ *.py \
+ *.f90 \
+ *.f \
+ *.for \
+ *.tcl \
+ *.vhd \
+ *.vhdl \
+ *.ucf \
+ *.qsf \
+ *.as \
+ *.js
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories should
+# be searched for input files as well.
+# The default value is: NO.
+
+RECURSIVE = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+#
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE = @CMAKE_BINARY_DIR@
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+# The default value is: NO.
+
+EXCLUDE_SYMLINKS = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories.
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories for example use the pattern */test/*
+
+EXCLUDE_PATTERNS =
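+
+# For illustration only: excluding every test directory, as mentioned in the
+# note above, would be written as
+#   EXCLUDE_PATTERNS = */test/*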
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories use the pattern */test/*
+
+EXCLUDE_SYMBOLS =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or directories
+# that contain example code fragments that are included (see the \include
+# command).
+
+EXAMPLE_PATH =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank all
+# files are included.
+
+EXAMPLE_PATTERNS = *
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude commands
+# irrespective of the value of the RECURSIVE tag.
+# The default value is: NO.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+# <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS =
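+
+# For illustration only (the filter name below is hypothetical): running all
+# .py files through a custom filter would be written as
+#   FILTER_PATTERNS = *.py=my_python_filter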
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER ) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
+
+FILTER_SOURCE_FILES = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+
+FILTER_SOURCE_PATTERNS =
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
+# is part of the input, its contents will be placed on the main page
+# (index.html). This can be useful if you have a project on for instance GitHub
+# and want to reuse the introduction page also for the doxygen output.
+
+USE_MDFILE_AS_MAINPAGE =
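+
+# Hypothetical example: if a README.md inside the INPUT tree should become the
+# main page, this would be set as
+#   USE_MDFILE_AS_MAINPAGE = README.md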
+
+#---------------------------------------------------------------------------
+# Configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
+# generated. Documented entities will be cross-referenced with these sources.
+#
+# Note: To get rid of all source code in the generated output, make sure that
+# VERBATIM_HEADERS is also set to NO.
+# The default value is: NO.
+
+SOURCE_BROWSER = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body of functions,
+# classes and enums directly into the documentation.
+# The default value is: NO.
+
+INLINE_SOURCES = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
+# special comment blocks from generated source code fragments. Normal C, C++ and
+# Fortran comments will always remain visible.
+# The default value is: YES.
+
+STRIP_CODE_COMMENTS = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
+# function all documented functions referencing it will be listed.
+# The default value is: NO.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES then for each documented function
+# all documented entities called/used by that function will be listed.
+# The default value is: NO.
+
+REFERENCES_RELATION = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
+# to YES, then the hyperlinks from functions in REFERENCES_RELATION and
+# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
+# link to the documentation.
+# The default value is: YES.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
+# source code will show a tooltip with additional information such as prototype,
+# brief description and links to the definition and documentation. Since this
+# will make the HTML file larger and loading of large files a bit slower, you
+# can opt to disable this feature.
+# The default value is: YES.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+SOURCE_TOOLTIPS = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code will
+# point to the HTML generated by the htags(1) tool instead of doxygen built-in
+# source browser. The htags tool is part of GNU's global source tagging system
+# (see http://www.gnu.org/software/global/global.html). You will need version
+# 4.8.6 or higher.
+#
+# To use it do the following:
+# - Install the latest version of global
+# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
+# - Make sure the INPUT points to the root of the source tree
+# - Run doxygen as normal
+#
+# Doxygen will invoke htags (and that will in turn invoke gtags), so these
+# tools must be available from the command line (i.e. in the search path).
+#
+# The result: instead of the source browser generated by doxygen, the links to
+# source code will now point to the output of htags.
+# The default value is: NO.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+USE_HTAGS = NO
+
+# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a
+# verbatim copy of the header file for each class for which an include is
+# specified. Set to NO to disable this.
+# See also: Section \class.
+# The default value is: YES.
+
+VERBATIM_HEADERS = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
+# compounds will be generated. Enable this if the project contains a lot of
+# classes, structs, unions or interfaces.
+# The default value is: YES.
+
+ALPHABETICAL_INDEX = YES
+
+# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
+# which the alphabetical index list will be split.
+# Minimum value: 1, maximum value: 20, default value: 5.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+COLS_IN_ALPHA_INDEX = 5
+
+# In case all classes in a project start with a common prefix, all classes will
+# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
+# can be used to specify a prefix (or a list of prefixes) that should be ignored
+# while generating the index headers.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+IGNORE_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES doxygen will generate HTML output
+# The default value is: YES.
+
+GENERATE_HTML = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_OUTPUT = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
+# generated HTML page (for example: .htm, .php, .asp).
+# The default value is: .html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FILE_EXTENSION = .html
+
+# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
+# each generated HTML page. If the tag is left blank doxygen will generate a
+# standard header.
+#
+# To get valid HTML the header file must include any scripts and style sheets
+# that doxygen needs, which depend on the configuration options used (e.g.
+# the setting GENERATE_TREEVIEW). It is highly recommended to start with a
+# default header using
+# doxygen -w html new_header.html new_footer.html new_stylesheet.css
+# YourConfigFile
+# and then modify the file new_header.html. See also section "Doxygen usage"
+# for information on how to generate the default header that doxygen normally
+# uses.
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. For a description
+# of the possible markers and block names see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_HEADER =
+
+# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
+# generated HTML page. If the tag is left blank doxygen will generate a standard
+# footer. See HTML_HEADER for more information on how to generate a default
+# footer and what special commands can be used inside the footer. See also
+# section "Doxygen usage" for information on how to generate the default footer
+# that doxygen normally uses.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FOOTER =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
+# sheet that is used by each HTML page. It can be used to fine-tune the look of
+# the HTML output. If left blank doxygen will generate a default style sheet.
+# See also section "Doxygen usage" for information on how to generate the style
+# sheet that doxygen normally uses.
+# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
+# it is more robust and this tag (HTML_STYLESHEET) will in the future become
+# obsolete.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_STYLESHEET =
+
+# The HTML_EXTRA_STYLESHEET tag can be used to specify an additional user-
+# defined cascading style sheet that is included after the standard style sheets
+# created by doxygen. Using this option one can overrule certain style aspects.
+# This is preferred over using HTML_STYLESHEET since it does not replace the
+# standard style sheet and is therefore more robust against future updates.
+# Doxygen will copy the style sheet file to the output directory. For an example
+# see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_STYLESHEET =
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
+# files will be copied as-is; there are no commands or markers available.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_FILES =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
+# will adjust the colors in the stylesheet and background images according to
+# this color. Hue is specified as an angle on a colorwheel, see
+# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
+# is purple, and 360 is red again.
+# Minimum value: 0, maximum value: 359, default value: 220.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_HUE = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
+# in the HTML output. For a value of 0 the output will use grayscales only. A
+# value of 255 will produce the most vivid colors.
+# Minimum value: 0, maximum value: 255, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_SAT = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
+# luminance component of the colors in the HTML output. Values below 100
+# gradually make the output lighter, whereas values above 100 make the output
+# darker. The value divided by 100 is the actual gamma applied, so 80 represents
+# a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not
+# change the gamma.
+# Minimum value: 40, maximum value: 240, default value: 80.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_GAMMA = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting this
+# to NO can help when comparing the output of multiple runs.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_TIMESTAMP = NO
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_SECTIONS = NO
+
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
+# shown in the various tree structured indices initially; the user can expand
+# and collapse entries dynamically later on. Doxygen will expand the tree to
+# such a level that at most the specified number of entries are visible (unless
+# a fully collapsed tree already exceeds this amount). So setting the number of
+# entries to 1 will produce a fully collapsed tree by default. 0 is a special
+# value representing an infinite number of entries and will result in a fully
+# expanded tree by default.
+# Minimum value: 0, maximum value: 9999, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_INDEX_NUM_ENTRIES = 100
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files will be
+# generated that can be used as input for Apple's Xcode 3 integrated development
+# environment (see: http://developer.apple.com/tools/xcode/), introduced with
+# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
+# Makefile in the HTML output directory. Running make will produce the docset in
+# that directory and running make install will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
+# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_DOCSET = NO
+
+# This tag determines the name of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# The default value is: Doxygen generated docs.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDNAME = "Doxygen generated docs"
+
+# This tag specifies a string that should uniquely identify the documentation
+# set bundle. This should be a reverse domain-name style string, e.g.
+# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_BUNDLE_ID = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+# The default value is: org.doxygen.Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_ID = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+# The default value is: Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_NAME = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
+# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
+# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
+# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
+# Windows.
+#
+# The HTML Help Workshop contains a compiler that can convert all HTML output
+# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
+# files are now used as the Windows 98 help format, and will replace the old
+# Windows help format (.hlp) on all Windows platforms in the future. Compressed
+# HTML files also contain an index, a table of contents, and you can search for
+# words in the documentation. The HTML workshop also contains a viewer for
+# compressed HTML files.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_HTMLHELP = NO
+
+# The CHM_FILE tag can be used to specify the file name of the resulting .chm
+# file. You can add a path in front of the file if the result should not be
+# written to the html output directory.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_FILE =
+
+# The HHC_LOCATION tag can be used to specify the location (absolute path
+# including file name) of the HTML help compiler ( hhc.exe). If non-empty
+# doxygen will try to run the HTML help compiler on the generated index.hhp.
+# The file has to be specified with full path.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+HHC_LOCATION =
+
+# The GENERATE_CHI flag controls whether a separate .chi index file is generated
+# (YES) or included in the master .chm file (NO).
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+GENERATE_CHI = NO
+
+# The CHM_INDEX_ENCODING is used to encode HtmlHelp index ( hhk), content ( hhc)
+# and project file content.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_INDEX_ENCODING =
+
+# The BINARY_TOC flag controls whether a binary table of contents is generated
+# (YES) or a normal table of contents (NO) in the .chm file.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+BINARY_TOC = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members to
+# the table of contents of the HTML help documentation and to the tree view.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+TOC_EXPAND = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
+# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
+# (.qch) of the generated HTML documentation.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_QHP = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
+# the file name of the resulting .qch file. The path specified is relative to
+# the HTML output folder.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QCH_FILE =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
+# Project output. For more information please see Qt Help Project / Namespace
+# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_NAMESPACE = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
+# Help Project output. For more information please see Qt Help Project / Virtual
+# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
+# folders).
+# The default value is: doc.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_VIRTUAL_FOLDER = doc
+
+# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
+# filter to add. For more information please see Qt Help Project / Custom
+# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
+# filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_NAME =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see Qt Help Project / Custom
+# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
+# filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_ATTRS =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's filter section matches. Qt Help Project / Filter Attributes (see:
+# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_SECT_FILTER_ATTRS =
+
+# The QHG_LOCATION tag can be used to specify the location of Qt's
+# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
+# generated .qhp file.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHG_LOCATION =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
+# generated, together with the HTML files, they form an Eclipse help plugin. To
+# install this plugin and make it available under the help contents menu in
+# Eclipse, the contents of the directory containing the HTML and XML files needs
+# to be copied into the plugins directory of Eclipse. The name of the directory
+# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
+# After copying, Eclipse needs to be restarted before the help appears.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_ECLIPSEHELP = NO
+
+# A unique identifier for the Eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have this
+# name. Each documentation set should have its own identifier.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
+
+ECLIPSE_DOC_ID = org.doxygen.Project
+
+# If you want full control over the layout of the generated HTML pages it might
+# be necessary to disable the index and replace it with your own. The
+# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
+# of each HTML page. A value of NO enables the index and the value YES disables
+# it. Since the tabs in the index contain the same information as the navigation
+# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+DISABLE_INDEX = YES
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information. If the tag
+# value is set to YES, a side panel will be generated containing a tree-like
+# index structure (just like the one that is generated for HTML Help). For this
+# to work a browser that supports JavaScript, DHTML, CSS and frames is required
+# (i.e. any modern browser). Windows users are probably better off using the
+# HTML help feature. Via custom stylesheets (see HTML_EXTRA_STYLESHEET) one can
+# further fine-tune the look of the index. As an example, the default style
+# sheet generated by doxygen has an example that shows how to put an image at
+# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
+# the same information as the tab index, you could consider setting
+# DISABLE_INDEX to YES when enabling this option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_TREEVIEW = YES
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
+# doxygen will group on one line in the generated HTML documentation.
+#
+# Note that a value of 0 will completely suppress the enum values from appearing
+# in the overview section.
+# Minimum value: 0, maximum value: 20, default value: 4.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+ENUM_VALUES_PER_LINE = 4
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
+# to set the initial width (in pixels) of the frame in which the tree is shown.
+# Minimum value: 0, maximum value: 1500, default value: 250.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+TREEVIEW_WIDTH = 250
+
+# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open links to
+# external symbols imported via tag files in a separate window.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+EXT_LINKS_IN_WINDOW = NO
+
+# Use this tag to change the font size of LaTeX formulas included as images in
+# the HTML documentation. When you change the font size after a successful
+# doxygen run you need to manually remove any form_*.png images from the HTML
+# output directory to force them to be regenerated.
+# Minimum value: 8, maximum value: 50, default value: 10.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_FONTSIZE = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are not
+# supported properly for IE 6.0, but are supported on all modern browsers.
+#
+# Note that when changing this option you need to delete any form_*.png files in
+# the HTML output directory before the changes take effect.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_TRANSPARENT = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
+# http://www.mathjax.org) which uses client side Javascript for the rendering
+# instead of using prerendered bitmaps. Use this if you do not have LaTeX
+# installed or if you want the formulas to look prettier in the HTML output. When
+# enabled you may also need to install MathJax separately and configure the path
+# to it using the MATHJAX_RELPATH option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+USE_MATHJAX = NO
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. See the MathJax site (see:
+# http://docs.mathjax.org/en/latest/output.html) for more details.
+# Possible values are: HTML-CSS (which is slower, but has the best
+# compatibility), NativeMML (i.e. MathML) and SVG.
+# The default value is: HTML-CSS.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_FORMAT = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the HTML
+# output directory using the MATHJAX_RELPATH option. The destination directory
+# should contain the MathJax.js script. For instance, if the mathjax directory
+# is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
+# Content Delivery Network so you can quickly see the result without installing
+# MathJax. However, it is strongly recommended to install a local copy of
+# MathJax from http://www.mathjax.org before deployment.
+# The default value is: http://cdn.mathjax.org/mathjax/latest.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering. For example
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS =
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow; in that case
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down
+# key> to jump into the search results window, the results can be navigated
+# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel
+# the search. The filter options can be selected when the cursor is inside the
+# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys>
+# to select a filter and <Enter> or <escape> to activate or cancel the filter
+# option.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+SEARCHENGINE = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a web server instead of a web client using Javascript. There
+# are two flavours of web server based searching depending on the
+# EXTERNAL_SEARCH setting. When disabled, doxygen will generate a PHP script for
+# searching and an index file used by the script. When EXTERNAL_SEARCH is
+# enabled the indexing and searching needs to be provided by external tools. See
+# the section "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SERVER_BASED_SEARCH = NO
+
+# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
+# script for searching. Instead the search results are written to an XML file
+# which needs to be processed by an external indexer. Doxygen will invoke an
+# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
+# search results.
+#
+# Doxygen ships with an example indexer ( doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: http://xapian.org/).
+#
+# See the section "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH = NO
+
+# The SEARCHENGINE_URL should point to a search engine hosted by a web server
+# which will return the search results when EXTERNAL_SEARCH is enabled.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: http://xapian.org/). See the section "External Indexing and
+# Searching" for details.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHENGINE_URL =
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
+# search data is written to a file for indexing by an external tool. With the
+# SEARCHDATA_FILE tag the name of this file can be specified.
+# The default file is: searchdata.xml.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHDATA_FILE = searchdata.xml
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
+# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
+# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
+# projects and redirect the results back to the right project.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH_ID =
+
+# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
+# projects other than the one defined by this configuration file, but that are
+# all added to the same external search index. Each project needs to have a
+# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id of
+# each project to a relative location where the documentation can be found. The
+# format is:
+# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTRA_SEARCH_MAPPINGS =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES doxygen will generate LaTeX output.
+# The default value is: YES.
+
+GENERATE_LATEX = NO
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_OUTPUT = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked.
+#
+# Note that when enabling USE_PDFLATEX this option is only used for generating
+# bitmaps for formulas in the HTML output, but not in the Makefile that is
+# written to the output directory.
+# The default file is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_CMD_NAME = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
+# index for LaTeX.
+# The default file is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+MAKEINDEX_CMD_NAME = makeindex
+
+# If the COMPACT_LATEX tag is set to YES doxygen generates more compact LaTeX
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+COMPACT_LATEX = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used by the
+# printer.
+# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
+# 14 inches) and executive (7.25 x 10.5 inches).
+# The default value is: a4.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PAPER_TYPE = a4
+
+# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
+# that should be included in the LaTeX output. To get the times font for
+# instance you can specify
+# EXTRA_PACKAGES=times
+# If left blank no extra packages will be included.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+EXTRA_PACKAGES =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
+# generated LaTeX document. The header should contain everything until the first
+# chapter. If it is left blank doxygen will generate a standard header. See
+# section "Doxygen usage" for information on how to let doxygen write the
+# default header to a separate file.
+#
+# Note: Only use a user-defined header if you know what you are doing! The
+# following commands have a special meaning inside the header: $title,
+# $datetime, $date, $doxygenversion, $projectname, $projectnumber. Doxygen will
+# replace them by respectively the title of the page, the current date and time,
+# only the current date, the version number of doxygen, the project name (see
+# PROJECT_NAME), or the project number (see PROJECT_NUMBER).
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HEADER =
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
+# generated LaTeX document. The footer should contain everything after the last
+# chapter. If it is left blank doxygen will generate a standard footer.
+#
+# Note: Only use a user-defined footer if you know what you are doing!
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_FOOTER =
+
+# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the LATEX_OUTPUT output
+# directory. Note that the files will be copied as-is; there are no commands or
+# markers available.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_FILES =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
+# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
+# contain links (just like the HTML output) instead of page references. This
+# makes the output suitable for online browsing using a PDF viewer.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PDF_HYPERLINKS = YES
+
+# If the LATEX_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
+# the PDF file directly from the LaTeX files. Set this option to YES to get a
+# higher quality PDF documentation.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+USE_PDFLATEX = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep running
+# if errors occur, instead of asking the user for help. This option is also used
+# when generating formulas in HTML.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BATCHMODE = NO
+
+# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
+# index chapters (such as File Index, Compound Index, etc.) in the output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HIDE_INDICES = NO
+
+# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
+# code with syntax highlighting in the LaTeX output.
+#
+# Note that which sources are shown also depends on other settings such as
+# SOURCE_BROWSER.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_SOURCE_CODE = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. See
+# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# The default value is: plain.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BIB_STYLE = plain
+
+#---------------------------------------------------------------------------
+# Configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES doxygen will generate RTF output. The
+# RTF output is optimized for Word 97 and may not look too pretty with other RTF
+# readers/editors.
+# The default value is: NO.
+
+GENERATE_RTF = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: rtf.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_OUTPUT = rtf
+
+# If the COMPACT_RTF tag is set to YES doxygen generates more compact RTF
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+COMPACT_RTF = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
+# contain hyperlink fields. The RTF file will contain links (just like the HTML
+# output) instead of page references. This makes the output suitable for online
+# browsing using Word or some other Word compatible readers that support those
+# fields.
+#
+# Note: WordPad (write) and others do not support links.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_HYPERLINKS = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's config
+# file, i.e. a series of assignments. You only have to provide replacements;
+# missing definitions are set to their default value.
+#
+# See also section "Doxygen usage" for information on how to generate the
+# default style sheet that doxygen normally uses.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_STYLESHEET_FILE =
+
+# Set optional variables used in the generation of an RTF document. Syntax is
+# similar to doxygen's config file. A template extensions file can be generated
+# using doxygen -e rtf extensionFile.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_EXTENSIONS_FILE =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES doxygen will generate man pages for
+# classes and files.
+# The default value is: NO.
+
+GENERATE_MAN = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it. A directory man3 will be created inside the directory specified by
+# MAN_OUTPUT.
+# The default directory is: man.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_OUTPUT = man
+
+# The MAN_EXTENSION tag determines the extension that is added to the generated
+# man pages. In case the manual section does not start with a number, the number
+# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
+# optional.
+# The default value is: .3.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_EXTENSION = .3
+
+# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
+# will generate one additional man file for each entity documented in the real
+# man page(s). These additional files only source the real man page, but without
+# them the man command would be unable to find the correct page.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_LINKS = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES doxygen will generate an XML file that
+# captures the structure of the code including all documentation.
+# The default value is: NO.
+
+GENERATE_XML = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: xml.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_OUTPUT = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema, which can be used by
+# validating XML parser to check the syntax of the XML files.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_SCHEMA =
+
+# The XML_DTD tag can be used to specify an XML DTD, which can be used by a
+# validating XML parser to check the syntax of the XML files.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_DTD =
+
+# If the XML_PROGRAMLISTING tag is set to YES doxygen will dump the program
+# listings (including syntax highlighting and cross-referencing information) to
+# the XML output. Note that enabling this will significantly increase the size
+# of the XML output.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_PROGRAMLISTING = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to the DOCBOOK output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_DOCBOOK tag is set to YES doxygen will generate Docbook files
+# that can be used to generate PDF.
+# The default value is: NO.
+
+GENERATE_DOCBOOK = NO
+
+# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
+# front of it.
+# The default directory is: docbook.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+DOCBOOK_OUTPUT = docbook
+
+#---------------------------------------------------------------------------
+# Configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES doxygen will generate an AutoGen
+# Definitions (see http://autogen.sf.net) file that captures the structure of
+# the code including all documentation. Note that this feature is still
+# experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_AUTOGEN_DEF = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES doxygen will generate a Perl module
+# file that captures the structure of the code including all documentation.
+#
+# Note that this feature is still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_PERLMOD = NO
+
+# If the PERLMOD_LATEX tag is set to YES doxygen will generate the necessary
+# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
+# output from the Perl module output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_LATEX = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be nicely
+# formatted so it can be parsed by a human reader. This is useful if you want to
+# understand what is going on. On the other hand, if this tag is set to NO the
+# size of the Perl module output will be much smaller and Perl will parse it
+# just the same.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_PRETTY = YES
+
+# The names of the make variables in the generated doxyrules.make file are
+# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
+# so different doxyrules.make files included by the same Makefile don't
+# overwrite each other's variables.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES doxygen will evaluate all
+# C-preprocessor directives found in the sources and include files.
+# The default value is: YES.
+
+ENABLE_PREPROCESSING = YES
+
+# If the MACRO_EXPANSION tag is set to YES doxygen will expand all macro names
+# in the source code. If set to NO only conditional compilation will be
+# performed. Macro expansion can be done in a controlled way by setting
+# EXPAND_ONLY_PREDEF to YES.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+MACRO_EXPANSION = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
+# the macro expansion is limited to the macros specified with the PREDEFINED and
+# EXPAND_AS_DEFINED tags.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_ONLY_PREDEF = NO
+
+# If the SEARCH_INCLUDES tag is set to YES the includes files in the
+# INCLUDE_PATH will be searched if a #include is found.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SEARCH_INCLUDES = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by the
+# preprocessor.
+# This tag requires that the tag SEARCH_INCLUDES is set to YES.
+
+INCLUDE_PATH =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will be
+# used.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+INCLUDE_FILE_PATTERNS =
+
+# The PREDEFINED tag can be used to specify one or more macro names that are
+# defined before the preprocessor is started (similar to the -D option of e.g.
+# gcc). The argument of the tag is a list of macros of the form: name or
+# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
+# is assumed. To prevent a macro definition from being undefined via #undef or
+# recursively expanded use the := operator instead of the = operator.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+PREDEFINED = LV_HAVE_GENERIC \
+ LV_HAVE_SSE \
+ LV_HAVE_SSE2 \
+ LV_HAVE_NEON \
+ LV_HAVE_AVX \
+ LV_HAVE_SSE4_2 \
+ LV_HAVE_SSE3 \
+ LV_HAVE_SSSE3
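+# These LV_HAVE_* macros mirror the preprocessor guards that VOLK kernel
+# headers place around each proto-kernel, so predefining them lets doxygen
+# document every SIMD variant rather than only the generic one. A minimal
+# sketch of such a guard (kernel and parameter names are illustrative only):
+#
+#   #ifdef LV_HAVE_SSE
+#   static inline void volk_32f_x2_example_32f_a_sse(float* out,
+#       const float* in0, const float* in1, unsigned int num_points);
+#   #endif /* LV_HAVE_SSE */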
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
+# tag can be used to specify a list of macro names that should be expanded. The
+# macro definition that is found in the sources will be used. Use the PREDEFINED
+# tag if you want to use a different macro definition that overrules the
+# definition found in the source code.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_AS_DEFINED =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
+# remove all references to function-like macros that are alone on a line, have an
+# all uppercase name, and do not end with a semicolon. Such function macros are
+# typically used for boiler-plate code, and will confuse the parser if not
+# removed.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SKIP_FUNCTION_MACROS = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES tag can be used to specify one or more tag files. For each tag
+# file the location of the external documentation should be added. The format of
+# a tag file without this location is as follows:
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where loc1 and loc2 can be relative or absolute paths or URLs. See the
+# section "Linking to external documentation" for more information about the use
+# of tag files.
+# Note: Each tag file must have a unique name (where the name does NOT include
+# the path). If a tag file is not located in the directory in which doxygen is
+# run, you must also specify the path to the tagfile here.
+
+TAGFILES =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
+# tag file that is based on the input files it reads. See section "Linking to
+# external documentation" for more information about the usage of tag files.
+
+GENERATE_TAGFILE =
+
+# If the ALLEXTERNALS tag is set to YES all external class will be listed in the
+# class index. If set to NO only the inherited external classes will be listed.
+# The default value is: NO.
+
+ALLEXTERNALS = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed in
+# the modules index. If set to NO, only the current project's groups will be
+# listed.
+# The default value is: YES.
+
+EXTERNAL_GROUPS = YES
+
+# If the EXTERNAL_PAGES tag is set to YES all external pages will be listed in
+# the related pages index. If set to NO, only the current project's pages will
+# be listed.
+# The default value is: YES.
+
+EXTERNAL_PAGES = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of 'which perl').
+# The default file (with absolute path) is: /usr/bin/perl.
+
+PERL_PATH = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES doxygen will generate a class diagram
+# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
+# NO turns the diagrams off. Note that this option also works with HAVE_DOT
+# disabled, but it is recommended to install and use dot, since it yields more
+# powerful graphs.
+# The default value is: YES.
+
+CLASS_DIAGRAMS = NO
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see:
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH =
+
+# You can include diagrams made with dia in doxygen documentation. Doxygen will
+# then run dia to produce the diagram and insert it in the documentation. The
+# DIA_PATH tag allows you to specify the directory where the dia binary resides.
+# If left empty dia is assumed to be found in the default search path.
+
+DIA_PATH =
+
+# If set to YES, the inheritance and collaboration graphs will hide inheritance
+# and usage relations if the target is undocumented or is not a class.
+# The default value is: YES.
+
+HIDE_UNDOC_RELATIONS = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz (see:
+# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
+# Bell Labs. The other options in this section have no effect if this option is
+# set to NO
+# The default value is: NO.
+
+HAVE_DOT = NO
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
+# to run in parallel. When set to 0 doxygen will base this on the number of
+# processors available in the system. You can set it explicitly to a value
+# larger than 0 to get control over the balance between CPU load and processing
+# speed.
+# Minimum value: 0, maximum value: 32, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_NUM_THREADS = 0
+
+# When you want a differently looking font in the dot files that doxygen
+# generates you can specify the font name using DOT_FONTNAME. You need to make
+# sure dot is able to find the font, which can be done by putting it in a
+# standard location or by setting the DOTFONTPATH environment variable or by
+# setting DOT_FONTPATH to the directory containing the font.
+# The default value is: Helvetica.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTNAME = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
+# dot graphs.
+# Minimum value: 4, maximum value: 24, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTSIZE = 10
+
+# By default doxygen will tell dot to use the default font as specified with
+# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
+# the path where dot can find it using this tag.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTPATH =
+
+# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
+# each documented class showing the direct and indirect inheritance relations.
+# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CLASS_GRAPH = YES
+
+# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
+# graph for each documented class showing the direct and indirect implementation
+# dependencies (inheritance, containment, and class references variables) of the
+# class with other documented classes.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+COLLABORATION_GRAPH = YES
+
+# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
+# groups, showing the direct groups dependencies.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GROUP_GRAPHS = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LOOK = NO
+
+# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
+# class node. If there are many fields or methods and many nodes the graph may
+# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
+# number of items for each type to make the size more manageable. Set this to 0
+# for no limit. Note that the threshold may be exceeded by 50% before the limit
+# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
+# but if the number exceeds 15, the total amount of fields shown is limited to
+# 10.
+# Minimum value: 0, maximum value: 100, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LIMIT_NUM_FIELDS = 10
+
+# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
+# collaboration graphs will show the relations between templates and their
+# instances.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+TEMPLATE_RELATIONS = NO
+
+# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
+# YES then doxygen will generate a graph for each documented file showing the
+# direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDE_GRAPH = YES
+
+# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
+# set to YES then doxygen will generate a graph for each documented file showing
+# the direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDED_BY_GRAPH = YES
+
+# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable call graphs for selected
+# functions only using the \callgraph command.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALL_GRAPH = NO
+
+# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable caller graphs for selected
+# functions only using the \callergraph command.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALLER_GRAPH = NO
+
+# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will show a
+# graphical hierarchy of all classes instead of a textual one.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GRAPHICAL_HIERARCHY = YES
+
+# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
+# dependencies a directory has on other directories in a graphical way. The
+# dependency relations are determined by the #include relations between the
+# files in the directories.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DIRECTORY_GRAPH = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot.
+# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
+# to make the SVG files visible in IE 9+ (other browsers do not have this
+# requirement).
+# Possible values are: png, jpg, gif and svg.
+# The default value is: png.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_IMAGE_FORMAT = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+#
+# Note that this requires a modern browser other than Internet Explorer. Tested
+# and working are Firefox, Chrome, Safari, and Opera.
+# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
+# the SVG files visible. Older versions of IE do not have SVG support.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INTERACTIVE_SVG = NO
+
+# The DOT_PATH tag can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_PATH =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the \dotfile
+# command).
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOTFILE_DIRS =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the \mscfile
+# command).
+
+MSCFILE_DIRS =
+
+# The DIAFILE_DIRS tag can be used to specify one or more directories that
+# contain dia files that are included in the documentation (see the \diafile
+# command).
+
+DIAFILE_DIRS =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
+# that will be shown in the graph. If the number of nodes in a graph becomes
+# larger than this value, doxygen will truncate the graph, which is visualized
+# by representing a node as a red box. Note that if the number of direct
+# children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES, doxygen will not show the graph at all. Also note that
+# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+# Minimum value: 0, maximum value: 10000, default value: 50.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_GRAPH_MAX_NODES = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
+# generated by dot. A depth value of 3 means that only nodes reachable from the
+# root by following a path via at most 3 edges will be shown. Nodes that lie
+# further from the root node will be omitted. Note that setting this option to 1
+# or 2 may greatly reduce the computation time needed for large code bases. Also
+# note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+# Minimum value: 0, maximum value: 1000, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+MAX_DOT_GRAPH_DEPTH = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not seem
+# to support this out of the box.
+#
+# Warning: Depending on the platform used, enabling this option may lead to
+# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
+# read).
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_TRANSPARENT = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10) support
+# this, this feature is disabled by default.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_MULTI_TARGETS = NO
+
+# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
+# explaining the meaning of the various boxes and arrows in the dot generated
+# graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GENERATE_LEGEND = YES
+
+# If the DOT_CLEANUP tag is set to YES doxygen will remove the intermediate dot
+# files that are used to generate the various graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_CLEANUP = YES
--- /dev/null
+<doxygenlayout version="1.0">
+ <!-- Generated by doxygen 1.8.6 -->
+ <!-- Navigation index tabs for HTML output -->
+ <navindex>
+ <tab type="mainpage" visible="yes" title=""/>
+ <tab type="pages" visible="yes" title="" intro=""/>
+ <tab type="modules" visible="yes" title="" intro=""/>
+ <!--
+ <tab type="namespaces" visible="yes" title="">
+ <tab type="namespacelist" visible="yes" title="" intro=""/>
+ <tab type="namespacemembers" visible="yes" title="" intro=""/>
+ </tab>
+ <tab type="classes" visible="yes" title="">
+ <tab type="classlist" visible="yes" title="" intro=""/>
+ <tab type="classindex" visible="$ALPHABETICAL_INDEX" title=""/>
+ <tab type="hierarchy" visible="yes" title="" intro=""/>
+ <tab type="classmembers" visible="yes" title="" intro=""/>
+ </tab>
+ -->
+ <tab type="files" visible="yes" title="">
+ <tab type="filelist" visible="yes" title="" intro=""/>
+ <tab type="globals" visible="yes" title="" intro=""/>
+ </tab>
+ <tab type="examples" visible="yes" title="" intro=""/>
+ </navindex>
+
+ <!-- Layout definition for a class page -->
+ <class>
+ <briefdescription visible="yes"/>
+ <includes visible="$SHOW_INCLUDE_FILES"/>
+ <inheritancegraph visible="$CLASS_GRAPH"/>
+ <collaborationgraph visible="$COLLABORATION_GRAPH"/>
+ <memberdecl>
+ <nestedclasses visible="yes" title=""/>
+ <publictypes title=""/>
+ <services title=""/>
+ <interfaces title=""/>
+ <publicslots title=""/>
+ <signals title=""/>
+ <publicmethods title=""/>
+ <publicstaticmethods title=""/>
+ <publicattributes title=""/>
+ <publicstaticattributes title=""/>
+ <protectedtypes title=""/>
+ <protectedslots title=""/>
+ <protectedmethods title=""/>
+ <protectedstaticmethods title=""/>
+ <protectedattributes title=""/>
+ <protectedstaticattributes title=""/>
+ <packagetypes title=""/>
+ <packagemethods title=""/>
+ <packagestaticmethods title=""/>
+ <packageattributes title=""/>
+ <packagestaticattributes title=""/>
+ <properties title=""/>
+ <events title=""/>
+ <privatetypes title=""/>
+ <privateslots title=""/>
+ <privatemethods title=""/>
+ <privatestaticmethods title=""/>
+ <privateattributes title=""/>
+ <privatestaticattributes title=""/>
+ <friends title=""/>
+ <related title="" subtitle=""/>
+ <membergroups visible="yes"/>
+ </memberdecl>
+ <detaileddescription title=""/>
+ <memberdef>
+ <inlineclasses title=""/>
+ <typedefs title=""/>
+ <enums title=""/>
+ <services title=""/>
+ <interfaces title=""/>
+ <constructors title=""/>
+ <functions title=""/>
+ <related title=""/>
+ <variables title=""/>
+ <properties title=""/>
+ <events title=""/>
+ </memberdef>
+ <allmemberslink visible="yes"/>
+ <usedfiles visible="$SHOW_USED_FILES"/>
+ <authorsection visible="yes"/>
+ </class>
+
+ <!-- Layout definition for a namespace page -->
+ <namespace>
+ <briefdescription visible="yes"/>
+ <memberdecl>
+ <nestednamespaces visible="yes" title=""/>
+ <constantgroups visible="yes" title=""/>
+ <classes visible="yes" title=""/>
+ <typedefs title=""/>
+ <enums title=""/>
+ <functions title=""/>
+ <variables title=""/>
+ <membergroups visible="yes"/>
+ </memberdecl>
+ <detaileddescription title=""/>
+ <memberdef>
+ <inlineclasses title=""/>
+ <typedefs title=""/>
+ <enums title=""/>
+ <functions title=""/>
+ <variables title=""/>
+ </memberdef>
+ <authorsection visible="yes"/>
+ </namespace>
+
+ <!-- Layout definition for a file page -->
+ <file>
+ <briefdescription visible="yes"/>
+ <includes visible="$SHOW_INCLUDE_FILES"/>
+ <includegraph visible="$INCLUDE_GRAPH"/>
+ <includedbygraph visible="$INCLUDED_BY_GRAPH"/>
+ <sourcelink visible="yes"/>
+ <memberdecl>
+ <classes visible="yes" title=""/>
+ <namespaces visible="yes" title=""/>
+ <constantgroups visible="yes" title=""/>
+ <defines title=""/>
+ <typedefs title=""/>
+ <enums title=""/>
+ <functions title=""/>
+ <variables title=""/>
+ <membergroups visible="yes"/>
+ </memberdecl>
+ <detaileddescription title=""/>
+ <memberdef>
+ <inlineclasses title=""/>
+ <defines title=""/>
+ <typedefs title=""/>
+ <enums title=""/>
+ <functions title=""/>
+ <variables title=""/>
+ </memberdef>
+ <authorsection/>
+ </file>
+
+ <!-- Layout definition for a group page -->
+ <group>
+ <briefdescription visible="yes"/>
+ <groupgraph visible="$GROUP_GRAPHS"/>
+ <memberdecl>
+ <nestedgroups visible="yes" title=""/>
+ <dirs visible="yes" title=""/>
+ <files visible="yes" title=""/>
+ <namespaces visible="yes" title=""/>
+ <classes visible="yes" title=""/>
+ <defines title=""/>
+ <typedefs title=""/>
+ <enums title=""/>
+ <enumvalues title=""/>
+ <functions title=""/>
+ <variables title=""/>
+ <signals title=""/>
+ <publicslots title=""/>
+ <protectedslots title=""/>
+ <privateslots title=""/>
+ <events title=""/>
+ <properties title=""/>
+ <friends title=""/>
+ <membergroups visible="yes"/>
+ </memberdecl>
+ <detaileddescription title=""/>
+ <memberdef>
+ <pagedocs/>
+ <inlineclasses title=""/>
+ <defines title=""/>
+ <typedefs title=""/>
+ <enums title=""/>
+ <enumvalues title=""/>
+ <functions title=""/>
+ <variables title=""/>
+ <signals title=""/>
+ <publicslots title=""/>
+ <protectedslots title=""/>
+ <privateslots title=""/>
+ <events title=""/>
+ <properties title=""/>
+ <friends title=""/>
+ </memberdef>
+ <authorsection visible="yes"/>
+ </group>
+
+ <!-- Layout definition for a directory page -->
+ <directory>
+ <briefdescription visible="yes"/>
+ <directorygraph visible="yes"/>
+ <memberdecl>
+ <dirs visible="yes"/>
+ <files visible="yes"/>
+ </memberdecl>
+ <detaileddescription title=""/>
+ </directory>
+</doxygenlayout>
--- /dev/null
+#
+# Copyright 2015 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# VOLK is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# VOLK is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING. If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+#
+
+Welcome to VOLK!
+
+VOLK is a sub-project of GNU Radio. Please see http://libvolk.org for bug
+tracking, documentation, source code, and contact information about VOLK.
+See http://gnuradio.org/ for information about GNU Radio.
+
+Bleeding edge code can be found in our git repository at
+http://gnuradio.org/git/volk.git/.
+
+How to use VOLK:
+
+For detailed instructions see http://libvolk.org/doxygen/using_volk.html
+
+Follow these steps for a quick build.
+
+Build with cmake:
+
+ $ mkdir build
+ $ cd build
+ $ cmake ..
+ $ make
+ $ make test
+ $ sudo make install
+
+That's it!
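+
+As a quick sanity check after installing (assuming the install prefix is on
+your PATH), you can query the build and generate a tuned kernel configuration:
+
+ $ volk-config-info --version
+ $ volk-config-info --avail-machines
+ $ volk_profile
+
+volk_profile benchmarks the available proto-kernels and records the preferred
+architecture for each kernel in a volk_config file that is read at runtime.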
--- /dev/null
+#
+# Copyright 2011-2013 Free Software Foundation, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+########################################################################
+# Setup profiler
+########################################################################
+if(MSVC)
+ include_directories(${PROJECT_SOURCE_DIR}/cmake/msvc)
+endif(MSVC)
+
+include_directories(
+ ${CMAKE_CURRENT_SOURCE_DIR}
+ ${CMAKE_CURRENT_BINARY_DIR}
+ ${PROJECT_SOURCE_DIR}/include
+ ${PROJECT_BINARY_DIR}/include
+ ${PROJECT_SOURCE_DIR}/lib
+ ${PROJECT_BINARY_DIR}/lib
+ ${Boost_INCLUDE_DIRS}
+)
+
+# MAKE volk_profile
+add_executable(volk_profile
+ ${CMAKE_CURRENT_SOURCE_DIR}/volk_profile.cc
+ ${PROJECT_SOURCE_DIR}/lib/qa_utils.cc
+)
+
+
+if(ENABLE_STATIC_LIBS)
+ target_link_libraries(volk_profile volk_static ${Boost_LIBRARIES})
+ set_target_properties(volk_profile PROPERTIES LINK_FLAGS "-static")
+else()
+ target_link_libraries(volk_profile volk ${Boost_LIBRARIES})
+endif()
+
+install(
+ TARGETS volk_profile
+ DESTINATION bin
+ COMPONENT "volk"
+)
+
+# MAKE volk-config-info
+add_executable(volk-config-info volk-config-info.cc)
+
+if(ENABLE_STATIC_LIBS)
+ target_link_libraries(volk-config-info volk_static ${Boost_LIBRARIES})
+ set_target_properties(volk-config-info PROPERTIES LINK_FLAGS "-static")
+else()
+ target_link_libraries(volk-config-info volk ${Boost_LIBRARIES})
+endif()
+
+install(
+ TARGETS volk-config-info
+ DESTINATION bin
+ COMPONENT "volk"
+)
+
+# Launch volk_profile if requested to do so
+if(ENABLE_PROFILING)
+ if(DEFINED VOLK_CONFIGPATH)
+ set( VOLK_CONFIG_ARG "-p${VOLK_CONFIGPATH}" )
+ set( VOLK_CONFIG "${VOLK_CONFIGPATH}/volk_config" )
+ endif()
+
+ add_custom_command(OUTPUT ${VOLK_CONFIG}
+ COMMAND volk_profile "${VOLK_CONFIG_ARG}"
+ DEPENDS volk_profile
+ COMMENT "Launching profiler, this may take a few minutes..."
+ )
+ add_custom_target(volk-profile-run ALL DEPENDS ${VOLK_CONFIG})
+
+endif()
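+
+# Example configure-time invocation (paths are illustrative): profiling is
+# opt-in and can be pointed at a custom config location, e.g.
+#   cmake -DENABLE_PROFILING=ON -DVOLK_CONFIGPATH=$HOME/.volk ..
+# The volk-profile-run target above then writes ${VOLK_CONFIGPATH}/volk_config.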
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2013, 2016 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#if HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <volk/constants.h>
+#include "volk/volk.h"
+#include <boost/program_options.hpp>
+#include <iostream>
+
+namespace po = boost::program_options;
+
+int
+main(int argc, char **argv)
+{
+ po::options_description desc("Program options: volk-config-info [options]");
+ po::variables_map vm;
+
+ desc.add_options()
+ ("help,h", "print help message")
+ ("prefix", "print VOLK installation prefix")
+ ("cc", "print VOLK C compiler version")
+ ("cflags", "print VOLK CFLAGS")
+ ("all-machines", "print VOLK machines built into library")
+ ("avail-machines", "print VOLK machines the current platform can use")
+ ("machine", "print the VOLK machine that will be used")
+ ("alignment", "print the alignment that will be used")
+ ("malloc", "print malloc implementation that will be used")
+ ("version,v", "print VOLK version")
+ ;
+
+ try {
+ po::store(po::parse_command_line(argc, argv, desc), vm);
+ po::notify(vm);
+ }
+ catch (po::error& error){
+ std::cerr << "Error: " << error.what() << std::endl << std::endl;
+ std::cerr << desc << std::endl;
+ return 1;
+ }
+
+ if(vm.size() == 0 || vm.count("help")) {
+ std::cout << desc << std::endl;
+ return 1;
+ }
+
+ if(vm.count("prefix"))
+ std::cout << volk_prefix() << std::endl;
+
+ if(vm.count("version"))
+ std::cout << volk_version() << std::endl;
+
+ if(vm.count("cc"))
+ std::cout << volk_c_compiler() << std::endl;
+
+ if(vm.count("cflags"))
+ std::cout << volk_compiler_flags() << std::endl;
+
+ // stick an extra ';' to make output of this and avail-machines the
+ // same structure for easier parsing
+ if(vm.count("all-machines"))
+ std::cout << volk_available_machines() << ";" << std::endl;
+
+ if(vm.count("avail-machines")) {
+ volk_list_machines();
+ }
+
+ if(vm.count("machine")) {
+ std::cout << volk_get_machine() << std::endl;
+ }
+
+ if(vm.count("alignment")) {
+ std::cout << "Alignment in bytes: " << volk_get_alignment() << std::endl;
+ }
+
+ // We don't want to duplicate the volk_malloc logic here, so mirror its
+ // if/else structure and report which implementation ends up being used
+ if(vm.count("malloc")) {
+ std::cout << "Used malloc implementation: ";
+#if _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN
+ std::cout << "posix_memalign" << std::endl;
+#elif _MSC_VER >= 1400
+ std::cout << "aligned_malloc" << std::endl;
+#else
+ std::cout << "No standard handler available, using own implementation." << std::endl;
+#endif
+ }
+
+ return 0;
+}
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012-2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#include "qa_utils.h"
+#include "kernel_tests.h"
+#include "volk_profile.h"
+
+#include <volk/volk.h>
+#include <volk/volk_prefs.h>
+
+#include <ciso646>
+#include <vector>
+#include <boost/filesystem.hpp>
+#include <boost/program_options.hpp>
+#include <boost/xpressive/xpressive.hpp>
+#include <iostream>
+#include <fstream>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+namespace fs = boost::filesystem;
+
+int main(int argc, char *argv[]) {
+ // Adding program options
+ boost::program_options::options_description desc("Options");
+ desc.add_options()
+ ("help,h", "Print help messages")
+ ("benchmark,b",
+ boost::program_options::value<bool>()->default_value( false )
+ ->implicit_value( true ),
+ "Run all kernels (benchmark mode)")
+ ("tol,t",
+ boost::program_options::value<float>()->default_value( 1e-6 ),
+ "Set the default error tolerance for tests")
+ ("vlen,v",
+ boost::program_options::value<int>()->default_value( 131071 ),
+ "Set the default vector length for tests") // default is a mersenne prime
+ ("iter,i",
+ boost::program_options::value<int>()->default_value( 1987 ),
+ "Set the default number of test iterations per kernel")
+ ("tests-regex,R",
+ boost::program_options::value<std::string>(),
+ "Run tests matching regular expression.")
+ ("update,u",
+ boost::program_options::value<bool>()->default_value( false )
+ ->implicit_value( true ),
+ "Run only kernels missing from config; use -R to further restrict the candidates")
+ ("dry-run,n",
+ boost::program_options::value<bool>()->default_value( false )
+ ->implicit_value( true ),
+ "Dry run. Respect other options, but don't write to file")
+ ("json,j",
+ boost::program_options::value<std::string>(),
+ "JSON output file")
+ ("path,p",
+ boost::program_options::value<std::string>(),
+ "Specify volk_config path.")
+ ;
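+
+ // Illustrative invocations of the options declared above (the kernel pattern
+ // is just an example):
+ //   volk_profile             profile every kernel and write the volk_config
+ //   volk_profile -R 32fc     profile only kernels whose name matches the regex
+ //   volk_profile -u -n       profile only kernels missing from the existing
+ //                            config, without writing the results to file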
+
+ // Handle the options that were given
+ boost::program_options::variables_map vm;
+ bool benchmark_mode;
+ std::string kernel_regex;
+ std::ofstream json_file;
+ float def_tol;
+ lv_32fc_t def_scalar;
+ int def_iter;
+ int def_vlen;
+ bool def_benchmark_mode;
+ std::string def_kernel_regex;
+ bool update_mode = false;
+ bool dry_run = false;
+ std::string config_file;
+
+ // Handle the provided options
+ try {
+ boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), vm);
+ boost::program_options::notify(vm);
+ benchmark_mode = vm.count("benchmark")?vm["benchmark"].as<bool>():false;
+ if ( vm.count("tests-regex" ) ) {
+ kernel_regex = vm["tests-regex"].as<std::string>();
+ }
+ else {
+ kernel_regex = ".*";
+ }
+
+ def_tol = vm["tol"].as<float>();
+ def_scalar = 327.0;
+ def_vlen = vm["vlen"].as<int>();
+ def_iter = vm["iter"].as<int>();
+ def_benchmark_mode = benchmark_mode;
+ def_kernel_regex = kernel_regex;
+ update_mode = vm["update"].as<bool>();
+ dry_run = vm["dry-run"].as<bool>();
+ }
+ catch (boost::program_options::error& error) {
+ std::cerr << "Error: " << error.what() << std::endl << std::endl;
+ std::cerr << desc << std::endl;
+ return 1;
+ }
+
+ /** --help option */
+ if ( vm.count("help") ) {
+ std::cout << "The VOLK profiler." << std::endl
+ << desc << std::endl;
+ return 0;
+ }
+
+ if ( vm.count("json") ) {
+ std::string filename;
+ try {
+ filename = vm["json"].as<std::string>();
+ }
+ catch (boost::bad_any_cast& error) {
+ std::cerr << error.what() << std::endl;
+ return 1;
+ }
+ json_file.open( filename.c_str() );
+ }
+
+ if ( vm.count("path") ) {
+ try {
+ config_file = vm["path"].as<std::string>() + "/volk_config";
+ }
+ catch (boost::bad_any_cast& error) {
+ std::cerr << error.what() << std::endl;
+ return 1;
+ }
+ }
+
+ volk_test_params_t test_params(def_tol, def_scalar, def_vlen, def_iter,
+ def_benchmark_mode, def_kernel_regex);
+
+ // Run tests
+ std::vector<volk_test_results_t> results;
+ if(update_mode) {
+ if( vm.count("path") ) read_results(&results, config_file);
+ else read_results(&results);
+ }
+
+
+ // Initialize the list of tests
+ // the default test parameters come from options
+ std::vector<volk_test_case_t> test_cases = init_test_list(test_params);
+ boost::xpressive::sregex kernel_expression;
+ try {
+ kernel_expression = boost::xpressive::sregex::compile(kernel_regex);
+ }
+ catch (boost::xpressive::regex_error& error) {
+ std::cerr << "Error occurred while compiling regex" << std::endl << std::endl;
+ return 1;
+ }
+
+ // Iterate through the list of tests, running each one
+ for(unsigned int ii = 0; ii < test_cases.size(); ++ii) {
+ bool regex_match = true;
+
+ volk_test_case_t test_case = test_cases[ii];
+ // if the kernel name matches regex then do the test
+ if(boost::xpressive::regex_search(test_case.name(), kernel_expression)) {
+ regex_match = true;
+ }
+ else {
+ regex_match = false;
+ }
+
+ // if we are in update mode check if we've already got results
+ // if we have any, then no need to test that kernel
+ bool update = true;
+ if(update_mode) {
+ for(unsigned int jj=0; jj < results.size(); ++jj) {
+ if(results[jj].name == test_case.name() ||
+ results[jj].name == test_case.puppet_master_name()) {
+ update = false;
+ break;
+ }
+ }
+ }
+
+ if( regex_match && update ) {
+ try {
+ run_volk_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(),
+ test_case.test_parameters(), &results, test_case.puppet_master_name());
+ }
+ catch (std::string error) {
+ std::cerr << "Caught Exception in 'run_volk_tests': " << error << std::endl;
+ }
+
+ }
+ }
+
+
+ // Output results according to provided options
+ if(vm.count("json")) {
+ write_json(json_file, results);
+ json_file.close();
+ }
+
+ if(!dry_run) {
+ if(vm.count("path")) write_results(&results, false, config_file);
+ else write_results(&results, false);
+ }
+ else {
+ std::cout << "Warning: this was a dry-run. Config not generated" << std::endl;
+ }
+}
+
+void read_results(std::vector<volk_test_results_t> *results)
+{
+ char path[1024];
+ volk_get_config_path(path);
+
+ read_results(results, std::string(path));
+}
+
+void read_results(std::vector<volk_test_results_t> *results, std::string path)
+{
+ const fs::path config_path(path);
+
+ if(fs::exists(config_path)) {
+ // a config exists and we are reading results from it
+ std::ifstream config(config_path.string().c_str());
+ char config_line[256];
+ while(config.getline(config_line, 255)) {
+ // tokenize the input line by kernel_name unaligned aligned
+ // then push back in the results vector with fields filled in
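+ // e.g. a config line might look like (kernel name illustrative):
+ //   volk_32fc_x2_dot_prod_32fc generic generic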
+
+ std::vector<std::string> single_kernel_result;
+ std::string config_str(config_line);
+ std::size_t str_size = config_str.size();
+ std::size_t found = 1;
+
+ found = config_str.find(" ");
+ // Split line by spaces
+ while(found && found < str_size) {
+ found = config_str.find(" ");
+ // kernel names MUST be less than 128 chars, which is
+ // a length restricted by volk/volk_prefs.c
+ // on the last token in the parsed string we won't find a space
+ // so make sure we copy at most 128 chars.
+ if(found > 127) {
+ found = 127;
+ }
+ str_size = config_str.size();
+ char buffer[128] = {'\0'};
+ config_str.copy(buffer, found + 1, 0);
+ buffer[found] = '\0';
+ single_kernel_result.push_back(std::string(buffer));
+ config_str.erase(0, found+1);
+ }
+
+ if(single_kernel_result.size() == 3) {
+ volk_test_results_t kernel_result;
+ kernel_result.name = std::string(single_kernel_result[0]);
+ kernel_result.config_name = std::string(single_kernel_result[0]);
+ kernel_result.best_arch_u = std::string(single_kernel_result[1]);
+ kernel_result.best_arch_a = std::string(single_kernel_result[2]);
+ results->push_back(kernel_result);
+ }
+ }
+ }
+
+}
+
+void write_results(const std::vector<volk_test_results_t> *results, bool update_result)
+{
+ char path[1024];
+ volk_get_config_path(path);
+
+ write_results( results, update_result, std::string(path));
+}
+
+void write_results(const std::vector<volk_test_results_t> *results, bool update_result, const std::string path)
+{
+ const fs::path config_path(path);
+
+ // Until we can update the config on a kernel by kernel basis
+ // do not overwrite volk_config when using a regex.
+ if (not fs::exists(config_path.branch_path()))
+ {
+ std::cout << "Creating " << config_path.branch_path() << "..." << std::endl;
+ fs::create_directories(config_path.branch_path());
+ }
+
+ std::ofstream config;
+ if(update_result) {
+ std::cout << "Updating " << config_path << "..." << std::endl;
+ config.open(config_path.string().c_str(), std::ofstream::app);
+ if (!config.is_open()) { //either we don't have write access or we don't have the dir yet
+ std::cout << "Error opening file " << config_path << std::endl;
+ }
+ }
+ else {
+ std::cout << "Writing " << config_path << "..." << std::endl;
+ config.open(config_path.string().c_str());
+ if (!config.is_open()) { //either we don't have write access or we don't have the dir yet
+ std::cout << "Error opening file " << config_path << std::endl;
+ }
+
+ config << "\
+#this file is generated by volk_profile.\n\
+#the function name is followed by the preferred architecture.\n\
+";
+ }
+
+ std::vector<volk_test_results_t>::const_iterator profile_results;
+ for(profile_results = results->begin(); profile_results != results->end(); ++profile_results) {
+ config << profile_results->config_name << " "
+ << profile_results->best_arch_a << " "
+ << profile_results->best_arch_u << std::endl;
+ }
+ config.close();
+}
+
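+// write_json dumps the full result set, including per-architecture timings,
+// as JSON. A hand-written sketch of the layout it produces (kernel names,
+// numbers and units below are illustrative, not real measurements):
+//
+// {
+//  "volk_tests": [
+//   {
+//    "name": "volk_32f_x2_add_32f",
+//    "vlen": 131071,
+//    "iter": 1987,
+//    "best_arch_a": "avx",
+//    "best_arch_u": "generic",
+//    "results": {
+//     "generic": { "name": "generic", "time": 1234.5, "units": "ms" },
+//     "avx": { "name": "avx", "time": 321.0, "units": "ms" }
+//    }
+//   }
+//  ]
+// }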
+void write_json(std::ofstream &json_file, std::vector<volk_test_results_t> results)
+{
+ json_file << "{" << std::endl;
+ json_file << " \"volk_tests\": [" << std::endl;
+ size_t len = results.size();
+ size_t i = 0;
+ std::vector<volk_test_results_t>::iterator result;
+ for(result = results.begin(); result != results.end(); ++result) {
+ json_file << " {" << std::endl;
+ json_file << " \"name\": \"" << result->name << "\"," << std::endl;
+ json_file << " \"vlen\": " << (int)(result->vlen) << "," << std::endl;
+ json_file << " \"iter\": " << result->iter << "," << std::endl;
+ json_file << " \"best_arch_a\": \"" << result->best_arch_a
+ << "\"," << std::endl;
+ json_file << " \"best_arch_u\": \"" << result->best_arch_u
+ << "\"," << std::endl;
+ json_file << " \"results\": {" << std::endl;
+ size_t results_len = result->results.size();
+ size_t ri = 0;
+
+ std::map<std::string, volk_test_time_t>::iterator kernel_time_pair;
+ for(kernel_time_pair = result->results.begin(); kernel_time_pair != result->results.end(); ++kernel_time_pair) {
+ volk_test_time_t time = kernel_time_pair->second;
+ json_file << " \"" << time.name << "\": {" << std::endl;
+ json_file << " \"name\": \"" << time.name << "\"," << std::endl;
+ json_file << " \"time\": " << time.time << "," << std::endl;
+ json_file << " \"units\": \"" << time.units << "\"" << std::endl;
+ json_file << " }" ;
+ if(ri+1 != results_len) {
+ json_file << ",";
+ }
+ json_file << std::endl;
+ ri++;
+ }
+ json_file << " }" << std::endl;
+ json_file << " }";
+ if(i+1 != len) {
+ json_file << ",";
+ }
+ json_file << std::endl;
+ i++;
+ }
+ json_file << " ]" << std::endl;
+ json_file << "}" << std::endl;
+}
+
+
--- /dev/null
+
+
+void read_results(std::vector<volk_test_results_t> *results);
+void read_results(std::vector<volk_test_results_t> *results, std::string path);
+void write_results(const std::vector<volk_test_results_t> *results, bool update_result);
+void write_results(const std::vector<volk_test_results_t> *results, bool update_result, const std::string path);
+void write_json(std::ofstream &json_file, std::vector<volk_test_results_t> results);
--- /dev/null
+clone_depth: 1
+
+os: Visual Studio 2013
+
+install:
+ - echo "Installing Boost libraries..."
+ - nuget install boost_system-vc120
+ - nuget install boost_filesystem-vc120
+ - nuget install boost_chrono-vc120
+ - nuget install boost_program_options-vc120
+ - nuget install boost_unit_test_framework-vc120
+
+ - echo "Installing Cheetah templates..."
+ - appveyor DownloadFile https://pypi.python.org/packages/source/C/Cheetah/Cheetah-2.4.4.tar.gz
+ - 7z x Cheetah-2.4.4.tar.gz
+ - 7z x -y Cheetah-2.4.4.tar
+ - cd Cheetah-2.4.4
+ - c:\Python27\python.exe setup.py build
+ - c:\Python27\python.exe setup.py install
+
+build_script:
+ - cd c:\projects\volk
+
+ # Without this directory in the %PATH%, compiler tests fail because of missing DLLs
+ - set PATH=%PATH%;C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin
+
+ - cmake -G "Visual Studio 12 Win64" \
+ -DBoost_CHRONO_LIBRARY_RELEASE:FILEPATH=c:/projects/volk/boost_chrono-vc120.1.59.0.0/lib/native/address-model-64/lib/boost_chrono-vc120-mt-1_59.lib \
+ -DBoost_FILESYSTEM_LIBRARY_RELEASE:FILEPATH=c:/projects/volk/boost_filesystem-vc120.1.59.0.0/lib/native/address-model-64/lib/boost_filesystem-vc120-mt-1_59.lib \
+ -DBoost_PROGRAM_OPTIONS_LIBRARY_RELEASE:FILEPATH=c:/projects/volk/boost_program_options-vc120.1.59.0.0/lib/native/address-model-64/lib/boost_program_options-vc120-mt-1_59.lib \
+ -DBoost_SYSTEM_LIBRARY_RELEASE:FILEPATH=c:/projects/volk/boost_system-vc120.1.59.0.0/lib/native/address-model-64/lib/boost_system-vc120-mt-1_59.lib \
+ -DBoost_UNIT_TEST_FRAMEWORK_LIBRARY_RELEASE:FILEPATH=c:/projects/volk/boost_unit_test_framework-vc120.1.59.0.0/lib/native/address-model-64/lib/boost_unit_test_framework-vc120-mt-1_59.lib \
+ -DBoost_INCLUDE_DIR:PATH=c:/projects/volk/boost.1.59.0.0/lib/native/include \
+ -DCMAKE_BUILD_TYPE:STRING=Release -DENABLE_ORC:BOOL=OFF -DENABLE_TESTING:BOOL=OFF \
+ .
+
+ - cmake --build . --config Release --target INSTALL
+
+ # Create an archive
+ - cd "c:\Program Files"
+ - 7z a "c:\libvolk-x64.zip" volk
+
+ # Create the deps archive
+ - mkdir dlls
+ - copy c:\projects\volk\boost_chrono-vc120.1.59.0.0\lib\native\address-model-64\lib\boost_chrono-vc120-mt-1_59.dll dlls\boost_chrono-vc120-mt-1_59.dll
+ - copy c:\projects\volk\boost_filesystem-vc120.1.59.0.0\lib\native\address-model-64\lib\boost_filesystem-vc120-mt-1_59.dll dlls\boost_filesystem-vc120-mt-1_59.dll
+ - copy c:\projects\volk\boost_program_options-vc120.1.59.0.0\lib\native\address-model-64\lib\boost_program_options-vc120-mt-1_59.dll dlls\boost_program_options-vc120-mt-1_59.dll
+ - copy c:\projects\volk\boost_system-vc120.1.59.0.0\lib\native\address-model-64\lib\boost_system-vc120-mt-1_59.dll dlls\boost_system-vc120-mt-1_59.dll
+ - copy c:\projects\volk\boost_unit_test_framework-vc120.1.59.0.0\lib\native\address-model-64\lib\boost_unit_test_framework-vc120-mt-1_59.dll dlls\boost_unit_test_framework-vc120-mt-1_59.dll
+ - cd dlls
+ - 7z a "c:\libvolk-x64-deps.zip" *
+
+ # Push it!
+ - appveyor PushArtifact c:\libvolk-x64.zip
+ - appveyor PushArtifact c:\libvolk-x64-deps.zip
--- /dev/null
+# CMAKE_PARSE_ARGUMENTS(<prefix> <options> <one_value_keywords> <multi_value_keywords> args...)
+#
+# CMAKE_PARSE_ARGUMENTS() is intended to be used in macros or functions for
+# parsing the arguments given to that macro or function.
+# It processes the arguments and defines a set of variables which hold the
+# values of the respective options.
+#
+# The <options> argument contains all options for the respective macro,
+# i.e. keywords which can be used when calling the macro without any value
+# following, like e.g. the OPTIONAL keyword of the install() command.
+#
+# The <one_value_keywords> argument contains all keywords for this macro
+# which are followed by one value, like e.g. DESTINATION keyword of the
+# install() command.
+#
+# The <multi_value_keywords> argument contains all keywords for this macro
+# which can be followed by more than one value, like e.g. the TARGETS or
+# FILES keywords of the install() command.
+#
+# When done, CMAKE_PARSE_ARGUMENTS() will have defined for each of the
+# keywords listed in <options>, <one_value_keywords> and
+# <multi_value_keywords> a variable composed of the given <prefix>
+# followed by "_" and the name of the respective keyword.
+# These variables will then hold the respective value from the argument list.
+# For the <options> keywords this will be TRUE or FALSE.
+#
+# All remaining arguments are collected in a variable
+# <prefix>_UNPARSED_ARGUMENTS, this can be checked afterwards to see whether
+# your macro was called with unrecognized parameters.
+#
+# As an example, here is a my_install() macro which takes similar arguments to the
+# real install() command:
+#
+# function(MY_INSTALL)
+# set(options OPTIONAL FAST)
+# set(oneValueArgs DESTINATION RENAME)
+# set(multiValueArgs TARGETS CONFIGURATIONS)
+# cmake_parse_arguments(MY_INSTALL "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} )
+# ...
+#
+# Assume my_install() has been called like this:
+# my_install(TARGETS foo bar DESTINATION bin OPTIONAL blub)
+#
+# After the cmake_parse_arguments() call the macro will have set the following
+# variables:
+# MY_INSTALL_OPTIONAL = TRUE
+# MY_INSTALL_FAST = FALSE (this option was not used when calling my_install())
+# MY_INSTALL_DESTINATION = "bin"
+# MY_INSTALL_RENAME = "" (was not used)
+# MY_INSTALL_TARGETS = "foo;bar"
+# MY_INSTALL_CONFIGURATIONS = "" (was not used)
+# MY_INSTALL_UNPARSED_ARGUMENTS = "blub" (no value expected after "OPTIONAL")
+#
+# You can then continue and process these variables.
+#
+# Keywords terminate lists of values, e.g. if directly after a one_value_keyword
+# another recognized keyword follows, this is interpreted as the beginning of
+# the new option.
+# E.g. my_install(TARGETS foo DESTINATION OPTIONAL) might appear to set
+# MY_INSTALL_DESTINATION to "OPTIONAL", but because OPTIONAL is itself a
+# keyword, MY_INSTALL_DESTINATION is left empty and MY_INSTALL_OPTIONAL is
+# set to TRUE instead.
+
+#=============================================================================
+# Copyright 2010 Alexander Neundorf <neundorf@kde.org>
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+# License text for the above reference.)
+
+
+if(__CMAKE_PARSE_ARGUMENTS_INCLUDED)
+ return()
+endif()
+set(__CMAKE_PARSE_ARGUMENTS_INCLUDED TRUE)
+
+
+function(CMAKE_PARSE_ARGUMENTS prefix _optionNames _singleArgNames _multiArgNames)
+ # first set all result variables to empty/FALSE
+ foreach(arg_name ${_singleArgNames} ${_multiArgNames})
+ set(${prefix}_${arg_name})
+ endforeach(arg_name)
+
+ foreach(option ${_optionNames})
+ set(${prefix}_${option} FALSE)
+ endforeach(option)
+
+ set(${prefix}_UNPARSED_ARGUMENTS)
+
+ set(insideValues FALSE)
+ set(currentArgName)
+
+ # now iterate over all arguments and fill the result variables
+ foreach(currentArg ${ARGN})
+ list(FIND _optionNames "${currentArg}" optionIndex) # ... then this marks the end of the arguments belonging to this keyword
+ list(FIND _singleArgNames "${currentArg}" singleArgIndex) # ... then this marks the end of the arguments belonging to this keyword
+ list(FIND _multiArgNames "${currentArg}" multiArgIndex) # ... then this marks the end of the arguments belonging to this keyword
+
+ if(${optionIndex} EQUAL -1 AND ${singleArgIndex} EQUAL -1 AND ${multiArgIndex} EQUAL -1)
+ if(insideValues)
+ if("${insideValues}" STREQUAL "SINGLE")
+ set(${prefix}_${currentArgName} ${currentArg})
+ set(insideValues FALSE)
+ elseif("${insideValues}" STREQUAL "MULTI")
+ list(APPEND ${prefix}_${currentArgName} ${currentArg})
+ endif()
+ else(insideValues)
+ list(APPEND ${prefix}_UNPARSED_ARGUMENTS ${currentArg})
+ endif(insideValues)
+ else()
+ if(NOT ${optionIndex} EQUAL -1)
+ set(${prefix}_${currentArg} TRUE)
+ set(insideValues FALSE)
+ elseif(NOT ${singleArgIndex} EQUAL -1)
+ set(currentArgName ${currentArg})
+ set(${prefix}_${currentArgName})
+ set(insideValues "SINGLE")
+ elseif(NOT ${multiArgIndex} EQUAL -1)
+ set(currentArgName ${currentArg})
+ set(${prefix}_${currentArgName})
+ set(insideValues "MULTI")
+ endif()
+ endif()
+
+ endforeach(currentArg)
+
+ # propagate the result variables to the caller:
+ foreach(arg_name ${_singleArgNames} ${_multiArgNames} ${_optionNames})
+ set(${prefix}_${arg_name} ${${prefix}_${arg_name}} PARENT_SCOPE)
+ endforeach(arg_name)
+ set(${prefix}_UNPARSED_ARGUMENTS ${${prefix}_UNPARSED_ARGUMENTS} PARENT_SCOPE)
+
+endfunction(CMAKE_PARSE_ARGUMENTS _options _singleArgs _multiArgs)
--- /dev/null
+FIND_PACKAGE(PkgConfig)
+PKG_CHECK_MODULES(PC_ORC "orc-0.4 > 0.4.11")
+
+FIND_PROGRAM(ORCC_EXECUTABLE orcc
+ HINTS ${PC_ORC_TOOLSDIR}
+ PATHS ${ORC_ROOT}/bin ${CMAKE_INSTALL_PREFIX}/bin)
+
+FIND_PATH(ORC_INCLUDE_DIR NAMES orc/orc.h
+ HINTS ${PC_ORC_INCLUDEDIR}
+ PATHS ${ORC_ROOT}/include/orc-0.4 ${CMAKE_INSTALL_PREFIX}/include/orc-0.4)
+
+
+FIND_PATH(ORC_LIBRARY_DIR NAMES ${CMAKE_SHARED_LIBRARY_PREFIX}orc-0.4${CMAKE_SHARED_LIBRARY_SUFFIX}
+ HINTS ${PC_ORC_LIBDIR}
+ PATHS ${ORC_ROOT}/lib${LIB_SUFFIX} ${CMAKE_INSTALL_PREFIX}/lib${LIB_SUFFIX})
+
+FIND_LIBRARY(ORC_LIB orc-0.4
+ HINTS ${PC_ORC_LIBRARY_DIRS}
+ PATHS ${ORC_ROOT}/lib${LIB_SUFFIX} ${CMAKE_INSTALL_PREFIX}/lib${LIB_SUFFIX})
+
+LIST(APPEND ORC_LIBRARY
+ ${ORC_LIB}
+)
+
+
+SET(ORC_INCLUDE_DIRS ${ORC_INCLUDE_DIR})
+SET(ORC_LIBRARIES ${ORC_LIBRARY})
+SET(ORC_LIBRARY_DIRS ${ORC_LIBRARY_DIR})
+
+INCLUDE(FindPackageHandleStandardArgs)
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(ORC "orc files" ORC_LIBRARY ORC_INCLUDE_DIR ORCC_EXECUTABLE)
+
+mark_as_advanced(ORC_INCLUDE_DIR ORC_LIBRARY ORCC_EXECUTABLE)
--- /dev/null
+# Copyright 2015 Free Software Foundation, Inc.
+#
+# This file is part of Volk
+#
+# Volk is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# Volk is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
+# License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Volk; see the file COPYING. If not, write to the Free
+# Software Foundation, Inc., 51 Franklin Street, Boston, MA
+# 02110-1301, USA.
+
+if(DEFINED __INCLUDED_VOLK_ADD_TEST)
+ return()
+endif()
+set(__INCLUDED_VOLK_ADD_TEST TRUE)
+
+########################################################################
+# Add a unit test and setup the environment for it.
+# Encloses ADD_TEST, with additional functionality to create a shell
+# script that sets the environment to gain access to in-build binaries
+# properly. The following variables are used to pass in settings:
+#
+# NAME - the test name
+# SOURCES - sources for the test
+# TARGET_DEPS - build target dependencies (e.g., libraries)
+# EXTRA_LIB_DIRS - other directories for the library path
+# ENVIRONS - other environment key/value pairs
+# ARGS - arguments for the test
+########################################################################
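+# A minimal illustrative call (target, source and argument names here are
+# hypothetical, not taken from this tree):
+#
+#   volk_add_test(example_test
+#     SOURCES example_test.cc
+#     TARGET_DEPS volk
+#     ENVIRONS "VOLK_GENERIC=1"
+#     ARGS "--some-flag"
+#   )
+#
+# This builds example_test from the listed sources, links it against the
+# TARGET_DEPS targets, and registers it with add_test behind a generated
+# wrapper script that sets the environment (PATH / library path) so the
+# freshly built binaries and libraries are found.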
+function(VOLK_ADD_TEST test_name)
+
+ #parse the arguments for component names
+ include(CMakeParseArgumentsCopy)
+ CMAKE_PARSE_ARGUMENTS(VOLK_TEST "" "" "SOURCES;TARGET_DEPS;EXTRA_LIB_DIRS;ENVIRONS;ARGS" ${ARGN})
+
+ #set the initial environs to use
+ set(environs ${VOLK_TEST_ENVIRONS})
+
+ #create the initial library path
+ file(TO_NATIVE_PATH "${VOLK_TEST_EXTRA_LIB_DIRS}" libpath)
+
+ #set the source directory, which is mostly FYI
+ file(TO_NATIVE_PATH ${CMAKE_CURRENT_SOURCE_DIR} srcdir)
+ list(APPEND environs "srcdir=\"${srcdir}\"")
+
+ #http://www.cmake.org/pipermail/cmake/2009-May/029464.html
+ #Replaced this add test + set environs code with the shell script generation.
+ #It's nicer to be able to manually run the shell script to diagnose problems.
+ if(UNIX)
+ if(APPLE)
+ set(LD_PATH_VAR "DYLD_LIBRARY_PATH")
+ else()
+ set(LD_PATH_VAR "LD_LIBRARY_PATH")
+ endif()
+
+ #create a list of target directories to be determined by the
+ #"add_test" command, via the $<FOO:BAR> operator; make sure the
+ #test's directory is first, since it ($1) is prepended to PATH.
+ unset(TARGET_DIR_LIST)
+ foreach(target ${test_name} ${VOLK_TEST_TARGET_DEPS})
+ list(APPEND TARGET_DIR_LIST "\$<TARGET_FILE_DIR:${target}>")
+ endforeach()
+
+ #augment the PATH to start with the directory of the test
+ set(binpath "\"$1:\$PATH\"")
+ list(APPEND environs "PATH=${binpath}")
+
+ #set the shell to use
+ if(CMAKE_CROSSCOMPILING)
+ set(SHELL "/bin/sh")
+ else()
+ find_program(SHELL sh)
+ endif()
+
+ #check to see if the shell supports "$*" expansion with IFS
+ if(NOT TESTED_SHELL_SUPPORTS_IFS)
+ set(TESTED_SHELL_SUPPORTS_IFS TRUE CACHE BOOL "")
+ set(sh_file ${CMAKE_CURRENT_BINARY_DIR}/ifs_test.sh)
+ file(WRITE ${sh_file} "#!${SHELL}\n")
+ file(APPEND ${sh_file} "export IFS=:\n")
+ file(APPEND ${sh_file} "echo \"$*\"\n")
+ #make the shell file executable
+ execute_process(COMMAND chmod +x ${sh_file})
+
+ #execute the shell script
+ execute_process(COMMAND ${sh_file} "a" "b" "c"
+ OUTPUT_VARIABLE output OUTPUT_STRIP_TRAILING_WHITESPACE
+ )
+
+ #check the output to see if it is correct
+ string(COMPARE EQUAL ${output} "a:b:c" SHELL_SUPPORTS_IFS)
+ set(SHELL_SUPPORTS_IFS ${SHELL_SUPPORTS_IFS} CACHE BOOL
+ "Set this value to TRUE if the shell supports IFS argument expansion"
+ )
+ endif()
+ unset(testlibpath)
+ if(SHELL_SUPPORTS_IFS)
+ #"$*" expands in the shell into a list of all of the arguments
+ #to the shell script, concatenated using the character provided
+ #in ${IFS}.
+ list(APPEND testlibpath "$*")
+ else()
+ #shell does not support IFS expansion; use a loop instead
+ list(APPEND testlibpath "\${LL}")
+ endif()
+
+ #finally: add in the current library path variable
+ list(INSERT libpath 0 ${testlibpath})
+ list(APPEND libpath "$${LD_PATH_VAR}")
+
+ #replace list separator with the path separator
+ string(REPLACE ";" ":" libpath "${libpath}")
+ list(APPEND environs "${LD_PATH_VAR}=\"${libpath}\"")
+
+ #generate a shell script file that sets the environment and runs the test
+ set(sh_file ${CMAKE_CURRENT_BINARY_DIR}/${test_name}_test.sh)
+ file(WRITE ${sh_file} "#!${SHELL}\n")
+ if(SHELL_SUPPORTS_IFS)
+ file(APPEND ${sh_file} "export IFS=:\n")
+ else()
+ file(APPEND ${sh_file} "LL=\"$1\" && for tf in \"\$@\"; do LL=\"\${LL}:\${tf}\"; done\n")
+ endif()
+
+ #each line sets an environment variable
+ foreach(environ ${environs})
+ file(APPEND ${sh_file} "export ${environ}\n")
+ endforeach(environ)
+
+ #redo the test args to have a space between each
+ string(REPLACE ";" " " VOLK_TEST_ARGS "${VOLK_TEST_ARGS}")
+
+ #finally: append the test name to execute
+ file(APPEND ${sh_file} ${test_name} " " ${VOLK_TEST_ARGS} "\n")
+
+ #make the shell file executable
+ execute_process(COMMAND chmod +x ${sh_file})
+
+ add_executable(${test_name} ${VOLK_TEST_SOURCES})
+ target_link_libraries(${test_name} ${VOLK_TEST_TARGET_DEPS})
+
+ #add the shell file as the test to execute;
+ #use the form that allows for $<FOO:BAR> substitutions,
+ #then combine the script arguments inside the script.
+ add_test(NAME qa_${test_name}
+ COMMAND ${SHELL} ${sh_file} ${TARGET_DIR_LIST}
+ )
+
+ endif(UNIX)
+
+ if(WIN32)
+ #In the land of windows, all libraries must be in the PATH. Since
+ #the dependent libraries are not yet installed, we must manually
+ #set them in the PATH to run tests. The following appends the
+ #path of a target dependency.
+ #
+ #NOTE: get_target_property LOCATION is being deprecated as of
+ #CMake 3.2.0, which just prints a warning & notes that this
+ #functionality will be removed in the future. Leave it here for
+ #now until someone can figure out how to do this in Windows.
+ foreach(target ${test_name} ${VOLK_TEST_TARGET_DEPS})
+ get_target_property(location "${target}" LOCATION)
+ if(location)
+ get_filename_component(path ${location} PATH)
+ string(REGEX REPLACE "\\$\\(.*\\)" ${CMAKE_BUILD_TYPE} path ${path})
+ list(APPEND libpath ${path})
+ endif(location)
+ endforeach(target)
+
+ list(APPEND libpath ${DLL_PATHS} "%PATH%")
+
+ #replace list separator with the path separator (escaped)
+ string(REPLACE ";" "\\;" libpath "${libpath}")
+ list(APPEND environs "PATH=${libpath}")
+
+ #generate a bat file that sets the environment and runs the test
+ set(bat_file ${CMAKE_CURRENT_BINARY_DIR}/${test_name}_test.bat)
+ file(WRITE ${bat_file} "@echo off\n")
+
+ #each line sets an environment variable
+ foreach(environ ${environs})
+ file(APPEND ${bat_file} "SET ${environ}\n")
+ endforeach(environ)
+
+ #redo the test args to have a space between each
+ string(REPLACE ";" " " VOLK_TEST_ARGS "${VOLK_TEST_ARGS}")
+
+ #finally: append the test name to execute
+ file(APPEND ${bat_file} ${test_name} " " ${VOLK_TEST_ARGS} "\n")
+ file(APPEND ${bat_file} "\n")
+
+ add_executable(${test_name} ${VOLK_TEST_SOURCES})
+ target_link_libraries(${test_name} ${VOLK_TEST_TARGET_DEPS})
+
+ add_test(${test_name} ${bat_file})
+ endif(WIN32)
+
+endfunction(VOLK_ADD_TEST)
--- /dev/null
+# Copyright 2010-2011 Free Software Foundation, Inc.
+#
+# This file is part of GNU Radio
+#
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING. If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+
+if(DEFINED __INCLUDED_VOLK_BOOST_CMAKE)
+ return()
+endif()
+set(__INCLUDED_VOLK_BOOST_CMAKE TRUE)
+
+########################################################################
+# Setup Boost and handle some system specific things
+########################################################################
+
+set(BOOST_REQUIRED_COMPONENTS
+ filesystem
+ system
+ unit_test_framework
+ program_options
+)
+
+if(UNIX AND NOT BOOST_ROOT AND EXISTS "/usr/lib64")
+ list(APPEND BOOST_LIBRARYDIR "/usr/lib64") #fedora 64-bit fix
+endif(UNIX AND NOT BOOST_ROOT AND EXISTS "/usr/lib64")
+
+if(MSVC)
+ set(BOOST_REQUIRED_COMPONENTS ${BOOST_REQUIRED_COMPONENTS} chrono)
+
+ if (NOT DEFINED BOOST_ALL_DYN_LINK)
+ set(BOOST_ALL_DYN_LINK TRUE)
+ endif()
+ set(BOOST_ALL_DYN_LINK "${BOOST_ALL_DYN_LINK}" CACHE BOOL "boost enable dynamic linking")
+ if(BOOST_ALL_DYN_LINK)
+ add_definitions(-DBOOST_ALL_DYN_LINK) #setup boost auto-linking in msvc
+ else(BOOST_ALL_DYN_LINK)
+ unset(BOOST_REQUIRED_COMPONENTS) #empty components list for static link
+ endif(BOOST_ALL_DYN_LINK)
+endif(MSVC)
+
+find_package(Boost "1.35" COMPONENTS ${BOOST_REQUIRED_COMPONENTS})
+
+# This list does not allow us to disable specific versions; it is used
+# internally by cmake to recognize newer Boost versions. As Boost
+# versions beyond those shown here are released, we must extend
+# this list. To disable specific Boost versions, see below.
+set(Boost_ADDITIONAL_VERSIONS
+ "1.35.0" "1.35" "1.36.0" "1.36" "1.37.0" "1.37" "1.38.0" "1.38" "1.39.0" "1.39"
+ "1.40.0" "1.40" "1.41.0" "1.41" "1.42.0" "1.42" "1.43.0" "1.43" "1.44.0" "1.44"
+ "1.45.0" "1.45" "1.46.0" "1.46" "1.47.0" "1.47" "1.48.0" "1.48" "1.49.0" "1.49"
+ "1.50.0" "1.50" "1.51.0" "1.51" "1.52.0" "1.52" "1.53.0" "1.53" "1.54.0" "1.54"
+ "1.55.0" "1.55" "1.56.0" "1.56" "1.57.0" "1.57" "1.58.0" "1.58" "1.59.0" "1.59"
+ "1.60.0" "1.60" "1.61.0" "1.61" "1.62.0" "1.62" "1.63.0" "1.63" "1.64.0" "1.64"
+ "1.65.0" "1.65" "1.66.0" "1.66" "1.67.0" "1.67" "1.68.0" "1.68" "1.69.0" "1.69"
+)
+
+# Boost 1.52 disabled, see https://svn.boost.org/trac/boost/ticket/7669
+# Similar problems with Boost 1.46 and 1.47.
+
+OPTION(ENABLE_BAD_BOOST "Enable known bad versions of Boost" OFF)
+if(ENABLE_BAD_BOOST)
+ MESSAGE(STATUS "Enabling use of known bad versions of Boost.")
+endif(ENABLE_BAD_BOOST)
+
+# For any unsuitable Boost version, add the version number below in
+# the following format: XXYYZZ
+# Where:
+# XX is the major version ('10' for version 1)
+# YY is the minor version number ('46' for 1.46)
+# ZZ is the patch version number (typically just '00')
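+# For example, Boost 1.46.1 encodes as 104601 and Boost 1.52.0 as 105200.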
+set(Boost_NOGO_VERSIONS
+ 104600 104601 104700 105200
+ )
+
+foreach(ver ${Boost_NOGO_VERSIONS})
+ if("${Boost_VERSION}" STREQUAL "${ver}")
+ if(NOT ENABLE_BAD_BOOST)
+ MESSAGE(STATUS "WARNING: Found a known bad version of Boost (v${Boost_VERSION}). Disabling.")
+ set(Boost_FOUND FALSE)
+ else(NOT ENABLE_BAD_BOOST)
+ MESSAGE(STATUS "WARNING: Found a known bad version of Boost (v${Boost_VERSION}). Continuing anyway.")
+ set(Boost_FOUND TRUE)
+ endif(NOT ENABLE_BAD_BOOST)
+ endif("${Boost_VERSION}" STREQUAL "${ver}")
+endforeach(ver)
--- /dev/null
+# Copyright 2014 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# VOLK is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# VOLK is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING. If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+
+if(DEFINED __INCLUDED_VOLK_BUILD_TYPES_CMAKE)
+ return()
+endif()
+set(__INCLUDED_VOLK_BUILD_TYPES_CMAKE TRUE)
+
+# Standard CMake Build Types and their basic CFLAGS:
+# - None: nothing set
+# - Debug: -g
+# - Release: -O3 -DNDEBUG
+# - RelWithDebInfo: -O2 -g -DNDEBUG
+# - MinSizeRel: -Os -DNDEBUG
+
+# Additional Build Types, defined below:
+# - NoOptWithASM: -O0 -g -save-temps
+# - O2WithASM: -O2 -g -save-temps
+# - O3WithASM: -O3 -g -save-temps
+# - DebugParanoid: -O0 -g -Wall -Wextra
+
+# Defines the list of acceptable cmake build types. When adding a new
+# build type below, make sure to add it to this list.
+list(APPEND AVAIL_BUILDTYPES
+ None Debug Release RelWithDebInfo MinSizeRel
+ DebugParanoid NoOptWithASM O2WithASM O3WithASM
+)
+
+########################################################################
+# VOLK_CHECK_BUILD_TYPE(build type)
+#
+# Use this to check that the build type set in CMAKE_BUILD_TYPE on the
+# command line is one of the valid build types used by this project. It
+# checks the value set in the cmake interface against the list of
+# known build types in AVAIL_BUILDTYPES. If the build type is found,
+# the function exits immediately. If nothing is found by the end of
+# checking all available build types, we exit with an error and list
+# the available build types.
+########################################################################
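+# For example, volk_check_build_type("Release") and
+# volk_check_build_type("noOptWithASM") both return silently (matching is
+# case-insensitive), while volk_check_build_type("Bogus") aborts the
+# configure step with a FATAL_ERROR listing the valid choices.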
+function(VOLK_CHECK_BUILD_TYPE settype)
+ STRING(TOUPPER ${settype} _settype)
+ foreach(btype ${AVAIL_BUILDTYPES})
+ STRING(TOUPPER ${btype} _btype)
+ if(${_settype} STREQUAL ${_btype})
+ return() # found it; exit cleanly
+ endif(${_settype} STREQUAL ${_btype})
+ endforeach(btype)
+ # Build type not found; error out
+ message(FATAL_ERROR "Build type '${settype}' not valid, must be one of: ${AVAIL_BUILDTYPES}")
+endfunction(VOLK_CHECK_BUILD_TYPE)
+
+########################################################################
+# For GCC and Clang, we can set a build type:
+#
+# -DCMAKE_BUILD_TYPE=DebugParanoid
+#
+# This type uses no optimization (-O0), outputs debug symbols (-g), and
+# enables extensive warnings (-Wall -Wextra).
+# NOTE: This is not defined on Windows systems.
+########################################################################
+if(NOT WIN32)
+ SET(CMAKE_CXX_FLAGS_DEBUGPARANOID "-Wall -Wextra -g -O0" CACHE STRING
+ "Flags used by the C++ compiler during DebugParanoid builds." FORCE)
+ SET(CMAKE_C_FLAGS_DEBUGPARANOID "-Wall -Wextra -g -O0" CACHE STRING
+ "Flags used by the C compiler during DebugParanoid builds." FORCE)
+ SET(CMAKE_EXE_LINKER_FLAGS_DEBUGPARANOID
+ "-Wl,--warn-unresolved-symbols,--warn-once" CACHE STRING
+ "Flags used for linking binaries during NoOptWithASM builds." FORCE)
+ SET(CMAKE_SHARED_LINKER_FLAGS_DEBUGPARANOID
+ "-Wl,--warn-unresolved-symbols,--warn-once" CACHE STRING
+ "Flags used by the shared lib linker during NoOptWithASM builds." FORCE)
+
+ MARK_AS_ADVANCED(
+ CMAKE_CXX_FLAGS_DEBUGPARANOID
+ CMAKE_C_FLAGS_DEBUGPARANOID
+ CMAKE_EXE_LINKER_FLAGS_DEBUGPARANOID
+ CMAKE_SHARED_LINKER_FLAGS_DEBUGPARANOID)
+endif(NOT WIN32)
+
+
+########################################################################
+# For GCC and Clang, we can set a build type:
+#
+# -DCMAKE_BUILD_TYPE=NoOptWithASM
+#
+# This type uses no optimization (-O0), outputs debug symbols (-g) and
+# outputs all intermediary files the build system produces, including
+# all assembly (.s) files. Look in the build directory for these
+# files.
+# NOTE: This is not defined on Windows systems.
+########################################################################
+if(NOT WIN32)
+ SET(CMAKE_CXX_FLAGS_NOOPTWITHASM "-save-temps -g -O0" CACHE STRING
+ "Flags used by the C++ compiler during NoOptWithASM builds." FORCE)
+ SET(CMAKE_C_FLAGS_NOOPTWITHASM "-save-temps -g -O0" CACHE STRING
+ "Flags used by the C compiler during NoOptWithASM builds." FORCE)
+ SET(CMAKE_EXE_LINKER_FLAGS_NOOPTWITHASM
+ "-Wl,--warn-unresolved-symbols,--warn-once" CACHE STRING
+ "Flags used for linking binaries during NoOptWithASM builds." FORCE)
+ SET(CMAKE_SHARED_LINKER_FLAGS_NOOPTWITHASM
+ "-Wl,--warn-unresolved-symbols,--warn-once" CACHE STRING
+ "Flags used by the shared lib linker during NoOptWithASM builds." FORCE)
+
+ MARK_AS_ADVANCED(
+ CMAKE_CXX_FLAGS_NOOPTWITHASM
+ CMAKE_C_FLAGS_NOOPTWITHASM
+ CMAKE_EXE_LINKER_FLAGS_NOOPTWITHASM
+ CMAKE_SHARED_LINKER_FLAGS_NOOPTWITHASM)
+endif(NOT WIN32)
+
+
+########################################################################
+# For GCC and Clang, we can set a build type:
+#
+# -DCMAKE_BUILD_TYPE=O2WithASM
+#
+# This type uses level 2 optimization (-O2), outputs debug symbols
+# (-g) and outputs all intermediary files the build system produces,
+# including all assembly (.s) files. Look in the build directory for
+# these files.
+# NOTE: This is not defined on Windows systems.
+########################################################################
+
+if(NOT WIN32)
+ SET(CMAKE_CXX_FLAGS_O2WITHASM "-save-temps -g -O2" CACHE STRING
+ "Flags used by the C++ compiler during O2WithASM builds." FORCE)
+ SET(CMAKE_C_FLAGS_O2WITHASM "-save-temps -g -O2" CACHE STRING
+ "Flags used by the C compiler during O2WithASM builds." FORCE)
+ SET(CMAKE_EXE_LINKER_FLAGS_O2WITHASM
+ "-Wl,--warn-unresolved-symbols,--warn-once" CACHE STRING
+ "Flags used for linking binaries during O2WithASM builds." FORCE)
+ SET(CMAKE_SHARED_LINKER_FLAGS_O2WITHASM
+ "-Wl,--warn-unresolved-symbols,--warn-once" CACHE STRING
+ "Flags used by the shared lib linker during O2WithASM builds." FORCE)
+
+ MARK_AS_ADVANCED(
+ CMAKE_CXX_FLAGS_O2WITHASM
+ CMAKE_C_FLAGS_O2WITHASM
+ CMAKE_EXE_LINKER_FLAGS_O2WITHASM
+ CMAKE_SHARED_LINKER_FLAGS_O2WITHASM)
+endif(NOT WIN32)
+
+
+########################################################################
+# For GCC and Clang, we can set a build type:
+#
+# -DCMAKE_BUILD_TYPE=O3WithASM
+#
+# This type uses level 3 optimization (-O3), outputs debug symbols
+# (-g) and outputs all intermediary files the build system produces,
+# including all assembly (.s) files. Look in the build directory for
+# these files.
+# NOTE: This is not defined on Windows systems.
+########################################################################
+
+if(NOT WIN32)
+ SET(CMAKE_CXX_FLAGS_O3WITHASM "-save-temps -g -O3" CACHE STRING
+ "Flags used by the C++ compiler during O3WithASM builds." FORCE)
+ SET(CMAKE_C_FLAGS_O3WITHASM "-save-temps -g -O3" CACHE STRING
+ "Flags used by the C compiler during O3WithASM builds." FORCE)
+ SET(CMAKE_EXE_LINKER_FLAGS_O3WITHASM
+ "-Wl,--warn-unresolved-symbols,--warn-once" CACHE STRING
+ "Flags used for linking binaries during O3WithASM builds." FORCE)
+ SET(CMAKE_SHARED_LINKER_FLAGS_O3WITHASM
+ "-Wl,--warn-unresolved-symbols,--warn-once" CACHE STRING
+ "Flags used by the shared lib linker during O3WithASM builds." FORCE)
+
+ MARK_AS_ADVANCED(
+ CMAKE_CXX_FLAGS_O3WITHASM
+ CMAKE_C_FLAGS_O3WITHASM
+ CMAKE_EXE_LINKER_FLAGS_O3WITHASM
+ CMAKE_SHARED_LINKER_FLAGS_O3WITHASM)
+endif(NOT WIN32)
--- /dev/null
+INCLUDE(FindPkgConfig)
+PKG_CHECK_MODULES(PC_VOLK volk)
+
+FIND_PATH(
+ VOLK_INCLUDE_DIRS
+ NAMES volk/volk.h
+ HINTS $ENV{VOLK_DIR}/include
+ ${PC_VOLK_INCLUDEDIR}
+ PATHS /usr/local/include
+ /usr/include
+ "@CMAKE_INSTALL_PREFIX@/include"
+)
+
+FIND_LIBRARY(
+ VOLK_LIBRARIES
+ NAMES volk
+ HINTS $ENV{VOLK_DIR}/lib
+ ${PC_VOLK_LIBDIR}
+ PATHS /usr/local/lib
+ /usr/local/lib64
+ /usr/lib
+ /usr/lib64
+ "@CMAKE_INSTALL_PREFIX@/lib"
+)
+
+INCLUDE(FindPackageHandleStandardArgs)
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(VOLK DEFAULT_MSG VOLK_LIBRARIES VOLK_INCLUDE_DIRS)
+MARK_AS_ADVANCED(VOLK_LIBRARIES VOLK_INCLUDE_DIRS)
--- /dev/null
+# Copyright 2014 Free Software Foundation, Inc.
+#
+# This file is part of VOLK.
+#
+# VOLK is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# VOLK is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with VOLK; see the file COPYING. If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+
+set(MAJOR_VERSION @VERSION_INFO_MAJOR_VERSION@)
+set(MINOR_VERSION @VERSION_INFO_MINOR_VERSION@)
+set(MAINT_VERSION @VERSION_INFO_MAINT_VERSION@)
+
+set(PACKAGE_VERSION
+ ${MAJOR_VERSION}.${MINOR_VERSION}.${MAINT_VERSION})
+
+if(${PACKAGE_FIND_VERSION_MAJOR} EQUAL ${MAJOR_VERSION})
+ if(${PACKAGE_FIND_VERSION_MINOR} EQUAL ${MINOR_VERSION})
+ if(NOT ${PACKAGE_FIND_VERSION_PATCH} GREATER ${MAINT_VERSION})
+ set(PACKAGE_VERSION_EXACT 1) # exact match for API version
+ set(PACKAGE_VERSION_COMPATIBLE 1) # compat for minor/patch version
+ endif(NOT ${PACKAGE_FIND_VERSION_PATCH} GREATER ${MAINT_VERSION})
+ endif(${PACKAGE_FIND_VERSION_MINOR} EQUAL ${MINOR_VERSION})
+endif(${PACKAGE_FIND_VERSION_MAJOR} EQUAL ${MAJOR_VERSION})
--- /dev/null
+# Copyright 2010-2011,2013 Free Software Foundation, Inc.
+#
+# This file is part of GNU Radio
+#
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING. If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+
+if(DEFINED __INCLUDED_VOLK_PYTHON_CMAKE)
+ return()
+endif()
+set(__INCLUDED_VOLK_PYTHON_CMAKE TRUE)
+
+########################################################################
+# Setup the python interpreter:
+# This allows the user to specify a specific interpreter,
+# or finds the interpreter via the built-in cmake module.
+########################################################################
+#this allows the user to override PYTHON_EXECUTABLE
+if(PYTHON_EXECUTABLE)
+
+ set(PYTHONINTERP_FOUND TRUE)
+
+#otherwise if not set, try to automatically find it
+else(PYTHON_EXECUTABLE)
+
+ #use the built-in find script
+ find_package(PythonInterp 2)
+
+ #and if that fails use the find program routine
+ if(NOT PYTHONINTERP_FOUND)
+ find_program(PYTHON_EXECUTABLE NAMES python python2 python2.7 python2.6 python2.5)
+ if(PYTHON_EXECUTABLE)
+ set(PYTHONINTERP_FOUND TRUE)
+ endif(PYTHON_EXECUTABLE)
+ endif(NOT PYTHONINTERP_FOUND)
+
+endif(PYTHON_EXECUTABLE)
+
+#make the path to the executable appear in the cmake gui
+set(PYTHON_EXECUTABLE ${PYTHON_EXECUTABLE} CACHE FILEPATH "python interpreter")
+
+#make sure we can use -B with python (introduced in 2.6)
+if(PYTHON_EXECUTABLE)
+ execute_process(
+ COMMAND ${PYTHON_EXECUTABLE} -B -c ""
+ OUTPUT_QUIET ERROR_QUIET
+ RESULT_VARIABLE PYTHON_HAS_DASH_B_RESULT
+ )
+ if(PYTHON_HAS_DASH_B_RESULT EQUAL 0)
+ set(PYTHON_DASH_B "-B")
+ endif()
+endif(PYTHON_EXECUTABLE)
+
+########################################################################
+# Check for the existence of a python module:
+# - desc a string description of the check
+# - mod the name of the module to import
+# - cmd an additional command to run
+# - have the result variable to set
+########################################################################
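+# An illustrative check (the module probed here is hypothetical for this
+# project and only demonstrates the argument pattern):
+#
+#   volk_python_check_module("numpy >= 1.0" numpy "numpy.__version__ >= '1.0'" NUMPY_FOUND)
+#
+# NUMPY_FOUND ends up TRUE when both the import and the assertion succeed,
+# and FALSE otherwise.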
+macro(VOLK_PYTHON_CHECK_MODULE desc mod cmd have)
+ message(STATUS "")
+ message(STATUS "Python checking for ${desc}")
+ execute_process(
+ COMMAND ${PYTHON_EXECUTABLE} -c "
+#########################################
+try: import ${mod}
+except:
+ try: ${mod}
+ except: exit(-1)
+try: assert ${cmd}
+except: exit(-1)
+#########################################"
+ RESULT_VARIABLE ${have}
+ )
+ if(${have} EQUAL 0)
+ message(STATUS "Python checking for ${desc} - found")
+ set(${have} TRUE)
+ else(${have} EQUAL 0)
+ message(STATUS "Python checking for ${desc} - not found")
+ set(${have} FALSE)
+ endif(${have} EQUAL 0)
+endmacro(VOLK_PYTHON_CHECK_MODULE)
+
+########################################################################
+# Sets the python installation directory VOLK_PYTHON_DIR
+########################################################################
+if(NOT DEFINED VOLK_PYTHON_DIR)
+execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "
+from distutils import sysconfig
+print sysconfig.get_python_lib(plat_specific=True, prefix='')
+" OUTPUT_VARIABLE VOLK_PYTHON_DIR OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+endif()
+file(TO_CMAKE_PATH ${VOLK_PYTHON_DIR} VOLK_PYTHON_DIR)
+
+########################################################################
+# Create an always-built target with a unique name
+# Usage: VOLK_UNIQUE_TARGET(<description> <dependencies list>)
+########################################################################
+function(VOLK_UNIQUE_TARGET desc)
+ file(RELATIVE_PATH reldir ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR})
+ execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import re, hashlib
+unique = hashlib.md5('${reldir}${ARGN}').hexdigest()[:5]
+print(re.sub('\\W', '_', '${desc} ${reldir} ' + unique))"
+ OUTPUT_VARIABLE _target OUTPUT_STRIP_TRAILING_WHITESPACE)
+ add_custom_target(${_target} ALL DEPENDS ${ARGN})
+endfunction(VOLK_UNIQUE_TARGET)
+
+########################################################################
+# Install python sources (also builds and installs byte-compiled python)
+########################################################################
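+# Illustrative usage (file and destination names are hypothetical):
+#
+#   volk_python_install(
+#     FILES foo.py bar.py
+#     DESTINATION ${VOLK_PYTHON_DIR}/volk_example
+#     COMPONENT "volk"
+#   )
+#
+# installs the listed sources plus the byte-compiled .pyc/.pyo files
+# produced by the python_compile_helper.py script written at the bottom of
+# this module.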
+function(VOLK_PYTHON_INSTALL)
+ include(CMakeParseArgumentsCopy)
+ CMAKE_PARSE_ARGUMENTS(VOLK_PYTHON_INSTALL "" "DESTINATION;COMPONENT" "FILES;PROGRAMS" ${ARGN})
+
+ ####################################################################
+ if(VOLK_PYTHON_INSTALL_FILES)
+ ####################################################################
+ install(${ARGN}) #installs regular python files
+
+ #create a list of all generated files
+ unset(pysrcfiles)
+ unset(pycfiles)
+ unset(pyofiles)
+ foreach(pyfile ${VOLK_PYTHON_INSTALL_FILES})
+ get_filename_component(pyfile ${pyfile} ABSOLUTE)
+ list(APPEND pysrcfiles ${pyfile})
+
+ #determine if this file is in the source or binary directory
+ file(RELATIVE_PATH source_rel_path ${CMAKE_CURRENT_SOURCE_DIR} ${pyfile})
+ string(LENGTH "${source_rel_path}" source_rel_path_len)
+ file(RELATIVE_PATH binary_rel_path ${CMAKE_CURRENT_BINARY_DIR} ${pyfile})
+ string(LENGTH "${binary_rel_path}" binary_rel_path_len)
+
+ #and set the generated path appropriately
+ if(${source_rel_path_len} GREATER ${binary_rel_path_len})
+ set(pygenfile ${CMAKE_CURRENT_BINARY_DIR}/${binary_rel_path})
+ else()
+ set(pygenfile ${CMAKE_CURRENT_BINARY_DIR}/${source_rel_path})
+ endif()
+ list(APPEND pycfiles ${pygenfile}c)
+ list(APPEND pyofiles ${pygenfile}o)
+
+ #ensure generation path exists
+ get_filename_component(pygen_path ${pygenfile} PATH)
+ file(MAKE_DIRECTORY ${pygen_path})
+
+ endforeach(pyfile)
+
+ #the command to generate the pyc files
+ add_custom_command(
+ DEPENDS ${pysrcfiles} OUTPUT ${pycfiles}
+ COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_BINARY_DIR}/python_compile_helper.py ${pysrcfiles} ${pycfiles}
+ )
+
+ #the command to generate the pyo files
+ add_custom_command(
+ DEPENDS ${pysrcfiles} OUTPUT ${pyofiles}
+ COMMAND ${PYTHON_EXECUTABLE} -O ${CMAKE_BINARY_DIR}/python_compile_helper.py ${pysrcfiles} ${pyofiles}
+ )
+
+ #create install rule and add generated files to target list
+ set(python_install_gen_targets ${pycfiles} ${pyofiles})
+ install(FILES ${python_install_gen_targets}
+ DESTINATION ${VOLK_PYTHON_INSTALL_DESTINATION}
+ COMPONENT ${VOLK_PYTHON_INSTALL_COMPONENT}
+ )
+
+
+ ####################################################################
+ elseif(VOLK_PYTHON_INSTALL_PROGRAMS)
+ ####################################################################
+ file(TO_NATIVE_PATH ${PYTHON_EXECUTABLE} pyexe_native)
+
+ if (CMAKE_CROSSCOMPILING)
+ set(pyexe_native "/usr/bin/env python")
+ endif()
+
+ foreach(pyfile ${VOLK_PYTHON_INSTALL_PROGRAMS})
+ get_filename_component(pyfile_name ${pyfile} NAME)
+ get_filename_component(pyfile ${pyfile} ABSOLUTE)
+ string(REPLACE "${CMAKE_SOURCE_DIR}" "${CMAKE_BINARY_DIR}" pyexefile "${pyfile}.exe")
+ list(APPEND python_install_gen_targets ${pyexefile})
+
+ get_filename_component(pyexefile_path ${pyexefile} PATH)
+ file(MAKE_DIRECTORY ${pyexefile_path})
+
+ add_custom_command(
+ OUTPUT ${pyexefile} DEPENDS ${pyfile}
+ COMMAND ${PYTHON_EXECUTABLE} -c
+ "open('${pyexefile}','w').write('\#!${pyexe_native}\\n'+open('${pyfile}').read())"
+ COMMENT "Shebangin ${pyfile_name}"
+ VERBATIM
+ )
+
+ #on windows, python files need an extension to execute
+ get_filename_component(pyfile_ext ${pyfile} EXT)
+ if(WIN32 AND NOT pyfile_ext)
+ set(pyfile_name "${pyfile_name}.py")
+ endif()
+
+ install(PROGRAMS ${pyexefile} RENAME ${pyfile_name}
+ DESTINATION ${VOLK_PYTHON_INSTALL_DESTINATION}
+ COMPONENT ${VOLK_PYTHON_INSTALL_COMPONENT}
+ )
+ endforeach(pyfile)
+
+ endif()
+
+ VOLK_UNIQUE_TARGET("pygen" ${python_install_gen_targets})
+
+endfunction(VOLK_PYTHON_INSTALL)
+
+########################################################################
+# Write the python helper script that generates byte code files
+########################################################################
+file(WRITE ${CMAKE_BINARY_DIR}/python_compile_helper.py "
+import sys, py_compile
+files = sys.argv[1:]
+srcs, gens = files[:len(files)/2], files[len(files)/2:]
+for src, gen in zip(srcs, gens):
+ py_compile.compile(file=src, cfile=gen, doraise=True)
+")
--- /dev/null
+# Copyright 2014 Free Software Foundation, Inc.
+#
+# This file is part of VOLK.
+#
+# VOLK is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# VOLK is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with VOLK; see the file COPYING. If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+
+if(DEFINED __INCLUDED_VOLK_VERSION_CMAKE)
+ return()
+endif()
+set(__INCLUDED_VOLK_VERSION_CMAKE TRUE)
+
+#eventually, replace version.sh and fill in the variables below
+set(MAJOR_VERSION ${VERSION_INFO_MAJOR_VERSION})
+set(MINOR_VERSION ${VERSION_INFO_MINOR_VERSION})
+set(MAINT_VERSION ${VERSION_INFO_MAINT_VERSION})
+
+########################################################################
+# Extract the version string from git describe.
+########################################################################
+find_package(Git)
+
+if(GIT_FOUND AND EXISTS ${CMAKE_SOURCE_DIR}/.git)
+ message(STATUS "Extracting version information from git describe...")
+ execute_process(
+ COMMAND ${GIT_EXECUTABLE} describe --always --abbrev=8 --long
+ OUTPUT_VARIABLE GIT_DESCRIBE OUTPUT_STRIP_TRAILING_WHITESPACE
+ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+ )
+else()
+ if(NOT VOLK_GIT_COUNT)
+ set(VOLK_GIT_COUNT "0")
+ endif()
+
+ if(NOT VOLK_GIT_HASH)
+ set(VOLK_GIT_HASH "unknown")
+ endif()
+
+ set(GIT_DESCRIBE "v${MAJOR_VERSION}.${MINOR_VERSION}-${VOLK_GIT_COUNT}-${VOLK_GIT_HASH}")
+endif()
+
+########################################################################
+# Use the logic below to set the version constants
+########################################################################
+if("${MINOR_VERSION}" STREQUAL "git")
+ # VERSION: 1.0git-xxx-gxxxxxxxx
+ # DOCVER: 1.0git
+ # LIBVER: 1.0git
+ set(VERSION "${GIT_DESCRIBE}")
+ set(DOCVER "${MAJOR_VERSION}.0${MINOR_VERSION}")
+ set(LIBVER "${MAJOR_VERSION}.0${MINOR_VERSION}")
+ set(RC_MINOR_VERSION "0")
+ set(RC_MAINT_VERSION "0")
+elseif("${MAINT_VERSION}" STREQUAL "git")
+ # VERSION: 1.xgit-xxx-gxxxxxxxx
+ # DOCVER: 1.xgit
+ # LIBVER: 1.xgit
+ set(VERSION "${GIT_DESCRIBE}")
+ set(DOCVER "${MAJOR_VERSION}.${MINOR_VERSION}${MAINT_VERSION}")
+ set(LIBVER "${MAJOR_VERSION}.${MINOR_VERSION}${MAINT_VERSION}")
+ math(EXPR RC_MINOR_VERSION "${MINOR_VERSION} - 1")
+ set(RC_MAINT_VERSION "0")
+else()
+ # This is a numbered release.
+ # VERSION: 1.1{.x}
+ # DOCVER: 1.1{.x}
+ # LIBVER: 1.1{.x}
+ if("${MAINT_VERSION}" STREQUAL "0")
+ set(VERSION "${MAJOR_VERSION}.${MINOR_VERSION}")
+ else()
+ set(VERSION "${MAJOR_VERSION}.${MINOR_VERSION}.${MAINT_VERSION}")
+ endif()
+ set(DOCVER "${VERSION}")
+ set(LIBVER "${VERSION}")
+ set(RC_MINOR_VERSION ${MINOR_VERSION})
+ set(RC_MAINT_VERSION ${MAINT_VERSION})
+endif()
--- /dev/null
+########################################################################
+# Toolchain file for building natively on an ARM Cortex-A8 w/ NEON
+# Usage: cmake -DCMAKE_TOOLCHAIN_FILE=<this file> <source directory>
+########################################################################
+set(CMAKE_CXX_COMPILER g++)
+set(CMAKE_C_COMPILER gcc)
+set(CMAKE_CXX_FLAGS "-march=armv7-a -mtune=cortex-a8 -mfpu=neon -mfloat-abi=softfp" CACHE STRING "" FORCE)
+set(CMAKE_C_FLAGS ${CMAKE_CXX_FLAGS} CACHE STRING "" FORCE) #same flags for C sources
--- /dev/null
+set( CMAKE_SYSTEM_NAME Linux )
+#set( CMAKE_C_COMPILER $ENV{CC} )
+#set( CMAKE_CXX_COMPILER $ENV{CXX} )
+string(REGEX MATCH "sysroots/([a-zA-Z0-9]+)" CMAKE_SYSTEM_PROCESSOR $ENV{SDKTARGETSYSROOT})
+string(REGEX REPLACE "sysroots/" "" CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR})
+set( CMAKE_CXX_FLAGS $ENV{CXXFLAGS} CACHE STRING "" FORCE )
+set( CMAKE_C_FLAGS $ENV{CFLAGS} CACHE STRING "" FORCE ) #same flags for C sources
+set( CMAKE_LDFLAGS_FLAGS ${CMAKE_CXX_FLAGS} CACHE STRING "" FORCE ) #same flags for the linker
+set( CMAKE_LIBRARY_PATH ${OECORE_TARGET_SYSROOT}/usr/lib )
+set( CMAKE_FIND_ROOT_PATH $ENV{OECORE_TARGET_SYSROOT} $ENV{OECORE_NATIVE_SYSROOT} )
+set( CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER )
+set( CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY )
+set( CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY )
+set ( ORC_INCLUDE_DIRS $ENV{OECORE_TARGET_SYSROOT}/usr/include/orc-0.4 )
+set ( ORC_LIBRARY_DIRS $ENV{OECORE_TARGET_SYSROOT}/usr/lib )
--- /dev/null
+# http://www.vtk.org/Wiki/CMake_FAQ#Can_I_do_.22make_uninstall.22_with_CMake.3F
+
+IF(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt")
+ MESSAGE(FATAL_ERROR "Cannot find install manifest: \"@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt\"")
+ENDIF(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt")
+
+FILE(READ "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt" files)
+STRING(REGEX REPLACE "\n" ";" files "${files}")
+FOREACH(file ${files})
+ MESSAGE(STATUS "Uninstalling \"$ENV{DESTDIR}${file}\"")
+ IF(EXISTS "$ENV{DESTDIR}${file}")
+ EXEC_PROGRAM(
+ "@CMAKE_COMMAND@" ARGS "-E remove \"$ENV{DESTDIR}${file}\""
+ OUTPUT_VARIABLE rm_out
+ RETURN_VALUE rm_retval
+ )
+ IF(NOT "${rm_retval}" STREQUAL 0)
+ MESSAGE(FATAL_ERROR "Problem when removing \"$ENV{DESTDIR}${file}\"")
+ ENDIF(NOT "${rm_retval}" STREQUAL 0)
+ ELSEIF(IS_SYMLINK "$ENV{DESTDIR}${file}")
+ EXEC_PROGRAM(
+ "@CMAKE_COMMAND@" ARGS "-E remove \"$ENV{DESTDIR}${file}\""
+ OUTPUT_VARIABLE rm_out
+ RETURN_VALUE rm_retval
+ )
+ IF(NOT "${rm_retval}" STREQUAL 0)
+ MESSAGE(FATAL_ERROR "Problem when removing \"$ENV{DESTDIR}${file}\"")
+ ENDIF(NOT "${rm_retval}" STREQUAL 0)
+ ELSE(EXISTS "$ENV{DESTDIR}${file}")
+ MESSAGE(STATUS "File \"$ENV{DESTDIR}${file}\" does not exist.")
+ ENDIF(EXISTS "$ENV{DESTDIR}${file}")
+ENDFOREACH(file)
--- /dev/null
+#ifndef _MSC_VER // [
+#error "Use this header only with Microsoft Visual C++ compilers!"
+#endif // _MSC_VER ]
+
+#ifndef _MSC_CONFIG_H_ // [
+#define _MSC_CONFIG_H_
+
+////////////////////////////////////////////////////////////////////////
+// enable inline functions for C code
+////////////////////////////////////////////////////////////////////////
+#ifndef __cplusplus
+# define inline __inline
+#endif
+
+////////////////////////////////////////////////////////////////////////
+// signed size_t
+////////////////////////////////////////////////////////////////////////
+#include <stddef.h>
+typedef ptrdiff_t ssize_t;
+
+////////////////////////////////////////////////////////////////////////
+// rint functions
+////////////////////////////////////////////////////////////////////////
+#include <math.h>
+static inline long lrint(double x){return (long)(x > 0.0 ? x + 0.5 : x - 0.5);}
+static inline long lrintf(float x){return (long)(x > 0.0f ? x + 0.5f : x - 0.5f);}
+static inline long long llrint(double x){return (long long)(x > 0.0 ? x + 0.5 : x - 0.5);}
+static inline long long llrintf(float x){return (long long)(x > 0.0f ? x + 0.5f : x - 0.5f);}
+static inline double rint(double x){return (x > 0.0)? floor(x + 0.5) : ceil(x - 0.5);}
+static inline float rintf(float x){return (x > 0.0f)? floorf(x + 0.5f) : ceilf(x - 0.5f);}
+
+////////////////////////////////////////////////////////////////////////
+// math constants
+////////////////////////////////////////////////////////////////////////
+#define INFINITY HUGE_VAL
+
+# define M_E 2.7182818284590452354 /* e */
+# define M_LOG2E 1.4426950408889634074 /* log_2 e */
+# define M_LOG10E 0.43429448190325182765 /* log_10 e */
+# define M_LN2 0.69314718055994530942 /* log_e 2 */
+# define M_LN10 2.30258509299404568402 /* log_e 10 */
+# define M_PI 3.14159265358979323846 /* pi */
+# define M_PI_2 1.57079632679489661923 /* pi/2 */
+# define M_PI_4 0.78539816339744830962 /* pi/4 */
+# define M_1_PI 0.31830988618379067154 /* 1/pi */
+# define M_2_PI 0.63661977236758134308 /* 2/pi */
+# define M_2_SQRTPI 1.12837916709551257390 /* 2/sqrt(pi) */
+# define M_SQRT2 1.41421356237309504880 /* sqrt(2) */
+# define M_SQRT1_2 0.70710678118654752440 /* 1/sqrt(2) */
+
+////////////////////////////////////////////////////////////////////////
+// random and srandom
+////////////////////////////////////////////////////////////////////////
+#include <stdlib.h>
+static inline long int random (void) { return rand(); }
+static inline void srandom (unsigned int seed) { srand(seed); }
+
+#endif // _MSC_CONFIG_H_ ]
--- /dev/null
+// ISO C9x compliant inttypes.h for Microsoft Visual Studio
+// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124
+//
+// Copyright (c) 2006 Alexander Chemeris
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. The name of the author may be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#ifndef _MSC_VER // [
+#error "Use this header only with Microsoft Visual C++ compilers!"
+#endif // _MSC_VER ]
+
+#ifndef _MSC_INTTYPES_H_ // [
+#define _MSC_INTTYPES_H_
+
+#if _MSC_VER > 1000
+#pragma once
+#endif
+
+#include <stdint.h>
+
+// 7.8 Format conversion of integer types
+
+typedef struct {
+ intmax_t quot;
+ intmax_t rem;
+} imaxdiv_t;
+
+// 7.8.1 Macros for format specifiers
+
+// The fprintf macros for signed integers are:
+#define PRId8 "d"
+#define PRIi8 "i"
+#define PRIdLEAST8 "d"
+#define PRIiLEAST8 "i"
+#define PRIdFAST8 "d"
+#define PRIiFAST8 "i"
+
+#define PRId16 "hd"
+#define PRIi16 "hi"
+#define PRIdLEAST16 "hd"
+#define PRIiLEAST16 "hi"
+#define PRIdFAST16 "hd"
+#define PRIiFAST16 "hi"
+
+#define PRId32 "I32d"
+#define PRIi32 "I32i"
+#define PRIdLEAST32 "I32d"
+#define PRIiLEAST32 "I32i"
+#define PRIdFAST32 "I32d"
+#define PRIiFAST32 "I32i"
+
+#define PRId64 "I64d"
+#define PRIi64 "I64i"
+#define PRIdLEAST64 "I64d"
+#define PRIiLEAST64 "I64i"
+#define PRIdFAST64 "I64d"
+#define PRIiFAST64 "I64i"
+
+#define PRIdMAX "I64d"
+#define PRIiMAX "I64i"
+
+#define PRIdPTR "Id"
+#define PRIiPTR "Ii"
+
+// The fprintf macros for unsigned integers are:
+#define PRIo8 "o"
+#define PRIu8 "u"
+#define PRIx8 "x"
+#define PRIX8 "X"
+#define PRIoLEAST8 "o"
+#define PRIuLEAST8 "u"
+#define PRIxLEAST8 "x"
+#define PRIXLEAST8 "X"
+#define PRIoFAST8 "o"
+#define PRIuFAST8 "u"
+#define PRIxFAST8 "x"
+#define PRIXFAST8 "X"
+
+#define PRIo16 "ho"
+#define PRIu16 "hu"
+#define PRIx16 "hx"
+#define PRIX16 "hX"
+#define PRIoLEAST16 "ho"
+#define PRIuLEAST16 "hu"
+#define PRIxLEAST16 "hx"
+#define PRIXLEAST16 "hX"
+#define PRIoFAST16 "ho"
+#define PRIuFAST16 "hu"
+#define PRIxFAST16 "hx"
+#define PRIXFAST16 "hX"
+
+#define PRIo32 "I32o"
+#define PRIu32 "I32u"
+#define PRIx32 "I32x"
+#define PRIX32 "I32X"
+#define PRIoLEAST32 "I32o"
+#define PRIuLEAST32 "I32u"
+#define PRIxLEAST32 "I32x"
+#define PRIXLEAST32 "I32X"
+#define PRIoFAST32 "I32o"
+#define PRIuFAST32 "I32u"
+#define PRIxFAST32 "I32x"
+#define PRIXFAST32 "I32X"
+
+#define PRIo64 "I64o"
+#define PRIu64 "I64u"
+#define PRIx64 "I64x"
+#define PRIX64 "I64X"
+#define PRIoLEAST64 "I64o"
+#define PRIuLEAST64 "I64u"
+#define PRIxLEAST64 "I64x"
+#define PRIXLEAST64 "I64X"
+#define PRIoFAST64 "I64o"
+#define PRIuFAST64 "I64u"
+#define PRIxFAST64 "I64x"
+#define PRIXFAST64 "I64X"
+
+#define PRIoMAX "I64o"
+#define PRIuMAX "I64u"
+#define PRIxMAX "I64x"
+#define PRIXMAX "I64X"
+
+#define PRIoPTR "Io"
+#define PRIuPTR "Iu"
+#define PRIxPTR "Ix"
+#define PRIXPTR "IX"
+
+// The fscanf macros for signed integers are:
+#define SCNd8 "d"
+#define SCNi8 "i"
+#define SCNdLEAST8 "d"
+#define SCNiLEAST8 "i"
+#define SCNdFAST8 "d"
+#define SCNiFAST8 "i"
+
+#define SCNd16 "hd"
+#define SCNi16 "hi"
+#define SCNdLEAST16 "hd"
+#define SCNiLEAST16 "hi"
+#define SCNdFAST16 "hd"
+#define SCNiFAST16 "hi"
+
+#define SCNd32 "ld"
+#define SCNi32 "li"
+#define SCNdLEAST32 "ld"
+#define SCNiLEAST32 "li"
+#define SCNdFAST32 "ld"
+#define SCNiFAST32 "li"
+
+#define SCNd64 "I64d"
+#define SCNi64 "I64i"
+#define SCNdLEAST64 "I64d"
+#define SCNiLEAST64 "I64i"
+#define SCNdFAST64 "I64d"
+#define SCNiFAST64 "I64i"
+
+#define SCNdMAX "I64d"
+#define SCNiMAX "I64i"
+
+#ifdef _WIN64 // [
+# define SCNdPTR "I64d"
+# define SCNiPTR "I64i"
+#else // _WIN64 ][
+# define SCNdPTR "ld"
+# define SCNiPTR "li"
+#endif // _WIN64 ]
+
+// The fscanf macros for unsigned integers are:
+#define SCNo8 "o"
+#define SCNu8 "u"
+#define SCNx8 "x"
+#define SCNX8 "X"
+#define SCNoLEAST8 "o"
+#define SCNuLEAST8 "u"
+#define SCNxLEAST8 "x"
+#define SCNXLEAST8 "X"
+#define SCNoFAST8 "o"
+#define SCNuFAST8 "u"
+#define SCNxFAST8 "x"
+#define SCNXFAST8 "X"
+
+#define SCNo16 "ho"
+#define SCNu16 "hu"
+#define SCNx16 "hx"
+#define SCNX16 "hX"
+#define SCNoLEAST16 "ho"
+#define SCNuLEAST16 "hu"
+#define SCNxLEAST16 "hx"
+#define SCNXLEAST16 "hX"
+#define SCNoFAST16 "ho"
+#define SCNuFAST16 "hu"
+#define SCNxFAST16 "hx"
+#define SCNXFAST16 "hX"
+
+#define SCNo32 "lo"
+#define SCNu32 "lu"
+#define SCNx32 "lx"
+#define SCNX32 "lX"
+#define SCNoLEAST32 "lo"
+#define SCNuLEAST32 "lu"
+#define SCNxLEAST32 "lx"
+#define SCNXLEAST32 "lX"
+#define SCNoFAST32 "lo"
+#define SCNuFAST32 "lu"
+#define SCNxFAST32 "lx"
+#define SCNXFAST32 "lX"
+
+#define SCNo64 "I64o"
+#define SCNu64 "I64u"
+#define SCNx64 "I64x"
+#define SCNX64 "I64X"
+#define SCNoLEAST64 "I64o"
+#define SCNuLEAST64 "I64u"
+#define SCNxLEAST64 "I64x"
+#define SCNXLEAST64 "I64X"
+#define SCNoFAST64 "I64o"
+#define SCNuFAST64 "I64u"
+#define SCNxFAST64 "I64x"
+#define SCNXFAST64 "I64X"
+
+#define SCNoMAX "I64o"
+#define SCNuMAX "I64u"
+#define SCNxMAX "I64x"
+#define SCNXMAX "I64X"
+
+#ifdef _WIN64 // [
+# define SCNoPTR "I64o"
+# define SCNuPTR "I64u"
+# define SCNxPTR "I64x"
+# define SCNXPTR "I64X"
+#else // _WIN64 ][
+# define SCNoPTR "lo"
+# define SCNuPTR "lu"
+# define SCNxPTR "lx"
+# define SCNXPTR "lX"
+#endif // _WIN64 ]
+
+// 7.8.2 Functions for greatest-width integer types
+
+// 7.8.2.1 The imaxabs function
+#define imaxabs _abs64
+
+// 7.8.2.2 The imaxdiv function
+
+// This is a modified version of the div() function from Microsoft's div.c found
+// in %MSVC.NET%\crt\src\div.c
+#ifdef STATIC_IMAXDIV // [
+static
+#else // STATIC_IMAXDIV ][
+_inline
+#endif // STATIC_IMAXDIV ]
+imaxdiv_t __cdecl imaxdiv(intmax_t numer, intmax_t denom)
+{
+ imaxdiv_t result;
+
+ result.quot = numer / denom;
+ result.rem = numer % denom;
+
+ if (numer < 0 && result.rem > 0) {
+ // did division wrong; must fix up
+ ++result.quot;
+ result.rem -= denom;
+ }
+
+ return result;
+}
+
+// 7.8.2.3 The strtoimax and strtoumax functions
+#define strtoimax _strtoi64
+#define strtoumax _strtoui64
+
+// 7.8.2.4 The wcstoimax and wcstoumax functions
+#define wcstoimax _wcstoi64
+#define wcstoumax _wcstoui64
+
+
+#endif // _MSC_INTTYPES_H_ ]
--- /dev/null
+/*
+ * Copyright (C) 2005, 2006 Apple Computer, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public License
+ * along with this library; see the file COPYING.LIB. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ *
+ */
+
+#ifndef STDBOOL_WIN32_H
+#define STDBOOL_WIN32_H
+
+#ifndef _MSC_VER // [
+#error "Use this header only with Microsoft Visual C++ compilers!"
+#endif // _MSC_VER ]
+
+#ifndef __cplusplus
+
+typedef unsigned char bool;
+
+#define true 1
+#define false 0
+
+#ifndef CASSERT
+#define CASSERT(exp, name) typedef int dummy##name [(exp) ? 1 : -1];
+#endif
+
+CASSERT(sizeof(bool) == 1, bool_is_one_byte)
+CASSERT(true, true_is_true)
+CASSERT(!false, false_is_false)
+
+#endif
+
+#endif
--- /dev/null
+// ISO C9x compliant stdint.h for Microsoft Visual Studio
+// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124
+//
+// Copyright (c) 2006-2008 Alexander Chemeris
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. The name of the author may be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#ifndef _MSC_VER // [
+#error "Use this header only with Microsoft Visual C++ compilers!"
+#endif // _MSC_VER ]
+
+#ifndef _MSC_STDINT_H_ // [
+#define _MSC_STDINT_H_
+
+#if _MSC_VER > 1000
+#pragma once
+#endif
+
+#include <limits.h>
+
+// For Visual Studio 6 in C++ mode and for many Visual Studio versions when
+// compiling for ARM we should wrap the <wchar.h> include with 'extern "C++" {}'
+// or the compiler gives many errors like this:
+// error C2733: second C linkage of overloaded function 'wmemchr' not allowed
+#ifdef __cplusplus
+extern "C" {
+#endif
+# include <wchar.h>
+#ifdef __cplusplus
+}
+#endif
+
+// Define _W64 macros to mark types changing their size, like intptr_t.
+#ifndef _W64
+# if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300
+# define _W64 __w64
+# else
+# define _W64
+# endif
+#endif
+
+
+// 7.18.1 Integer types
+
+// 7.18.1.1 Exact-width integer types
+
+// Visual Studio 6 and Embedded Visual C++ 4 don't
+// realize that, e.g., char has the same size as __int8,
+// so we give up on __intX for them.
+#if (_MSC_VER < 1300)
+ typedef signed char int8_t;
+ typedef signed short int16_t;
+ typedef signed int int32_t;
+ typedef unsigned char uint8_t;
+ typedef unsigned short uint16_t;
+ typedef unsigned int uint32_t;
+#else
+ typedef signed __int8 int8_t;
+ typedef signed __int16 int16_t;
+ typedef signed __int32 int32_t;
+ typedef unsigned __int8 uint8_t;
+ typedef unsigned __int16 uint16_t;
+ typedef unsigned __int32 uint32_t;
+#endif
+typedef signed __int64 int64_t;
+typedef unsigned __int64 uint64_t;
+
+
+// 7.18.1.2 Minimum-width integer types
+typedef int8_t int_least8_t;
+typedef int16_t int_least16_t;
+typedef int32_t int_least32_t;
+typedef int64_t int_least64_t;
+typedef uint8_t uint_least8_t;
+typedef uint16_t uint_least16_t;
+typedef uint32_t uint_least32_t;
+typedef uint64_t uint_least64_t;
+
+// 7.18.1.3 Fastest minimum-width integer types
+typedef int8_t int_fast8_t;
+typedef int16_t int_fast16_t;
+typedef int32_t int_fast32_t;
+typedef int64_t int_fast64_t;
+typedef uint8_t uint_fast8_t;
+typedef uint16_t uint_fast16_t;
+typedef uint32_t uint_fast32_t;
+typedef uint64_t uint_fast64_t;
+
+// 7.18.1.4 Integer types capable of holding object pointers
+#ifdef _WIN64 // [
+ typedef signed __int64 intptr_t;
+ typedef unsigned __int64 uintptr_t;
+#else // _WIN64 ][
+ typedef _W64 signed int intptr_t;
+ typedef _W64 unsigned int uintptr_t;
+#endif // _WIN64 ]
+
+// 7.18.1.5 Greatest-width integer types
+typedef int64_t intmax_t;
+typedef uint64_t uintmax_t;
+
+
+// 7.18.2 Limits of specified-width integer types
+
+#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259
+
+// 7.18.2.1 Limits of exact-width integer types
+#define INT8_MIN ((int8_t)_I8_MIN)
+#define INT8_MAX _I8_MAX
+#define INT16_MIN ((int16_t)_I16_MIN)
+#define INT16_MAX _I16_MAX
+#define INT32_MIN ((int32_t)_I32_MIN)
+#define INT32_MAX _I32_MAX
+#define INT64_MIN ((int64_t)_I64_MIN)
+#define INT64_MAX _I64_MAX
+#define UINT8_MAX _UI8_MAX
+#define UINT16_MAX _UI16_MAX
+#define UINT32_MAX _UI32_MAX
+#define UINT64_MAX _UI64_MAX
+
+// 7.18.2.2 Limits of minimum-width integer types
+#define INT_LEAST8_MIN INT8_MIN
+#define INT_LEAST8_MAX INT8_MAX
+#define INT_LEAST16_MIN INT16_MIN
+#define INT_LEAST16_MAX INT16_MAX
+#define INT_LEAST32_MIN INT32_MIN
+#define INT_LEAST32_MAX INT32_MAX
+#define INT_LEAST64_MIN INT64_MIN
+#define INT_LEAST64_MAX INT64_MAX
+#define UINT_LEAST8_MAX UINT8_MAX
+#define UINT_LEAST16_MAX UINT16_MAX
+#define UINT_LEAST32_MAX UINT32_MAX
+#define UINT_LEAST64_MAX UINT64_MAX
+
+// 7.18.2.3 Limits of fastest minimum-width integer types
+#define INT_FAST8_MIN INT8_MIN
+#define INT_FAST8_MAX INT8_MAX
+#define INT_FAST16_MIN INT16_MIN
+#define INT_FAST16_MAX INT16_MAX
+#define INT_FAST32_MIN INT32_MIN
+#define INT_FAST32_MAX INT32_MAX
+#define INT_FAST64_MIN INT64_MIN
+#define INT_FAST64_MAX INT64_MAX
+#define UINT_FAST8_MAX UINT8_MAX
+#define UINT_FAST16_MAX UINT16_MAX
+#define UINT_FAST32_MAX UINT32_MAX
+#define UINT_FAST64_MAX UINT64_MAX
+
+// 7.18.2.4 Limits of integer types capable of holding object pointers
+#ifdef _WIN64 // [
+# define INTPTR_MIN INT64_MIN
+# define INTPTR_MAX INT64_MAX
+# define UINTPTR_MAX UINT64_MAX
+#else // _WIN64 ][
+# define INTPTR_MIN INT32_MIN
+# define INTPTR_MAX INT32_MAX
+# define UINTPTR_MAX UINT32_MAX
+#endif // _WIN64 ]
+
+// 7.18.2.5 Limits of greatest-width integer types
+#define INTMAX_MIN INT64_MIN
+#define INTMAX_MAX INT64_MAX
+#define UINTMAX_MAX UINT64_MAX
+
+// 7.18.3 Limits of other integer types
+
+#ifdef _WIN64 // [
+# define PTRDIFF_MIN _I64_MIN
+# define PTRDIFF_MAX _I64_MAX
+#else // _WIN64 ][
+# define PTRDIFF_MIN _I32_MIN
+# define PTRDIFF_MAX _I32_MAX
+#endif // _WIN64 ]
+
+#define SIG_ATOMIC_MIN INT_MIN
+#define SIG_ATOMIC_MAX INT_MAX
+
+#ifndef SIZE_MAX // [
+# ifdef _WIN64 // [
+# define SIZE_MAX _UI64_MAX
+# else // _WIN64 ][
+# define SIZE_MAX _UI32_MAX
+# endif // _WIN64 ]
+#endif // SIZE_MAX ]
+
+// WCHAR_MIN and WCHAR_MAX are also defined in <wchar.h>
+#ifndef WCHAR_MIN // [
+# define WCHAR_MIN 0
+#endif // WCHAR_MIN ]
+#ifndef WCHAR_MAX // [
+# define WCHAR_MAX _UI16_MAX
+#endif // WCHAR_MAX ]
+
+#define WINT_MIN 0
+#define WINT_MAX _UI16_MAX
+
+#endif // __STDC_LIMIT_MACROS ]
+
+
+// 7.18.4 Limits of other integer types
+
+#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260
+
+// 7.18.4.1 Macros for minimum-width integer constants
+
+#define INT8_C(val) val##i8
+#define INT16_C(val) val##i16
+#define INT32_C(val) val##i32
+#define INT64_C(val) val##i64
+
+#define UINT8_C(val) val##ui8
+#define UINT16_C(val) val##ui16
+#define UINT32_C(val) val##ui32
+#define UINT64_C(val) val##ui64
+
+// 7.18.4.2 Macros for greatest-width integer constants
+#ifndef INTMAX_C
+#define INTMAX_C INT64_C
+#endif
+#ifndef UINTMAX_C
+#define UINTMAX_C UINT64_C
+#endif
+
+#endif // __STDC_CONSTANT_MACROS ]
+
+
+#endif // _MSC_STDINT_H_ ]
--- /dev/null
+/*! \page extending_volk Extending VOLK
+
+There are two primary routes for extending VOLK for your own use. The
+preferred route is by writing kernels and proto-kernels as part of this
+repository and sending patches upstream. The alternative is creating your
+own VOLK module. The information for modifying this repository will be
+useful in both cases.
+
+## Modifying this repository
+
+### Adding kernels
+
+Adding a kernel means introducing a new function, presumably a useful math
+operation, to the VOLK API. The first step is to create the kernel file in
+volk/kernels/volk, following the naming scheme described on the VOLK concepts,
+terms, and techniques page. Start by writing the generic protokernel.
+
+The generic protokernel should be written in plain C using explicitly sized
+types from stdint.h or volk_complex.h when appropriate. volk_complex.h
+includes explicitly sized complex types for floats and ints. The name of
+the generic kernel should be volk_signature_from_file_generic. If multiple
+versions of the generic kernel exist, a description can be appended to
+generic_; alignment flags are not required in the generic protokernel name.
+The entire generic function must be surrounded by preprocessor ifdef fences
+on the symbol LV_HAVE_GENERIC.
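+
+As a rough sketch (the kernel name here is hypothetical and only illustrates
+the layout), a generic protokernel might look like:
+
+\code
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_s32f_add_32f_generic(float* cVector, const float* aVector,
+                              const float scalar, unsigned int num_points)
+{
+  unsigned int number;
+  for(number = 0; number < num_points; number++){
+    cVector[number] = aVector[number] + scalar;  // plain C, one item at a time
+  }
+}
+
+#endif // LV_HAVE_GENERIC
+\endcode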
+
+Finally, add the kernel to the list of test cases in volk/lib/kernel_tests.h.
+Many kernels should be able to use the default test parameters, but if yours
+requires a lower tolerance, specific vector length, or other test parameters
+just create a new instance of volk_test_params_t for your kernel.
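+
+For illustration only (the macro name and the shared test_params instance are
+assumptions; check lib/kernel_tests.h for the exact form), an entry for the
+hypothetical kernel above might look like:
+
+\code
+// a kernel that can share the default test parameters:
+(VOLK_INIT_TEST(volk_32f_s32f_add_32f, test_params))
+
+// a kernel that needs, say, a looser tolerance would instead pass its own
+// volk_test_params_t instance in place of test_params.
+\endcode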
+
+### Adding protokernels
+
+The primary purpose of VOLK is to have multiple implementations of an operation,
+each tuned for a specific CPU architecture. Ideally there is at least one
+protokernel of each kernel for every architecture that VOLK supports.
+The pattern for protokernel naming is volk_kernel_signature_architecture_nick.
+The architecture should be one of the supported VOLK architectures. The nick is
+an optional name to distinguish between multiple implementations for a
+particular architecture.
+
+Architecture-specific protokernels can be written in one of three ways.
+The first approach should always be to use compiler intrinsic functions.
+The second and third approaches are using either in-line assembly or
+assembly with .S files. Both methods of writing assembly exist in VOLK and
+should yield equivalent performance; which method you might choose is a
+matter of opinion. Regardless of the actual method, the public function should
+be declared in the kernel header, surrounded by ifdef fences on the symbol that
+matches the architecture of the implementation.
+
+#### Compiler Intrinsics
+
+Compiler intrinsics should be treated as functions that map to a specific
+assembly instruction. Most VOLK kernels take the form of a loop that iterates
+through a vector. Form a loop that iterates on a number of items that is natural
+for the architecture and then use compiler intrinsics to do the math for your
+operation or algorithm. Include the appropriate header inside the ifdef fences,
+but before your protokernel declaration.
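+
+A minimal sketch of an SSE protokernel for the hypothetical scalar-add kernel
+above (unaligned loads and stores, four floats per iteration) might look like:
+
+\code
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_s32f_add_32f_u_sse(float* cVector, const float* aVector,
+                            const float scalar, unsigned int num_points)
+{
+  unsigned int number;
+  const unsigned int quarterPoints = num_points / 4;
+  const __m128 scalarVal = _mm_set1_ps(scalar);  // broadcast scalar to 4 lanes
+
+  for(number = 0; number < quarterPoints; number++){
+    __m128 aVal = _mm_loadu_ps(aVector);         // load 4 unaligned floats
+    __m128 cVal = _mm_add_ps(aVal, scalarVal);   // add the scalar to each lane
+    _mm_storeu_ps(cVector, cVal);                // store 4 unaligned floats
+    aVector += 4;
+    cVector += 4;
+  }
+  for(number = quarterPoints * 4; number < num_points; number++){
+    *cVector++ = *aVector++ + scalar;            // clean up the remainder
+  }
+}
+#endif // LV_HAVE_SSE
+\endcode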
+
+
+#### In-line Assembly
+
+In-line assembly uses a compiler macro to include verbatim assembly with C code.
+The process of writing in-line assembly protokernels is very similar to writing
+protokernels based on intrinsics.
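+
+As a sketch (not taken from the VOLK tree, compiling only on x86 targets, and
+omitting the LV_HAVE_<arch> fences a real protokernel would sit inside), GCC
+extended asm embedded in a protokernel-style loop looks like:
+
+\code
+#include <stdint.h>
+
+static inline void
+volk_32u_byteswap_example_asm(uint32_t* intsToSwap, unsigned int num_points)
+{
+  unsigned int number;
+  for(number = 0; number < num_points; number++){
+    uint32_t word = intsToSwap[number];
+    __asm__("bswap %0" : "+r"(word));  // verbatim assembly mixed with C code
+    intsToSwap[number] = word;
+  }
+}
+\endcode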
+
+#### Assembly with .S files
+
+To write pure assembly protokernels, first declare the function name in the
+kernel header file the same way as any other protokernel, but include the extern
+keyword. Second, create a file (one for each protokernel) in
+volk/kernels/volk/asm/$arch. Disassemble another protokernel and copy the
+disassembled code into this file to bootstrap a working implementation. Often
+the disassembled code can be hand-tuned to improve performance.
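+
+For example, the declaration in the kernel header for an assembly protokernel
+(the names here are illustrative) would look like:
+
+\code
+#ifdef LV_HAVE_NEON
+extern void
+volk_32f_s32f_add_32f_neonasm(float* cVector, const float* aVector,
+                              const float scalar, unsigned int num_points);
+#endif // LV_HAVE_NEON
+\endcode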
+
+## VOLK Modules
+
+VOLK has a concept of modules. Each module is an independent VOLK tree. Modules
+can be managed with the volk_modtool application. At a high level, a module is
+a clone of all of the VOLK machinery without the kernels. volk_modtool also makes it
+easy to copy kernels to a module.
+
+Kernels and protokernels are added to your own VOLK module the same way they are
+added to this repository, which was described in the previous section.
+
+*/
+
--- /dev/null
+/*! \page kernels Kernels
+
+\li \subpage volk_32fc_x2_dot_prod_32fc
+\li \subpage volk_32fc_32f_dot_prod_32fc
+\li \subpage volk_32f_x2_dot_prod_32f
+\li \subpage volk_32f_x2_dot_prod_16i
+\li \subpage volk_16i_32fc_dot_prod_32fc
+\li \subpage volk_32fc_x2_conjugate_dot_prod_32fc
+\li \subpage volk_16u_byteswap
+\li \subpage volk_32f_convert_64f
+\li \subpage volk_32f_s32f_32f_fm_detect_32f
+\li \subpage volk_32f_s32f_normalize
+\li \subpage volk_32f_s32f_stddev_32f
+\li \subpage volk_32fc_s32f_atan2_32f
+\li \subpage volk_32fc_s32fc_multiply_32fc
+\li \subpage volk_32fc_s32fc_x2_rotator_32fc
+\li \subpage volk_32u_byteswap
+\li \subpage volk_64f_convert_32f
+\li \subpage volk_64u_byteswap
+\li \subpage volk_8ic_x2_s32f_multiply_conjugate_32fc
+\li \subpage volk_16i_branch_4_state_8
+\li \subpage volk_16ic_deinterleave_16i_x2
+\li \subpage volk_16ic_deinterleave_real_16i
+\li \subpage volk_16ic_deinterleave_real_8i
+\li \subpage volk_16ic_magnitude_16i
+\li \subpage volk_16i_convert_8i
+\li \subpage volk_16ic_s32f_deinterleave_32f_x2
+\li \subpage volk_16ic_s32f_deinterleave_real_32f
+\li \subpage volk_16ic_s32f_magnitude_32f
+\li \subpage volk_16i_max_star_16i
+\li \subpage volk_16i_max_star_horizontal_16i
+\li \subpage volk_16i_permute_and_scalar_add
+\li \subpage volk_16i_s32f_convert_32f
+\li \subpage volk_16i_x4_quad_max_star_16i
+\li \subpage volk_16i_x5_add_quad_16i_x4
+\li \subpage volk_32f_accumulator_s32f
+\li \subpage volk_32f_acos_32f
+\li \subpage volk_32f_asin_32f
+\li \subpage volk_32f_atan_32f
+\li \subpage volk_32f_binary_slicer_32i
+\li \subpage volk_32f_binary_slicer_8i
+\li \subpage volk_32fc_32f_multiply_32fc
+\li \subpage volk_32fc_conjugate_32fc
+\li \subpage volk_32fc_deinterleave_32f_x2
+\li \subpage volk_32fc_deinterleave_64f_x2
+\li \subpage volk_32fc_deinterleave_imag_32f
+\li \subpage volk_32fc_deinterleave_real_32f
+\li \subpage volk_32fc_deinterleave_real_64f
+\li \subpage volk_32fc_index_max_16u
+\li \subpage volk_32fc_index_max_32u
+\li \subpage volk_32fc_magnitude_32f
+\li \subpage volk_32fc_magnitude_squared_32f
+\li \subpage volk_32f_cos_32f
+\li \subpage volk_32fc_s32f_deinterleave_real_16i
+\li \subpage volk_32fc_s32f_magnitude_16i
+\li \subpage volk_32fc_s32f_power_32fc
+\li \subpage volk_32fc_s32f_power_spectrum_32f
+\li \subpage volk_32fc_s32f_x2_power_spectral_density_32f
+\li \subpage volk_32fc_x2_multiply_32fc
+\li \subpage volk_32fc_x2_multiply_conjugate_32fc
+\li \subpage volk_32fc_x2_s32f_square_dist_scalar_mult_32f
+\li \subpage volk_32fc_x2_square_dist_32f
+\li \subpage volk_32f_expfast_32f
+\li \subpage volk_32f_index_max_16u
+\li \subpage volk_32f_index_max_32u
+\li \subpage volk_32f_invsqrt_32f
+\li \subpage volk_32f_log2_32f
+\li \subpage volk_32f_s32f_calc_spectral_noise_floor_32f
+\li \subpage volk_32f_s32f_convert_16i
+\li \subpage volk_32f_s32f_convert_32i
+\li \subpage volk_32f_s32f_convert_8i
+\li \subpage volk_32f_s32f_multiply_32f
+\li \subpage volk_32f_s32f_power_32f
+\li \subpage volk_32f_sin_32f
+\li \subpage volk_32f_sqrt_32f
+\li \subpage volk_32f_stddev_and_mean_32f_x2
+\li \subpage volk_32f_tan_32f
+\li \subpage volk_32f_tanh_32f
+\li \subpage volk_32f_x2_add_32f
+\li \subpage volk_32f_x2_divide_32f
+\li \subpage volk_32f_x2_interleave_32fc
+\li \subpage volk_32f_x2_max_32f
+\li \subpage volk_32f_x2_min_32f
+\li \subpage volk_32f_x2_multiply_32f
+\li \subpage volk_32f_x2_pow_32f
+\li \subpage volk_32f_x2_s32f_interleave_16ic
+\li \subpage volk_32f_x2_subtract_32f
+\li \subpage volk_32f_x3_sum_of_poly_32f
+\li \subpage volk_32i_s32f_convert_32f
+\li \subpage volk_32i_x2_and_32i
+\li \subpage volk_32i_x2_or_32i
+\li \subpage volk_32u_popcnt
+\li \subpage volk_64f_x2_max_64f
+\li \subpage volk_64f_x2_min_64f
+\li \subpage volk_64u_popcnt
+\li \subpage volk_8ic_deinterleave_16i_x2
+\li \subpage volk_8ic_deinterleave_real_16i
+\li \subpage volk_8ic_deinterleave_real_8i
+\li \subpage volk_8i_convert_16i
+\li \subpage volk_8ic_s32f_deinterleave_32f_x2
+\li \subpage volk_8ic_s32f_deinterleave_real_32f
+\li \subpage volk_8i_s32f_convert_32f
+\li \subpage volk_8u_x4_conv_k7_r2_8u
+
+*/
--- /dev/null
+/*! \mainpage VOLK
+
+Welcome to VOLK!
+
+VOLK is the Vector-Optimized Library of Kernels. It is a library that contains
+kernels of hand-written SIMD code for different mathematical operations. Since
+each SIMD architecture can be very different and no compiler has yet come along
+to handle vectorization properly or highly efficiently, VOLK approaches the
+problem differently.
+
+For each architecture or platform that a developer wishes to vectorize for, a
+new proto-kernel is added to VOLK. At runtime, VOLK will select the correct
+proto-kernel. In this way, users of VOLK call a single kernel to perform the
+operation, and that call is platform/architecture agnostic. This allows us to
+write portable SIMD code.
+
+*/
--- /dev/null
+/*! \page concepts_terms_and_techniques Concepts, Terms, and Techniques
+
+This page is primarily a list of definitions and a brief overview of successful
+techniques previously used to develop VOLK protokernels.
+
+## Definitions and Concepts
+
+### SIMD
+
+SIMD stands for Single Instruction Multiple Data. Leveraging SIMD instructions
+is the primary optimization in VOLK.
+
+### Architecture
+
+A VOLK architecture is what is more commonly called an Instruction Set
+Architecture (ISA). The architectures we target in VOLK usually provide SIMD
+instructions.
+
+### Vector
+
+A vector in VOLK is the same as a C array. It sometimes, but not always,
+coincides with the mathematical definition of a vector.
+
+### Kernel
+
+The 'kernel' part of the VOLK name comes from the high performance computing
+use of the word. In this context it is the inner loop of a vector operation.
+Since we don't use the word vector in the math sense, a vector operation is an
+operation performed on a C array.
+
+### Protokernel
+
+A protokernel is an implementation of a kernel. Every kernel has a 'generic'
+protokernel that is implemented in C. Other protokernels are optimized for a
+particular architecture.
+
+
+## Techniques
+
+### New Kernels
+
+Add new kernels to the list in lib/kernel_tests.h. This adds the kernel to
+VOLK's QA tool as well as the volk profiler. Many kernels are able to
+share test parameters, but new kernels might need new ones.
+
+If the VOLK kernel does not 'fit' the standard set of function parameters
+expected by the volk_profile structure, you need to create a VOLK puppet
+function to help the profiler call the kernel. This is essentially because the
+function run_volk_tests can only test a limited number of function prototypes.
+
+### Protokernels
+
+Adding new proto-kernels (implementations of VOLK kernels for specific
+architectures) is relatively easy. In the relevant kernel header,
+volk/include/volk/volk_<input-fingerprint_function-name_output-fingerprint>.h,
+add a new #ifdef/#endif block for the LV_HAVE_<arch> symbol corresponding
+to the <arch> you are working on (e.g. SSE, AVX, NEON, etc.).
+
+For example, for volk_32f_s32f_multiply_32f_u_neon:
+
+\code
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+ \brief Scalar float multiply
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param scalar the scalar value
+ \param num_points The number of values in aVector to be multiplied by the scalar and stored into cVector
+*/
+static inline void volk_32f_s32f_multiply_32f_u_neon(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const float* inputPtr = aVector;
+ float* outputPtr = cVector;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float32x4_t aVal, cVal;
+
+ for(number = 0; number < quarterPoints; number++){
+ aVal = vld1q_f32(inputPtr); // Load 4 floats into NEON regs
+ cVal = vmulq_n_f32 (aVal, scalar); // Do the multiply
+ vst1q_f32(outputPtr, cVal); // Store results back to output
+ inputPtr += 4;
+ outputPtr += 4;
+ }
+ for(number = quarterPoints * 4; number < num_points; number++){
+ *outputPtr++ = (*inputPtr++) * scalar;
+ }
+}
+#endif /* LV_HAVE_NEON */
+\endcode
+
+### Allocating Memory
+
+SIMD code can be very sensitive to the alignment of the vectors, which is
+generally something like a 16-byte or 32-byte alignment requirement. The
+VOLK dispatcher functions, which are what we normally call as users of
+VOLK, make sure that the correct aligned or unaligned version is called
+depending on the alignment of the vectors passed to them. However, things typically
+work faster and more efficiently when the vectors are aligned. As such, VOLK
+has memory allocate and free methods to provide us with properly aligned
+vectors. We can also ask VOLK to give us the current machine's alignment
+requirement, which makes our job even easier when porting code.
+
+To get the machine's alignment, simply call size_t volk_get_alignment().
+
+Allocate memory using void* volk_malloc(size_t size, size_t alignment).
+
+Make sure that any memory allocated by VOLK is also freed by VOLK with volk_free(void *p).
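+
+Putting these together, a minimal sketch looks like:
+
+\code
+#include <volk/volk.h>
+
+unsigned int num_points = 1024;
+size_t alignment = volk_get_alignment();
+
+// allocate two vectors aligned for the current machine
+float* in  = (float*)volk_malloc(sizeof(float) * num_points, alignment);
+float* out = (float*)volk_malloc(sizeof(float) * num_points, alignment);
+
+// ... fill 'in' with data ...
+
+volk_32f_s32f_multiply_32f(out, in, 5.0f, num_points);
+
+volk_free(in);
+volk_free(out);
+\endcode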
+
+
+*/
--- /dev/null
+/*! \page using_volk Using VOLK
+
+Using VOLK in your code requires proper linking and including the correct headers. VOLK currently supports both C and C++ bindings.
+
+VOLK provides both a pkgconfig file and a CMake module to help with configuration
+and linking. The pkgconfig file is installed to
+$install_prefix/lib/pkgconfig/volk.pc. The CMake configuration module is in
+$install_prefix/lib/cmake/volk/VolkConfig.cmake.
+
+The VOLK include directory (includedir in pkgconfig, VOLK_INCLUDE_DIRS in the
+CMake module) contains the header volk/volk.h, which defines all of the symbols
+exposed by VOLK. Alternatively, individual kernel headers are available in the
+same location.
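+
+For example, either of the following includes will work (the single-kernel
+header shown is just one of the kernels listed on the kernels page):
+
+\code
+#include <volk/volk.h>                 // the full VOLK API and dispatchers
+#include <volk/volk_32f_x2_add_32f.h>  // or a single kernel's header
+\endcode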
+
+In most cases it is sufficient to call the dispatcher for the kernel you are using.
+For example, the following code will compute the dot product between the taps and
+the input vector as part of a FIR filter.
+\code
+// assume there is sufficient history in the input buffer
+for(unsigned int ii=0; ii < input_length-ntaps; ++ii) {
+ volk_32fc_32f_dot_prod_32fc(&filter_output[ii], &input[ii-ntaps], taps, ntaps);
+}
+\endcode
+
+*/
+
--- /dev/null
+<!-- archs appear in order of significance for blind, de-facto version ordering -->
+<grammar>
+
+<arch name="generic"> <!-- name is required-->
+</arch>
+
+<arch name="softfp">
+ <flag compiler="gnu">-mfloat-abi=softfp</flag>
+</arch>
+
+<arch name="hardfp">
+ <flag compiler="gnu">-mfloat-abi=hard</flag>
+</arch>
+
+<arch name="neon">
+ <flag compiler="gnu">-mfpu=neon</flag>
+ <flag compiler="gnu">-funsafe-math-optimizations</flag>
+ <alignment>16</alignment>
+ <check name="has_neon"></check>
+</arch>
+
+<arch name="32">
+ <flag compiler="gnu">-m32</flag>
+</arch>
+
+<arch name="64">
+ <check name="check_extended_cpuid">
+ <param>0x80000001</param>
+ </check>
+ <check name="cpuid_x86_bit"> <!-- checks to see if a bit is set -->
+ <param>3</param> <!-- eax, ebx, ecx, [edx] -->
+ <param>0x80000001</param> <!-- cpuid operation -->
+ <param>29</param> <!-- bit shift -->
+ </check>
+ <flag compiler="gnu">-m64</flag>
+ <flag compiler="clang">-m64</flag>
+</arch>
+
+<arch name="3dnow">
+ <check name="cpuid_x86_bit">
+ <param>3</param>
+ <param>0x80000001</param>
+ <param>31</param>
+ </check>
+ <flag compiler="gnu">-m3dnow</flag>
+ <flag compiler="clang">-m3dnow</flag>
+ <alignment>8</alignment>
+</arch>
+
+<arch name="abm">
+ <check name="cpuid_x86_bit">
+ <param>3</param>
+ <param>0x80000001</param>
+ <param>5</param>
+ </check>
+ <flag compiler="gnu">-msse4.2</flag>
+ <flag compiler="clang">-msse4.2</flag>
+ <alignment>16</alignment>
+</arch>
+
+<arch name="popcount">
+ <check name="cpuid_x86_bit">
+ <param>2</param>
+ <param>0x00000001</param>
+ <param>23</param>
+ </check>
+ <flag compiler="gnu">-mpopcnt</flag>
+ <flag compiler="clang">-mpopcnt</flag>
+ <flag compiler="msvc">/arch:AVX</flag>
+</arch>
+
+<arch name="mmx">
+ <check name="cpuid_x86_bit">
+ <param>3</param>
+ <param>0x00000001</param>
+ <param>23</param>
+ </check>
+ <flag compiler="gnu">-mmmx</flag>
+ <flag compiler="clang">-mmmx</flag>
+ <flag compiler="msvc">/arch:SSE</flag>
+ <alignment>8</alignment>
+</arch>
+
+<arch name="fma">
+ <check name="cpuid_x86_bit">
+ <param>2</param>
+ <param>0x00000001</param>
+ <param>12</param>
+ </check>
+ <flag compiler="gnu">-mfma</flag>
+ <flag compiler="msvc">/arch:AVX2</flag>
+ <alignment>32</alignment>
+</arch>
+
+<arch name="sse">
+ <check name="cpuid_x86_bit">
+ <param>3</param>
+ <param>0x00000001</param>
+ <param>25</param>
+ </check>
+ <flag compiler="gnu">-msse</flag>
+ <flag compiler="clang">-msse</flag>
+ <flag compiler="msvc">/arch:SSE</flag>
+ <environment>_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);</environment>
+ <include>xmmintrin.h</include>
+ <alignment>16</alignment>
+</arch>
+
+<arch name="sse2">
+ <check name="cpuid_x86_bit">
+ <param>3</param>
+ <param>0x00000001</param>
+ <param>26</param>
+ </check>
+ <flag compiler="gnu">-msse2</flag>
+ <flag compiler="clang">-msse2</flag>
+ <flag compiler="msvc">/arch:SSE2</flag>
+ <alignment>16</alignment>
+</arch>
+
+<arch name="orc">
+</arch>
+
+<!-- norc exists so that the orc implementation can be overruled. -->
+<arch name="norc">
+</arch>
+
+<arch name="sse3">
+ <check name="cpuid_x86_bit">
+ <param>2</param>
+ <param>0x00000001</param>
+ <param>0</param>
+ </check>
+ <flag compiler="gnu">-msse3</flag>
+ <flag compiler="clang">-msse3</flag>
+ <flag compiler="msvc">/arch:AVX</flag>
+ <environment>_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);</environment>
+ <include>pmmintrin.h</include>
+ <alignment>16</alignment>
+</arch>
+
+<arch name="ssse3">
+ <check name="cpuid_x86_bit">
+ <param>2</param>
+ <param>0x00000001</param>
+ <param>9</param>
+ </check>
+ <flag compiler="gnu">-mssse3</flag>
+ <flag compiler="clang">-mssse3</flag>
+ <flag compiler="msvc">/arch:AVX</flag>
+ <alignment>16</alignment>
+</arch>
+
+<arch name="sse4_a">
+ <check name="cpuid_x86_bit">
+ <param>2</param>
+ <param>0x80000001</param>
+ <param>6</param>
+ </check>
+ <flag compiler="gnu">-msse4a</flag>
+ <flag compiler="clang">-msse4a</flag>
+ <alignment>16</alignment>
+</arch>
+
+<arch name="sse4_1">
+ <check name="cpuid_x86_bit">
+ <param>2</param>
+ <param>0x00000001</param>
+ <param>19</param>
+ </check>
+ <flag compiler="gnu">-msse4.1</flag>
+ <flag compiler="clang">-msse4.1</flag>
+ <flag compiler="msvc">/arch:AVX</flag>
+ <alignment>16</alignment>
+</arch>
+
+<arch name="sse4_2">
+ <check name="cpuid_x86_bit">
+ <param>2</param>
+ <param>0x00000001</param>
+ <param>20</param>
+ </check>
+ <flag compiler="gnu">-msse4.2</flag>
+ <flag compiler="clang">-msse4.2</flag>
+ <flag compiler="msvc">/arch:AVX</flag>
+ <alignment>16</alignment>
+</arch>
+
+<arch name="avx">
+ <check name="cpuid_x86_bit">
+ <param>2</param>
+ <param>0x00000001</param>
+ <param>28</param>
+ </check>
+ <!-- check to make sure that xgetbv is enabled in OS -->
+ <check name="cpuid_x86_bit">
+ <param>2</param>
+ <param>0x00000001</param>
+ <param>27</param>
+ </check>
+ <!-- check to see that the OS has enabled AVX -->
+ <check name="get_avx_enabled"></check>
+ <flag compiler="gnu">-mavx</flag>
+ <flag compiler="clang">-mavx</flag>
+ <flag compiler="msvc">/arch:AVX</flag>
+ <alignment>32</alignment>
+</arch>
+
+<arch name="avx2">
+ <check name="cpuid_count_x86_bit">
+ <param>7</param>
+ <param>0</param>
+ <param>1</param>
+ <param>5</param>
+ </check>
+ <!-- check to make sure that xgetbv is enabled in OS -->
+ <check name="cpuid_x86_bit">
+ <param>2</param>
+ <param>0x00000001</param>
+ <param>27</param>
+ </check>
+ <!-- check to see that the OS has enabled AVX2 -->
+ <check name="get_avx2_enabled"></check>
+ <flag compiler="gnu">-mavx2</flag>
+ <flag compiler="msvc">/arch:AVX2</flag>
+ <alignment>32</alignment>
+</arch>
+
+</grammar>
--- /dev/null
+<grammar>
+
+<machine name="generic">
+<archs>generic orc|</archs>
+</machine>
+
+<machine name="neon">
+<archs>generic neon softfp|hardfp orc|</archs>
+</machine>
+
+<!-- trailing | bar means generate without either for MSVC -->
+<machine name="sse2">
+<archs>generic 32|64| mmx| sse sse2 orc|</archs>
+</machine>
+
+<machine name="sse3">
+<archs>generic 32|64| mmx| sse sse2 sse3 orc|</archs>
+</machine>
+
+<machine name="ssse3">
+<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 orc|</archs>
+</machine>
+
+<machine name="sse4_a">
+<archs>generic 32|64| mmx| sse sse2 sse3 sse4_a popcount orc|</archs>
+</machine>
+
+<machine name="sse4_1">
+<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 orc|</archs>
+</machine>
+
+<machine name="sse4_2">
+<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount orc|</archs>
+</machine>
+
+<!-- trailing | bar means generate without either for MSVC -->
+<machine name="avx">
+<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx orc|</archs>
+</machine>
+
+<!-- trailing | bar means generate without either for MSVC -->
+<machine name="avx2">
+<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 orc|</archs>
+</machine>
+
+</grammar>
--- /dev/null
+#
+# Copyright 2012 Free Software Foundation, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+archs = list()
+arch_dict = dict()
+
+class arch_class:
+ def __init__(self, flags, checks, **kwargs):
+ for key, cast, failval in (
+ ('name', str, None),
+ ('environment', str, None),
+ ('include', str, None),
+ ('alignment', int, 1)
+ ):
+ try: setattr(self, key, cast(kwargs[key]))
+ except: setattr(self, key, failval)
+ self.checks = checks
+ assert(self.name)
+ self._flags = flags
+
+ def is_supported(self, compiler):
+ if not self._flags.keys(): return True
+ return compiler in self._flags.keys()
+
+ def get_flags(self, compiler):
+ try: return self._flags[compiler]
+ except KeyError: return list()
+
+ def __repr__(self): return self.name
+
+def register_arch(**kwargs):
+ arch = arch_class(**kwargs)
+ archs.append(arch)
+ arch_dict[arch.name] = arch
+
+########################################################################
+# register the arches
+########################################################################
+#TODO skip the XML and put it here
+from xml.dom import minidom
+import os
+gendir = os.path.dirname(__file__)
+archs_xml = minidom.parse(os.path.join(gendir, 'archs.xml')).getElementsByTagName('arch')
+for arch_xml in archs_xml:
+ kwargs = dict()
+ for attr in arch_xml.attributes.keys():
+ kwargs[attr] = arch_xml.attributes[attr].value
+ for node in arch_xml.childNodes:
+ try:
+ name = node.tagName
+ val = arch_xml.getElementsByTagName(name)[0].firstChild.data
+ kwargs[name] = val
+ except: pass
+ checks = list()
+ for check_xml in arch_xml.getElementsByTagName("check"):
+ name = check_xml.attributes["name"].value
+ params = list()
+ for param_xml in check_xml.getElementsByTagName("param"):
+ params.append(param_xml.firstChild.data)
+ checks.append([name, params])
+ flags = dict()
+ for flag_xml in arch_xml.getElementsByTagName("flag"):
+ name = flag_xml.attributes["compiler"].value
+ if not flags.has_key(name): flags[name] = list()
+ flags[name].append(flag_xml.firstChild.data)
+ #force kwargs keys to be of type str, not unicode for py25
+ kwargs = dict((str(k), v) for k, v in kwargs.iteritems())
+ register_arch(flags=flags, checks=checks, **kwargs)
+
+if __name__ == '__main__':
+ print archs
--- /dev/null
+#!/usr/bin/env python
+#
+# Copyright 2012 Free Software Foundation, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+import optparse
+import volk_arch_defs
+import volk_machine_defs
+
+def do_arch_flags_list(compiler):
+ output = list()
+ for arch in volk_arch_defs.archs:
+ if not arch.is_supported(compiler): continue
+ fields = [arch.name] + arch.get_flags(compiler)
+ output.append(','.join(fields))
+ print ';'.join(output)
+
+def do_machines_list(arch_names):
+ output = list()
+ for machine in volk_machine_defs.machines:
+ machine_arch_set = set(machine.arch_names)
+ if set(arch_names).intersection(machine_arch_set) == machine_arch_set:
+ output.append(machine.name)
+ print ';'.join(output)
+
+def do_machine_flags_list(compiler, machine_name):
+ output = list()
+ machine = volk_machine_defs.machine_dict[machine_name]
+ for arch in machine.archs:
+ output.extend(arch.get_flags(compiler))
+ print ' '.join(output)
+
+def main():
+ parser = optparse.OptionParser()
+ parser.add_option('--mode', type='string')
+ parser.add_option('--compiler', type='string')
+ parser.add_option('--archs', type='string')
+ parser.add_option('--machine', type='string')
+ (opts, args) = parser.parse_args()
+
+ if opts.mode == 'arch_flags': return do_arch_flags_list(opts.compiler.lower())
+ if opts.mode == 'machines': return do_machines_list(opts.archs.split(';'))
+ if opts.mode == 'machine_flags': return do_machine_flags_list(opts.compiler.lower(), opts.machine)
+
+if __name__ == '__main__': main()
--- /dev/null
+#
+# Copyright 2011-2012 Free Software Foundation, Inc.
+#
+# This file is part of GNU Radio
+#
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING. If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+#
+
+import os
+import re
+import sys
+import glob
+
+########################################################################
+# Strip comments from a c/cpp file.
+# Input is code string, output is code string without comments.
+# http://stackoverflow.com/questions/241327/python-snippet-to-remove-c-and-c-comments
+########################################################################
+def comment_remover(text):
+ def replacer(match):
+ s = match.group(0)
+ if s.startswith('/'):
+ return ""
+ else:
+ return s
+ pattern = re.compile(
+ r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
+ re.DOTALL | re.MULTILINE
+ )
+ return re.sub(pattern, replacer, text)
+
+########################################################################
+# Split code into nested sections according to ifdef preprocessor macros
+########################################################################
+def split_into_nested_ifdef_sections(code):
+ sections = list()
+ section = ''
+ header = 'text'
+ in_section_depth = 0
+ for i, line in enumerate(code.splitlines()):
+ m = re.match('^(\s*)#(\s*)(\w+)(.*)$', line)
+ line_is = 'normal'
+ if m:
+ p0, p1, fcn, stuff = m.groups()
+ if fcn in ('if', 'ifndef', 'ifdef'): line_is = 'if'
+ if fcn in ('else', 'elif'): line_is = 'else'
+ if fcn in ('endif',): line_is = 'end'
+
+ if line_is == 'if': in_section_depth += 1
+ if line_is == 'end': in_section_depth -= 1
+
+ if in_section_depth == 1 and line_is == 'if':
+ sections.append((header, section))
+ section = ''
+ header = line
+ continue
+
+ if in_section_depth == 1 and line_is == 'else':
+ sections.append((header, section))
+ section = ''
+ header = line
+ continue
+
+ if in_section_depth == 0 and line_is == 'end':
+ sections.append((header, section))
+ section = ''
+ header = 'text'
+ continue
+
+ section += line + '\n'
+
+ sections.append((header, section)) #and pack remainder into sections
+ sections = [sec for sec in sections if sec[1].strip()] #filter empty sections
+
+ #recurse into non-text sections to fill subsections
+ for i, (header, section) in enumerate(sections):
+ if header == 'text': continue
+ sections[i] = (header, split_into_nested_ifdef_sections(section))
+
+ return sections
+
+########################################################################
+# Recursive print of sections to test code above
+########################################################################
+def print_sections(sections, indent = ' '):
+ for header, body in sections:
+ if header == 'text':
+ print indent, ('\n'+indent).join(body.splitlines())
+ continue
+ print indent.replace(' ', '-') + '>', header
+ print_sections(body, indent + ' ')
+
+########################################################################
+# Flatten a section to just body text
+########################################################################
+def flatten_section_text(sections):
+ output = ''
+ for hdr, bdy in sections:
+ if hdr != 'text': output += flatten_section_text(bdy)
+ else: output += bdy
+ return output
+
+########################################################################
+# Extract kernel info from section, represent as an implementation
+########################################################################
+class impl_class:
+ def __init__(self, kern_name, header, body):
+ #extract LV_HAVE_*
+ self.deps = set(map(str.lower, re.findall('LV_HAVE_(\w+)', header)))
+ #extract function suffix and args
+ body = flatten_section_text(body)
+ try:
+ fcn_matcher = re.compile('^.*(%s\\w*)\\s*\\((.*)$'%kern_name, re.DOTALL | re.MULTILINE)
+ body = body.split('{')[0].rsplit(')', 1)[0] #get the part before the open ){ bracket
+ m = fcn_matcher.match(body)
+ impl_name, the_rest = m.groups()
+ self.name = impl_name.replace(kern_name+'_', '')
+ self.args = list()
+ fcn_args = the_rest.split(',')
+ for fcn_arg in fcn_args:
+ arg_matcher = re.compile('^\s*(.*\\W)\s*(\w+)\s*$', re.DOTALL | re.MULTILINE)
+ m = arg_matcher.match(fcn_arg)
+ arg_type, arg_name = m.groups()
+ self.args.append((arg_type, arg_name))
+ except Exception as ex:
+ raise Exception, 'I can\'t parse the function prototype from: %s in %s\n%s'%(kern_name, body, ex)
+
+ assert self.name
+ self.is_aligned = self.name.startswith('a_')
+
+ def __repr__(self):
+ return self.name
+
+########################################################################
+# Get sets of LV_HAVE_* from the code
+########################################################################
+def extract_lv_haves(code):
+ haves = list()
+ for line in code.splitlines():
+ if not line.strip().startswith('#'): continue
+ have_set = set(map(str.lower, re.findall('LV_HAVE_(\w+)', line)))
+ if have_set: haves.append(have_set)
+ return haves
+
+########################################################################
+# Represent a processing kernel, parse from file
+########################################################################
+class kernel_class:
+ def __init__(self, kernel_file):
+ self.name = os.path.splitext(os.path.basename(kernel_file))[0]
+ self.pname = self.name.replace('volk_', 'p_')
+ code = open(kernel_file, 'r').read()
+ code = comment_remover(code)
+ sections = split_into_nested_ifdef_sections(code)
+ self._impls = list()
+ for header, section in sections:
+ if 'ifndef' not in header.lower(): continue
+ for sub_hdr, body in section:
+ if 'if' not in sub_hdr.lower(): continue
+ if 'LV_HAVE_' not in sub_hdr: continue
+ self._impls.append(impl_class(
+ kern_name=self.name, header=sub_hdr, body=body,
+ ))
+ assert(self._impls)
+ self.has_dispatcher = False
+ for impl in self._impls:
+ if impl.name == 'dispatcher':
+ self._impls.remove(impl)
+ self.has_dispatcher = True
+ break
+ self.args = self._impls[0].args
+ self.arglist_types = ', '.join([a[0] for a in self.args])
+ self.arglist_full = ', '.join(['%s %s'%a for a in self.args])
+ self.arglist_names = ', '.join([a[1] for a in self.args])
+
+ def get_impls(self, archs):
+ archs = set(archs)
+ impls = list()
+ for impl in self._impls:
+ if impl.deps.intersection(archs) == impl.deps:
+ impls.append(impl)
+ return impls
+
+ def __repr__(self):
+ return self.name
+
+########################################################################
+# Extract information from the VOLK kernels
+########################################################################
+__file__ = os.path.abspath(__file__)
+srcdir = os.path.dirname(os.path.dirname(__file__))
+kernel_files = glob.glob(os.path.join(srcdir, "kernels", "volk", "*.h"))
+kernels = map(kernel_class, kernel_files)
+
+if __name__ == '__main__':
+ print kernels
--- /dev/null
+#
+# Copyright 2012 Free Software Foundation, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+from volk_arch_defs import arch_dict
+
+machines = list()
+machine_dict = dict()
+
+class machine_class:
+ def __init__(self, name, archs):
+ self.name = name
+ self.archs = list()
+ self.arch_names = list()
+ for arch_name in archs:
+ if not arch_name: continue
+ arch = arch_dict[arch_name]
+ self.archs.append(arch)
+ self.arch_names.append(arch_name)
+ self.alignment = max(map(lambda a: a.alignment, self.archs))
+
+ def __repr__(self): return self.name
+
+def register_machine(name, archs):
+ for i, arch_name in enumerate(archs):
+ if '|' in arch_name: #handle special arch names with the '|'
+ for arch_sub in arch_name.split('|'):
+ if arch_sub:
+ register_machine(name+'_'+arch_sub, archs[:i] + [arch_sub] + archs[i+1:])
+ else:
+ register_machine(name, archs[:i] + archs[i+1:])
+ return
+ machine = machine_class(name=name, archs=archs)
+ machines.append(machine)
+ machine_dict[machine.name] = machine
+
+########################################################################
+# register the machines
+########################################################################
+#TODO skip the XML and put it here
+from xml.dom import minidom
+import os
+gendir = os.path.dirname(__file__)
+machines_xml = minidom.parse(os.path.join(gendir, 'machines.xml')).getElementsByTagName('machine')
+for machine_xml in machines_xml:
+ kwargs = dict()
+ for attr in machine_xml.attributes.keys():
+ kwargs[attr] = machine_xml.attributes[attr].value
+ for node in machine_xml.childNodes:
+ try:
+ name = node.tagName
+ val = machine_xml.getElementsByTagName(name)[0].firstChild.data
+ kwargs[name] = val
+ except: pass
+ kwargs['archs'] = kwargs['archs'].split()
+ #force kwargs keys to be of type str, not unicode for py25
+ kwargs = dict((str(k), v) for k, v in kwargs.iteritems())
+ register_machine(**kwargs)
+
+if __name__ == '__main__':
+ print machines
--- /dev/null
+#!/usr/bin/env python
+#
+# Copyright 2012 Free Software Foundation, Inc.
+#
+# This file is part of GNU Radio
+#
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING. If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+#
+
+import os
+import re
+import sys
+import optparse
+import volk_arch_defs
+import volk_machine_defs
+import volk_kernel_defs
+from Cheetah import Template
+
+def __escape_pre_processor(code):
+ out = list()
+ for line in code.splitlines():
+ m = re.match('^(\s*)#(\s*)(\w+)(.*)$', line)
+ if m:
+ p0, p1, fcn, stuff = m.groups()
+ conly = fcn in ('include', 'define', 'ifdef', 'ifndef', 'endif', 'elif', 'pragma')
+ both = fcn in ('if', 'else')
+ istmpl = '$' in stuff
+ if 'defined' in stuff: istmpl = False
+ if conly or (both and not istmpl):
+ line = '%s\\#%s%s%s'%(p0, p1, fcn, stuff)
+ out.append(line)
+ return '\n'.join(out)
+
+def __parse_tmpl(_tmpl, **kwargs):
+ defs = {
+ 'archs': volk_arch_defs.archs,
+ 'arch_dict': volk_arch_defs.arch_dict,
+ 'machines': volk_machine_defs.machines,
+ 'machine_dict': volk_machine_defs.machine_dict,
+ 'kernels': volk_kernel_defs.kernels,
+ }
+ defs.update(kwargs)
+ _tmpl = __escape_pre_processor(_tmpl)
+ _tmpl = """
+
+/* this file was generated by volk template utils, do not edit! */
+
+""" + _tmpl
+ return str(Template.Template(_tmpl, defs))
+
+def main():
+ parser = optparse.OptionParser()
+ parser.add_option('--input', type='string')
+ parser.add_option('--output', type='string')
+ (opts, args) = parser.parse_args()
+
+ output = __parse_tmpl(open(opts.input).read(), args=args)
+ if opts.output: open(opts.output, 'w').write(output)
+ else: print output
+
+if __name__ == '__main__': main()
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2006,2009,2013 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef INCLUDED_VOLK_CONSTANTS_H
+#define INCLUDED_VOLK_CONSTANTS_H
+
+#include <volk/volk_common.h>
+
+__VOLK_DECL_BEGIN
+
+VOLK_API char* volk_prefix();
+VOLK_API char* volk_version();
+VOLK_API char* volk_c_compiler();
+VOLK_API char* volk_compiler_flags();
+VOLK_API char* volk_available_machines();
+
+__VOLK_DECL_END
+
+#endif /* INCLUDED_VOLK_CONSTANTS_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2016 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+
+#ifndef INCLUDED_volk_saturation_arithmetic_H_
+#define INCLUDED_volk_saturation_arithmetic_H_
+
+#include <limits.h>
+
+static inline int16_t sat_adds16i(int16_t x, int16_t y)
+{
+ int32_t res = (int32_t) x + (int32_t) y;
+
+ if (res < SHRT_MIN) res = SHRT_MIN;
+ if (res > SHRT_MAX) res = SHRT_MAX;
+
+ return res;
+}
+
+static inline int16_t sat_muls16i(int16_t x, int16_t y)
+{
+ int32_t res = (int32_t) x * (int32_t) y;
+
+ if (res < SHRT_MIN) res = SHRT_MIN;
+ if (res > SHRT_MAX) res = SHRT_MAX;
+
+ return res;
+}
+
+#endif /* INCLUDED_volk_saturation_arithmetic_H_ */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2015 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*
+ * This file is intended to hold AVX intrinsics of intrinsics.
+ * They should be used in VOLK kernels to avoid copy-pasta.
+ */
+
+#ifndef INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
+#define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
+#include <immintrin.h>
+
+static inline __m256
+_mm256_complexmul_ps(__m256 x, __m256 y)
+{
+ __m256 yl, yh, tmp1, tmp2;
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ...
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ...
+ tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
+ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ...
+ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+ return _mm256_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+}
+
+static inline __m256
+_mm256_conjugate_ps(__m256 x){
+ const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
+ return _mm256_xor_ps(x, conjugator); // conjugate y
+}
+
+static inline __m256
+_mm256_complexconjugatemul_ps(__m256 x, __m256 y){
+ y = _mm256_conjugate_ps(y);
+ return _mm256_complexmul_ps(x, y);
+}
+
+static inline __m256
+_mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2){
+ __m256 complex1, complex2;
+ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+ return _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values
+}
+
+static inline __m256
+_mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2){
+ return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2));
+}
+
+#endif /* INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ */
--- /dev/null
+#ifndef INCLUDED_LIBVOLK_COMMON_H
+#define INCLUDED_LIBVOLK_COMMON_H
+
+////////////////////////////////////////////////////////////////////////
+// Cross-platform attribute macros
+////////////////////////////////////////////////////////////////////////
+#if defined __GNUC__
+# define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
+# define __VOLK_ATTR_UNUSED __attribute__((unused))
+# define __VOLK_ATTR_INLINE __attribute__((always_inline))
+# define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
+# if __GNUC__ >= 4
+# define __VOLK_ATTR_EXPORT __attribute__((visibility("default")))
+# define __VOLK_ATTR_IMPORT __attribute__((visibility("default")))
+# else
+# define __VOLK_ATTR_EXPORT
+# define __VOLK_ATTR_IMPORT
+# endif
+# define __VOLK_PREFETCH(addr) __builtin_prefetch(addr)
+#elif _MSC_VER
+# define __VOLK_ATTR_ALIGNED(x) __declspec(align(x))
+# define __VOLK_ATTR_UNUSED
+# define __VOLK_ATTR_INLINE __forceinline
+# define __VOLK_ATTR_DEPRECATED __declspec(deprecated)
+# define __VOLK_ATTR_EXPORT __declspec(dllexport)
+# define __VOLK_ATTR_IMPORT __declspec(dllimport)
+# define __VOLK_PREFETCH(addr)
+#else
+# define __VOLK_ATTR_ALIGNED(x)
+# define __VOLK_ATTR_UNUSED
+# define __VOLK_ATTR_INLINE
+# define __VOLK_ATTR_DEPRECATED
+# define __VOLK_ATTR_EXPORT
+# define __VOLK_ATTR_IMPORT
+# define __VOLK_PREFETCH(addr)
+#endif
+
+////////////////////////////////////////////////////////////////////////
+// Ignore annoying warnings in MSVC
+////////////////////////////////////////////////////////////////////////
+#if defined(_MSC_VER)
+# pragma warning(disable: 4244) //'conversion' conversion from 'type1' to 'type2', possible loss of data
+# pragma warning(disable: 4305) //'identifier' : truncation from 'type1' to 'type2'
+#endif
+
+////////////////////////////////////////////////////////////////////////
+// C-linkage declaration macros
+// FIXME: due to the usage of complex.h, require gcc for c-linkage
+////////////////////////////////////////////////////////////////////////
+#if defined(__cplusplus) && (__GNUC__)
+# define __VOLK_DECL_BEGIN extern "C" {
+# define __VOLK_DECL_END }
+#else
+# define __VOLK_DECL_BEGIN
+# define __VOLK_DECL_END
+#endif
+
+////////////////////////////////////////////////////////////////////////
+// Define VOLK_API for library symbols
+// http://gcc.gnu.org/wiki/Visibility
+////////////////////////////////////////////////////////////////////////
+#ifdef volk_EXPORTS
+# define VOLK_API __VOLK_ATTR_EXPORT
+#else
+# define VOLK_API __VOLK_ATTR_IMPORT
+#endif
+
+////////////////////////////////////////////////////////////////////////
+// The bit128 union used by some
+////////////////////////////////////////////////////////////////////////
+#include <inttypes.h>
+
+#ifdef LV_HAVE_SSE
+#ifdef _WIN32
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+#endif
+
+union bit128{
+ uint8_t i8[16];
+ uint16_t i16[8];
+ uint32_t i[4];
+ float f[4];
+ double d[2];
+
+ #ifdef LV_HAVE_SSE
+ __m128 float_vec;
+ #endif
+
+ #ifdef LV_HAVE_SSE2
+ __m128i int_vec;
+ __m128d double_vec;
+ #endif
+};
+
+union bit256{
+ uint8_t i8[32];
+ uint16_t i16[16];
+ uint32_t i[8];
+ float f[8];
+ double d[4];
+
+ #ifdef LV_HAVE_AVX
+ __m256 float_vec;
+ __m256i int_vec;
+ __m256d double_vec;
+ #endif
+};
+
+#define bit128_p(x) ((union bit128 *)(x))
+#define bit256_p(x) ((union bit256 *)(x))
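+
+// Typical use (a sketch, assuming LV_HAVE_SSE): read individual lanes of a
+// SIMD register back as scalars, e.g.
+//
+//   union bit128 tmp;
+//   tmp.float_vec = some_m128_value;
+//   float first_lane = tmp.f[0];
+//
+// The bit128_p/bit256_p macros do the same for data already in memory:
+// bit128_p(ptr)->i[2] reads the third 32-bit word behind ptr.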
+
+#endif /*INCLUDED_LIBVOLK_COMMON_H*/
--- /dev/null
+#ifndef INCLUDED_VOLK_COMPLEX_H
+#define INCLUDED_VOLK_COMPLEX_H
+
+/*!
+ * \brief Provide typedefs and operators for all complex types in C and C++.
+ *
+ * The typedefs encompass all signed integer and floating point types.
+ * Each operator function is intended to work across all data types.
+ * Under C++, these operators are defined as inline templates.
+ * Under C, these operators are defined as preprocessor macros.
+ * The use of macros makes the operators agnostic to the type.
+ *
+ * The following operator functions are defined:
+ * - lv_cmake - make a complex type from components
+ * - lv_creal - get the real part of the complex number
+ * - lv_cimag - get the imaginary part of the complex number
+ * - lv_conj - take the conjugate of the complex number
+ */
+
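+/*
+ * A minimal usage sketch (illustrative only):
+ *
+ *   lv_32fc_t z = lv_cmake(3.0f, -4.0f); // z = 3 - 4i
+ *   float re = lv_creal(z);              // 3.0f
+ *   float im = lv_cimag(z);              // -4.0f
+ *   lv_32fc_t zc = lv_conj(z);           // 3 + 4i
+ */
+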
+#ifdef __cplusplus
+
+#include <complex>
+#include <stdint.h>
+
+typedef std::complex<int8_t> lv_8sc_t;
+typedef std::complex<int16_t> lv_16sc_t;
+typedef std::complex<int32_t> lv_32sc_t;
+typedef std::complex<int64_t> lv_64sc_t;
+typedef std::complex<float> lv_32fc_t;
+typedef std::complex<double> lv_64fc_t;
+
+template <typename T> inline std::complex<T> lv_cmake(const T &r, const T &i){
+ return std::complex<T>(r, i);
+}
+
+template <typename T> inline typename T::value_type lv_creal(const T &x){
+ return x.real();
+}
+
+template <typename T> inline typename T::value_type lv_cimag(const T &x){
+ return x.imag();
+}
+
+template <typename T> inline T lv_conj(const T &x){
+ return std::conj(x);
+}
+
+#else /* __cplusplus */
+
+#include <complex.h>
+
+typedef char complex lv_8sc_t;
+typedef short complex lv_16sc_t;
+typedef long complex lv_32sc_t;
+typedef long long complex lv_64sc_t;
+typedef float complex lv_32fc_t;
+typedef double complex lv_64fc_t;
+
+#define lv_cmake(r, i) ((r) + _Complex_I*(i))
+
+// When GNUC is available, use the complex extensions.
+// The extensions always return the correct value type.
+// http://gcc.gnu.org/onlinedocs/gcc/Complex.html
+#ifdef __GNUC__
+
+#define lv_creal(x) (__real__(x))
+
+#define lv_cimag(x) (__imag__(x))
+
+#define lv_conj(x) (~(x))
+
+// When not available, use the c99 complex function family,
+// which always returns double regardless of the input type.
+#else /* __GNUC__ */
+
+#define lv_creal(x) (creal(x))
+
+#define lv_cimag(x) (cimag(x))
+
+#define lv_conj(x) (conj(x))
+
+#endif /* __GNUC__ */
+
+#endif /* __cplusplus */
+
+#endif /* INCLUDE_VOLK_COMPLEX_H */
--- /dev/null
+/* -*- c -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef INCLUDED_VOLK_MALLOC_H
+#define INCLUDED_VOLK_MALLOC_H
+
+#include <volk/volk_common.h>
+#include <stdlib.h>
+
+__VOLK_DECL_BEGIN
+
+/*!
+ * \brief Allocate \p size bytes of data aligned to \p alignment.
+ *
+ * \details
+ * Because we don't have a standard method to allocate buffers in
+ * memory that are guaranteed to be aligned, VOLK handles this
+ * itself. The volk_malloc function behaves like malloc in that it
+ * returns a pointer to the allocated memory. However, it also takes
+ * an alignment specification, which is usually something like 16 or
+ * 32 to ensure that the allocated memory starts on a particular
+ * byte boundary for use with SIMD.
+ *
+ * Internally, volk_malloc first checks whether the compiler is C11
+ * compliant and, if so, uses aligned_alloc. If not, it checks whether
+ * the system is POSIX compliant and uses posix_memalign. If neither
+ * is available, volk_malloc handles the memory allocation and
+ * alignment internally.
+ *
+ * Because of the ways in which volk_malloc may allocate memory, it is
+ * important to always free volk_malloc pointers using volk_free.
+ *
+ * \param size The number of bytes to allocate.
+ * \param alignment The byte alignment of the allocated memory.
+ * \return pointer to aligned memory.
+ */
+VOLK_API void *volk_malloc(size_t size, size_t alignment);
+
+/*!
+ * \brief Frees memory allocated by volk_malloc.
+ * \param aptr The aligned pointer allocated by volk_malloc.
+ */
+VOLK_API void volk_free(void *aptr);
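+
+/*
+ * Example (a sketch; the 1024-element buffer and 32-byte alignment below are
+ * only illustrations, choose the alignment your widest SIMD type requires):
+ *
+ *   float* buf = (float*)volk_malloc(1024 * sizeof(float), 32);
+ *   if(buf != NULL){
+ *     // ... fill and process buf ...
+ *     volk_free(buf);
+ *   }
+ */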
+
+__VOLK_DECL_END
+
+#endif /* INCLUDED_VOLK_MALLOC_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2015 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*
+ * This file is intended to hold NEON intrinsics of intrinsics.
+ * They should be used in VOLK kernels to avoid copy-pasta.
+ */
+
+#ifndef INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_
+#define INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_
+#include <arm_neon.h>
+
+static inline float32x4_t
+_vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
+{
+ float32x4_t iValue, qValue, result;
+ iValue = vmulq_f32(cmplxValue.val[0], cmplxValue.val[0]); // Square the values
+ qValue = vmulq_f32(cmplxValue.val[1], cmplxValue.val[1]); // Square the values
+
+ result = vaddq_f32(iValue, qValue); // Add the I2 and Q2 values
+ return result;
+}
+
+
+static inline float32x4x2_t
+_vmultiply_complexq_f32(float32x4x2_t a_val, float32x4x2_t b_val)
+{
+ float32x4x2_t tmp_real, tmp_imag, c_val;
+
+ // multiply the real*real and imag*imag to get real result
+ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
+ tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+ // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
+ tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
+
+ // Multiply cross terms to get the imaginary result
+ // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
+ tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
+ // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
+ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+
+ // combine the products
+ c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
+ c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
+ return c_val;
+}
+
+
+static inline float32x4_t
+_vlog2q_f32(float32x4_t aval)
+{
+ /* Calculate log2 of floats by taking exponent +
+ * minimax log2 approx of significand */
+ const int32x4_t one = vdupq_n_s32(0x00800000);
+ /* minimax polynomial */
+ const float32x4_t p0 = vdupq_n_f32(-3.0400402727048585);
+ const float32x4_t p1 = vdupq_n_f32(6.1129631282966113);
+ const float32x4_t p2 = vdupq_n_f32(-5.3419892024633207);
+ const float32x4_t p3 = vdupq_n_f32(3.2865287703753912);
+ const float32x4_t p4 = vdupq_n_f32(-1.2669182593441635);
+ const float32x4_t p5 = vdupq_n_f32(0.2751487703421256);
+ const float32x4_t p6 = vdupq_n_f32(-0.0256910888150985);
+ const int32x4_t exp_mask = vdupq_n_s32(0x7f800000);
+ const int32x4_t sig_mask = vdupq_n_s32(0x007fffff);
+ const int32x4_t exp_bias = vdupq_n_s32(127);
+
+ const int32x4_t aval_i = vreinterpretq_s32_f32(aval); /* reinterpret the float bits as an integer */
+ int32x4_t exponent_i = vandq_s32(aval_i, exp_mask);
+ int32x4_t significand_i = vandq_s32(aval_i, sig_mask);
+ exponent_i = vshrq_n_s32(exponent_i, 23);
+
+ /* extract the exponent and significand
+ we can treat this as fixed point to save ~9% on the
+ conversion + float add */
+ significand_i = vorrq_s32(one, significand_i);
+ float32x4_t significand_f = vcvtq_n_f32_s32(significand_i,23);
+ /* debias the exponent and convert to float */
+ exponent_i = vsubq_s32(exponent_i, exp_bias);
+ float32x4_t exponent_f = vcvtq_f32_s32(exponent_i);
+
+ /* put the significand through a polynomial fit of log2(x) [1,2]
+ add the result to the exponent */
+ float32x4_t log2_approx = vaddq_f32(exponent_f, p0); /* p0 */
+ float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */
+ log2_approx = vaddq_f32(log2_approx, tmp1);
+ float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */
+ tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */
+ log2_approx = vaddq_f32(log2_approx, tmp1);
+
+ float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */
+ tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */
+ log2_approx = vaddq_f32(log2_approx, tmp1);
+ float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */
+ tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */
+ log2_approx = vaddq_f32(log2_approx, tmp1);
+ float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */
+ tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */
+ log2_approx = vaddq_f32(log2_approx, tmp1);
+ float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */
+ tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */
+ log2_approx = vaddq_f32(log2_approx, tmp1);
+
+ return log2_approx;
+}
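+
+/*
+ * For reference, a scalar sketch of the same decomposition (illustrative only,
+ * not used by any kernel): pull the biased exponent and the significand out of
+ * the IEEE-754 bit pattern, then evaluate the polynomial on the significand,
+ * which lies in [1,2).
+ *
+ *   union { float f; int32_t i; } u = { .f = x };
+ *   int   e = ((u.i & 0x7f800000) >> 23) - 127;                       // unbiased exponent
+ *   float m = (float)((u.i & 0x007fffff) | 0x00800000) / 8388608.0f;  // significand in [1,2)
+ *   float log2x = e + p0 + m*(p1 + m*(p2 + m*(p3 + m*(p4 + m*(p5 + m*p6)))));
+ */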
+
+#endif /* INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_ */
--- /dev/null
+#ifndef INCLUDED_VOLK_PREFS_H
+#define INCLUDED_VOLK_PREFS_H
+
+#include <volk/volk_common.h>
+#include <stdlib.h>
+
+__VOLK_DECL_BEGIN
+
+typedef struct volk_arch_pref
+{
+ char name[128]; //name of the kernel
+ char impl_a[128]; //best aligned impl
+ char impl_u[128]; //best unaligned impl
+} volk_arch_pref_t;
+
+////////////////////////////////////////////////////////////////////////
+// get path to volk_config profiling info;
+// returns \0 in the argument on failure.
+////////////////////////////////////////////////////////////////////////
+VOLK_API void volk_get_config_path(char *);
+
+////////////////////////////////////////////////////////////////////////
+// load prefs into global prefs struct
+////////////////////////////////////////////////////////////////////////
+VOLK_API size_t volk_load_preferences(volk_arch_pref_t **);
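+
+////////////////////////////////////////////////////////////////////////
+// usage sketch (illustrative; ownership of the returned array is an
+// implementation detail and is not shown here):
+//
+//   volk_arch_pref_t *prefs = NULL;
+//   size_t n = volk_load_preferences(&prefs);
+//   for(size_t i = 0; i < n; ++i)
+//     printf("%s: aligned=%s unaligned=%s\n",
+//            prefs[i].name, prefs[i].impl_a, prefs[i].impl_u);
+////////////////////////////////////////////////////////////////////////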
+
+__VOLK_DECL_END
+
+#endif //INCLUDED_VOLK_PREFS_H
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2015 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*
+ * This file is intended to hold SSE3 intrinsics of intrinsics.
+ * They should be used in VOLK kernels to avoid copy-pasta.
+ */
+
+#ifndef INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_
+#define INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_
+#include <pmmintrin.h>
+
+static inline __m128
+_mm_complexmul_ps(__m128 x, __m128 y)
+{
+ __m128 yl, yh, tmp1, tmp2;
+ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+ tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+ x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+ tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+ return _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+}
+
+static inline __m128
+_mm_complexconjugatemul_ps(__m128 x, __m128 y)
+{
+ const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
+ y = _mm_xor_ps(y, conjugator); // conjugate y
+ return _mm_complexmul_ps(x, y);
+}
+
+static inline __m128
+_mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2){
+ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+ return _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+}
+
+static inline __m128
+_mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2){
+ return _mm_sqrt_ps(_mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2));
+}
+
+#endif /* INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_ */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2015 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*
+ * This file is intended to hold SSE intrinsics of intrinsics.
+ * They should be used in VOLK kernels to avoid copy-pasta.
+ */
+
+#ifndef INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
+#define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
+#include <xmmintrin.h>
+
+static inline __m128
+_mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2){
+ __m128 iValue, qValue;
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+ // Arrange in q1q2q3q4 format
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+ iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+ qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+ return _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+}
+
+static inline __m128
+_mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2){
+ return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2));
+}
+
+#endif /* INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_ */
--- /dev/null
+########################################################################
+# How to create custom kernel dispatchers
+########################################################################
+A kernel dispatcher is a kernel implementation that calls other kernel implementations.
+By default, a dispatcher is generated by the build system for every kernel such that:
+ * the best aligned implementation is called when all pointer arguments are aligned,
+ * and otherwise the best unaligned implementation is called.
+
+The author of a VOLK kernel may create a custom dispatcher,
+to be called in place of the automatically generated one.
+A custom dispatcher may be useful to handle head and tail cases,
+or to implement different alignment and bounds checking logic.
+
+########################################################################
+# Code for an example dispatcher w/ tail case
+########################################################################
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_DISPATCHER
+
+static inline void volk_32f_x2_add_32f_dispatcher(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
+{
+ const unsigned int num_points_r = num_points%4;
+ const unsigned int num_points_x = num_points - num_points_r;
+
+ if (volk_is_aligned(VOLK_OR_PTR(cVector, VOLK_OR_PTR(aVector, bVector))))
+ {
+ volk_32f_x2_add_32f_a(cVector, aVector, bVector, num_points_x);
+ }
+ else
+ {
+ volk_32f_x2_add_32f_u(cVector, aVector, bVector, num_points_x);
+ }
+
+ volk_32f_x2_add_32f_g(cVector+num_points_x, aVector+num_points_x, bVector+num_points_x, num_points_r);
+}
+
+#endif //LV_HAVE_DISPATCHER
+
+########################################################################
+# Code for an example dispatcher w/ tail case and accumulator
+########################################################################
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_DISPATCHER
+
+static inline void volk_32f_x2_dot_prod_32f_dispatcher(float * result, const float * input, const float * taps, unsigned int num_points)
+{
+ const unsigned int num_points_r = num_points%16;
+ const unsigned int num_points_x = num_points - num_points_r;
+
+ if (volk_is_aligned(VOLK_OR_PTR(input, taps)))
+ {
+ volk_32f_x2_dot_prod_32f_a(result, input, taps, num_points_x);
+ }
+ else
+ {
+ volk_32f_x2_dot_prod_32f_u(result, input, taps, num_points_x);
+ }
+
+ float result_tail = 0;
+ volk_32f_x2_dot_prod_32f_g(&result_tail, input+num_points_x, taps+num_points_x, num_points_r);
+
+ *result += result_tail;
+}
+
+#endif //LV_HAVE_DISPATCHER
--- /dev/null
+@ static inline void volk_16i_max_star_horizontal_16i_neonasm(short* cVector, const short* aVector, unsigned int num_points);
+ .global volk_16i_max_star_horizontal_16i_neonasm
+volk_16i_max_star_horizontal_16i_neonasm:
+ @ r0 - cVector: pointer to output array
+ @ r1 - aVector: pointer to input array 1
+ @ r2 - num_points: number of items to process
+
+ pld [r1:128]
+ push {r4, r5, r6} @ preserve register states
+ lsrs r5, r2, #4 @ 1/16th points = num_points/16
+ vmov.i32 q12, #0 @ q12 = [0,0,0,0]
+ beq .smallvector @ less than 16 elements in vector
+ mov r4, r1 @ r4 = aVector
+ mov r12, r0 @ gcc calls this ip
+ mov r3, #0 @ number = 0
+
+.loop1:
+ vld2.16 {d16-d19}, [r4]! @ aVector, interleaved load
+ pld [r4:128]
+ add r3, r3, #1 @ number += 1
+ cmp r3, r5 @ number < 1/16th points
+ vsub.i16 q10, q8, q9 @ subtraction
+ vcge.s16 q11, q10, #0 @ result > 0?
+ vcgt.s16 q10, q12, q10 @ result < 0?
+ vand.i16 q11, q8, q11 @ multiply by comparisons
+ vand.i16 q10, q9, q10 @ multiply by other comparison
+ vadd.i16 q10, q11, q10 @ add results to get max
+ vst1.16 {d20-d21}, [r12]! @ store the results
+ bne .loop1 @ at least 16 items left
+ add r1, r1, r3, lsl #5
+ add r0, r0, r3, lsl #4
+.smallvector:
+ ands r2, r2, #15
+ beq .end
+ mov r3, #0
+.loop3:
+ ldrh r4, [r1]
+ bic r5, r3, #1
+ ldrh ip, [r1, #2]
+ add r3, r3, #2
+ add r1, r1, #4
+ rsb r6, ip, r4
+ sxth r6, r6
+ cmp r6, #0
+ movgt ip, r4
+ cmp r3, r2
+ strh ip, [r0, r5]
+ bcc .loop3
+.end:
+ pop {r4, r5, r6}
+ bx lr
--- /dev/null
+@ static inline void volk_32f_s32f_multiply_32f_neonasm(float* cVector, const float* aVector, const float scalar, unsigned int num_points);
+ .global volk_32f_s32f_multiply_32f_neonasm
+volk_32f_s32f_multiply_32f_neonasm:
+ @ r0 - cVector: pointer to output array
+ @ r1 - aVector: pointer to input array 1
+ @ r2 - scalar: the scalar multiplier (under a hard-float ABI it may arrive in s0 instead; see the comments below)
+ @ r3 - num_points: number of items to process
+
+ stmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12} @ prologue - save register states
+
+
+ @ quarter_points = num_points / 4
+ movs r11, r3, lsr #2
+ beq .loop2 @ if zero into quarterPoints
+
+ @ number = quarter_points
+ mov r10, r3
+ @ copy address of input vector
+ mov r4, r1
+ @ copy address of output vector
+ mov r5, r0
+
+ @ load the scalar to a quad register
+ @ vmov.32 d2[0], r2
+ @ The scalar might be in s0, not totally sure
+ vdup.32 q2, d0[0]
+
+ @ this is giving fits. Current theory is hf has something to do with it
+ .loop1:
+ @ vld1.32 {q1}, [r4:128]! @ aVal
+ @ vmul.f32 q3, q1, q2
+ @ vst1.32 {q3}, [r5:128]! @ cVal
+ @
+ @ subs r10, r10, #1
+ @ bne .loop1 @ first loop
+
+ @ number = quarter_points * 4
+ mov r10, r11, asl #2
+
+ .loop2:
+ @ cmp num_points, number
+ @ bls .done
+ @
+ @ vld1.32 {d0[0]}, [aVector]!
+ @ vmul.f32 s2, s0, s4
+ @ vst1.32 {d1[0]}, [cVector]!
+ @ add number, number, #1
+ @ b .loop2
+
+.done:
+ ldmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12} @ epilogue - restore register states
+ bx lr
--- /dev/null
+@ static inline void volk_32f_x2_add_32f_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+ .global volk_32f_x2_add_32f_neonasm
+volk_32f_x2_add_32f_neonasm:
+ @ r0 - cVector: pointer to output array
+ @ r1 - aVector: pointer to input array 1
+ @ r2 - bVector: pointer to input array 2
+ @ r3 - num_points: number of items to process
+ cVector .req r0
+ aVector .req r1
+ bVector .req r2
+ num_points .req r3
+ quarterPoints .req r7
+ number .req r8
+ aVal .req q0 @ d0-d1
+ bVal .req q1 @ d2-d3
+ cVal .req q2 @ d4-d5
+
+ @ AAPCS Section 5.1.1
+ @ A subroutine must preserve the contents of the registers r4-r8, r10, r11 and SP
+ stmfd sp!, {r7, r8, sl} @ prologue - save register states
+
+ movs quarterPoints, num_points, lsr #2
+ beq .loop2 @ if zero into quarterPoints
+
+ mov number, #0 @ number, 0
+.loop1:
+ pld [aVector, #128] @ pre-load hint - this is implementation specific!
+ pld [bVector, #128] @ pre-load hint - this is implementation specific!
+
+ vld1.32 {d0-d1}, [aVector:128]! @ aVal
+ add number, number, #1
+ vld1.32 {d2-d3}, [bVector:128]! @ bVal
+ vadd.f32 cVal, bVal, aVal
+ cmp number, quarterPoints
+ vst1.32 {d4-d5}, [cVector:128]! @ cVal
+
+ blt .loop1 @ first loop
+
+ mov number, quarterPoints, asl #2
+
+.loop2:
+ cmp num_points, number
+ bls .done
+
+ vld1.32 {d0[0]}, [aVector]!
+ vld1.32 {d0[1]}, [bVector]!
+ vadd.f32 s2, s1, s0
+ vst1.32 {d1[0]}, [cVector]!
+ add number, number, #1
+ b .loop2
+
+.done:
+ ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states
+ bx lr
--- /dev/null
+@ static inline void volk_32f_x2_add_32f_neonpipeline(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+ .global volk_32f_x2_add_32f_neonpipeline
+volk_32f_x2_add_32f_neonpipeline:
+ @ r0 - cVector: pointer to output array
+ @ r1 - aVector: pointer to input array 1
+ @ r2 - bVector: pointer to input array 2
+ @ r3 - num_points: number of items to process
+ cVector .req r0
+ aVector .req r1
+ bVector .req r2
+ num_points .req r3
+ quarterPoints .req r7
+ number .req r8
+ aVal .req q0 @ d0-d1
+ bVal .req q1 @ d2-d3
+ cVal .req q2 @ d4-d5
+
+ stmfd sp!, {r7, r8, sl} @ prologue - save register states
+
+ pld [aVector, #128] @ pre-load hint - this is implementation specific!
+ pld [bVector, #128] @ pre-load hint - this is implementation specific!
+
+ movs quarterPoints, num_points, lsr #2
+ beq .loop2 @ if zero into quarterPoints
+
+ mov number, quarterPoints
+
+ @ Optimizing for pipeline
+ vld1.32 {d0-d1}, [aVector:128]! @ aVal
+ vld1.32 {d2-d3}, [bVector:128]! @ bVal
+ subs number, number, #1
+ beq .flushpipe
+
+.loop1:
+ pld [aVector, #128] @ pre-load hint - this is implementation specific!
+ pld [bVector, #128] @ pre-load hint - this is implementation specific!
+ vadd.f32 cVal, bVal, aVal
+ vld1.32 {d0-d1}, [aVector:128]! @ aVal
+ vld1.32 {d2-d3}, [bVector:128]! @ bVal
+ vst1.32 {d4-d5}, [cVector:128]! @ cVal
+
+ subs number, number, #1
+ bne .loop1 @ first loop
+
+.flushpipe:
+ @ One more time
+ vadd.f32 cVal, bVal, aVal
+ vst1.32 {d4-d5}, [cVector:128]! @ cVal
+
+ mov number, quarterPoints, asl #2
+
+.loop2:
+ cmp num_points, number
+ bls .done
+
+ vld1.32 {d0[0]}, [aVector]!
+ vld1.32 {d0[1]}, [bVector]!
+ vadd.f32 s2, s1, s0
+ vst1.32 {d1[0]}, [cVector]!
+ add number, number, #1
+ b .loop2
+
+.done:
+ ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states
+ bx lr
--- /dev/null
+@ static inline void volk_32f_x2_dot_prod_32f_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+ .global volk_32f_x2_dot_prod_32f_neonasm
+volk_32f_x2_dot_prod_32f_neonasm:
+ @ r0 - cVector: pointer to output array
+ @ r1 - aVector: pointer to input array 1
+ @ r2 - bVector: pointer to input array 2
+ @ r3 - num_points: number of items to process
+ cVector .req r0
+ aVector .req r1
+ bVector .req r2
+ num_points .req r3
+ quarterPoints .req r7
+ number .req r8
+ aVal .req q0 @ d0-d1
+ bVal .req q1 @ d2-d3
+ cVal .req q2 @ d4-d5
+
+ @ AAPCS Section 5.1.1
+ @ A subroutine must preserve the contents of the registers r4-r8, r10, r11 and SP
+ stmfd sp!, {r7, r8, sl} @ prologue - save register states
+
+ veor.32 q0, q0, q0
+ movs quarterPoints, num_points, lsr #2
+ beq .loop2 @ if zero into quarterPoints
+
+ mov number, #0 @ number, 0
+.loop1:
+ pld [aVector, #128] @ pre-load hint - this is implementation specific!
+ pld [bVector, #128] @ pre-load hint - this is implementation specific!
+
+ vld1.32 {q1}, [aVector:128]! @ aVal
+ vld1.32 {q2}, [bVector:128]! @ bVal
+ vmla.f32 q0, q1, q2
+
+ add number, number, #1
+ cmp number, quarterPoints
+ blt .loop1 @ first loop
+
+ @ strange order comes from trying to schedule instructions
+ vadd.f32 s0, s0, s1
+ vadd.f32 s2, s2, s3
+ mov number, quarterPoints, asl #2
+ vadd.f32 s0, s0, s2
+
+.loop2:
+ cmp num_points, number
+ bls .done
+
+ vld1.32 {d1[0]}, [aVector]!
+ vld1.32 {d1[1]}, [bVector]!
+ vmla.f32 s0, s2, s3
+ add number, number, #1
+ b .loop2
+
+.done:
+ vstr s0, [cVector]
+ ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states
+ bx lr
--- /dev/null
+@ static inline void volk_32f_x2_dot_prod_32f_neonasm_opts(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+ @ r0 = cVector
+ @ r1 = aVector
+ @ r2 = bVector
+ @ r3 = num_points
+ .global volk_32f_x2_dot_prod_32f_neonasm_opts
+volk_32f_x2_dot_prod_32f_neonasm_opts:
+ push {r4, r5, r6, r7, r8, r9, r10, r11}
+ @ sixteenth_points = num_points / 16
+ lsrs r8, r3, #4
+ sub r13, r13, #16 @ subtracting 16 from stack pointer?, wat?
+ @ 0 out neon accumulators
+ veor q0, q3, q3
+ veor q1, q3, q3
+ veor q2, q3, q3
+ veor q3, q3, q3
+ beq .smallvector @ if less than 16 points skip main loop
+ mov r7, r2 @ copy input ptrs
+ mov r6, r1 @ copy input ptrs
+ mov r5, #0 @ loop counter
+.mainloop:
+ vld4.32 {d16,d18,d20,d22}, [r6]!
+ add r5, r5, #1 @ inc loop counter
+ cmp r5, r8 @ loop counter < sixteenth_points?
+ vld4.32 {d24,d26,d28,d30}, [r7]!
+ vld4.32 {d17,d19,d21,d23}, [r6]!
+ vld4.32 {d25,d27,d29,d31}, [r7]!
+ vmla.f32 q3, q8, q12
+ vmla.f32 q0, q13, q9
+ vmla.f32 q1, q14, q10
+ vmla.f32 q2, q15, q11
+ bne .mainloop
+ lsl r12, r8, #6 @ r12 = sixteenth_points * 64 (bytes consumed from each input)
+ add r1, r1, r12
+ add r2, r2, r12
+.smallvector: @ actually this can be skipped for small vectors
+ vadd.f32 q3, q3, q0
+ lsl r8, r8, #4 @ sixteenth_points * 16
+ cmp r3, r8 @ num_points < sixteenth_points*16?
+ vadd.f32 q2, q1, q2
+ vadd.f32 q3, q2, q3 @ sum of 4 accumulators in to q3
+ vadd.f32 s15, s12, s15 @ q3 is s12-s15, so reduce to a single float
+ vadd.f32 s15, s15, s13
+ vadd.f32 s15, s15, s14
+ bls .done @ if vector is multiple of 16 then finish
+ sbfx r11, r1, #2, #1 @ check alignment
+ rsb r9, r8, r3
+ and r11, r11, #3
+ mov r6, r1
+ cmp r11, r9
+ movcs r11, r9
+ cmp r9, #3
+ movls r11, r9
+ cmp r11, #0
+ beq .nothingtodo
+ mov r5, r2
+ mov r12, r8
+.dlabel5:
+ add r12, r12, #1
+ vldmia r6!, {s14}
+ rsb r4, r8, r12
+ vldmia r5!, {s13}
+ cmp r4, r11
+ vmla.f32 s15, s13, s14
+ mov r7, r6
+ mov r4, r5
+ bcc .dlabel5
+ cmp r9, r11
+ beq .done
+.dlabel8:
+ rsb r9, r11, r9
+ lsr r8, r9, #2
+ lsls r10, r8, #2
+ beq .dlabel6
+ lsl r6, r11, #2
+ veor q8, q8, q8
+ add r1, r1, r6
+ add r6, r2, r6
+ mov r5, #0
+.dlabel9:
+ add r5, r5, #1
+ vld1.32 {d20-d21}, [r6]!
+ cmp r5, r8
+ vld1.64 {d18-d19}, [r1 :64]!
+ vmla.f32 q8, q10, q9
+ bcc .dlabel9
+ vadd.f32 d16, d16, d17
+ lsl r2, r10, #2
+ veor q9, q9, q9
+ add r7, r7, r2
+ vpadd.f32 d6, d16, d16
+ add r4, r4, r2
+ cmp r9, r10
+ add r12, r12, r10
+ vadd.f32 s15, s15, s12
+ beq .done
+.dlabel6:
+ mov r2, r7
+.dlabel7:
+ add r12, r12, #1
+ vldmia r2!, {s13}
+ cmp r3, r12
+ vldmia r4!, {s14}
+ vmla.f32 s15, s13, s14
+ bhi .dlabel7
+.done:
+ vstr s15, [r0]
+ add r13, r13, #16
+ pop {r4, r5, r6, r7, r8, r9, r10, r11}
+ bx lr @ lr is the return address
+.nothingtodo:
+ mov r12, r8
+ mov r4, r2
+ mov r7, r1
+ b .dlabel8
+
--- /dev/null
+@ static inline void volk_32fc_32f_dot_prod_32fc_a_neonasm ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
+ .global volk_32fc_32f_dot_prod_32fc_a_neonasm
+ volk_32fc_32f_dot_prod_32fc_a_neonasm:
+ @ r0 - result: pointer to output array (32fc)
+ @ r1 - input: pointer to input array 1 (32fc)
+ @ r2 - taps: pointer to input array 2 (32f)
+ @ r3 - num_points: number of items to process
+
+ result .req r0
+ input .req r1
+ taps .req r2
+ num_points .req r3
+ quarterPoints .req r7
+ number .req r8
+ @ Note that according to the ARM EABI (AAPCS) Section 5.1.1:
+ @ registers s16-s31 (d8-d15, q4-q7) must be preserved across subroutine calls;
+ @ registers s0-s15 (d0-d7, q0-q3) do not need to be preserved
+ @ registers d16-d31 (q8-q15), if present, do not need to be preserved.
+ realAccQ .req q0 @ d0-d1/s0-s3
+ compAccQ .req q1 @ d2-d3/s4-s7
+ realAccS .req s0 @ d0[0]
+ compAccS .req s4 @ d2[0]
+ tapsVal .req q2 @ d4-d5
+ outVal .req q3 @ d6-d7
+ realMul .req q8 @ d16-d17
+ compMul .req q9 @ d18-d19
+ inRealVal .req q10 @ d20-d21
+ inCompVal .req q11 @ d22-d23
+
+ stmfd sp!, {r7, r8, sl} @ prologue - save register states
+
+ veor realAccQ, realAccQ @ zero out accumulators
+ veor compAccQ, compAccQ @ zero out accumulators
+ movs quarterPoints, num_points, lsr #2
+ beq .loop2 @ if zero into quarterPoints
+
+ mov number, quarterPoints
+
+.loop1:
+ @ do work here
+ @pld [taps, #128] @ pre-load hint - this is implementation specific!
+ @pld [input, #128] @ pre-load hint - this is implementation specific!
+ vld1.32 {d4-d5}, [taps:128]! @ tapsVal
+ vld2.32 {d20-d23}, [input:128]! @ inRealVal, inCompVal
+ vmul.f32 realMul, tapsVal, inRealVal
+ vmul.f32 compMul, tapsVal, inCompVal
+ vadd.f32 realAccQ, realAccQ, realMul
+ vadd.f32 compAccQ, compAccQ, compMul
+ subs number, number, #1
+ bne .loop1 @ first loop
+
+ @ Sum up across realAccQ and compAccQ
+ vpadd.f32 d0, d0, d1 @ realAccQ +-> d0
+ vpadd.f32 d2, d2, d3 @ compAccQ +-> d2
+ vadd.f32 realAccS, s0, s1 @ sum the contents of d0 together (realAccQ)
+ vadd.f32 compAccS, s4, s5 @ sum the contents of d2 together (compAccQ)
+ @ critical values are now in s0 (realAccS), s4 (compAccS)
+ mov number, quarterPoints, asl #2
+
+.loop2:
+ cmp num_points, number
+ bls .done
+
+ vld1.32 {d4[0]}, [taps]! @ s8
+ vld2.32 {d5[0],d6[0]}, [input]! @ s10, s12
+ vmul.f32 s5, s8, s10
+ vmul.f32 s6, s8, s12
+ vadd.f32 realAccS, realAccS, s5
+ vadd.f32 compAccS, compAccS, s6
+
+ add number, number, #1
+ b .loop2
+
+.done:
+ vst1.32 {d0[0]}, [result]! @ realAccS
+ vst1.32 {d2[0]}, [result] @ compAccS
+
+ ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states
+ bx lr
--- /dev/null
+@ static inline void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points)
+ .global volk_32fc_32f_dot_prod_32fc_a_neonasmvmla
+volk_32fc_32f_dot_prod_32fc_a_neonasmvmla:
+ @ r0 - result: pointer to output array (32fc)
+ @ r1 - input: pointer to input array 1 (32fc)
+ @ r2 - taps: pointer to input array 2 (32f)
+ @ r3 - num_points: number of items to process
+
+ result .req r0
+ input .req r1
+ taps .req r2
+ num_points .req r3
+ quarterPoints .req r7
+ number .req r8
+ @ Note that according to the ARM EABI (AAPCS) Section 5.1.1:
+ @ registers s16-s31 (d8-d15, q4-q7) must be preserved across subroutine calls;
+ @ registers s0-s15 (d0-d7, q0-q3) do not need to be preserved
+ @ registers d16-d31 (q8-q15), if present, do not need to be preserved.
+ realAccQ .req q0 @ d0-d1/s0-s3
+ compAccQ .req q1 @ d2-d3/s4-s7
+ realAccS .req s0 @ d0[0]
+ compAccS .req s4 @ d2[0]
+ tapsVal .req q2 @ d4-d5
+ outVal .req q3 @ d6-d7
+ realMul .req q8 @ d16-d17
+ compMul .req q9 @ d18-d19
+ inRealVal .req q10 @ d20-d21
+ inCompVal .req q11 @ d22-d23
+
+ stmfd sp!, {r7, r8, sl} @ prologue - save register states
+
+ veor realAccQ, realAccQ @ zero out accumulators
+ veor compAccQ, compAccQ @ zero out accumulators
+ movs quarterPoints, num_points, lsr #2
+ beq .loop2 @ if zero into quarterPoints
+
+ mov number, quarterPoints
+
+.loop1:
+ @ do work here
+ pld [taps, #128] @ pre-load hint - this is implementation specific!
+ pld [input, #128] @ pre-load hint - this is implementation specific!
+ vld1.32 {tapsVal}, [taps:128]! @ tapsVal
+ vld2.32 {inRealVal-inCompVal}, [input:128]! @ inRealVal, inCompVal
+ vmla.f32 realAccQ, tapsVal, inRealVal
+ vmla.f32 compAccQ, tapsVal, inCompVal
+ subs number, number, #1
+ bne .loop1 @ first loop
+
+ @ Sum up across realAccQ and compAccQ
+ vadd.f32 d0, d0, d1 @ realAccQ +-> d0
+ vadd.f32 d2, d2, d3 @ compAccQ +-> d2
+ vadd.f32 realAccS, s0, s1 @ sum the contents of d0 together (realAccQ)
+ vadd.f32 compAccS, s4, s5 @ sum the contents of d2 together (compAccQ)
+ @ critical values are now in s0 (realAccS), s4 (compAccS)
+ mov number, quarterPoints, asl #2
+.loop2:
+ cmp num_points, number
+ bls .done
+
+ vld1.32 {d4[0]}, [taps]! @ s8
+ vld2.32 {d5[0],d6[0]}, [input]! @ s10, s12
+ vmla.f32 realAccS, s8, s10 @ d0[0]
+ vmla.f32 compAccS, s8, s12 @ d2[0]
+
+ add number, number, #1
+ b .loop2
+
+.done:
+ vst1.32 {d0[0]}, [result]! @ realAccS
+ vst1.32 {d2[0]}, [result] @ compAccS
+
+ ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states
+ bx lr
--- /dev/null
+@ static inline void volk_32fc_32f_dot_prod_32fc_a_neonpipeline ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
+ .global volk_32fc_32f_dot_prod_32fc_a_neonpipeline
+volk_32fc_32f_dot_prod_32fc_a_neonpipeline:
+ @ r0 - result: pointer to output array (32fc)
+ @ r1 - input: pointer to input array 1 (32fc)
+ @ r2 - taps: pointer to input array 2 (32f)
+ @ r3 - num_points: number of items to process
+
+ result .req r0
+ input .req r1
+ taps .req r2
+ num_points .req r3
+ quarterPoints .req r7
+ number .req r8
+ @ Note that according to the ARM EABI (AAPCS) Section 5.1.1:
+ @ registers s16-s31 (d8-d15, q4-q7) must be preserved across subroutine calls;
+ @ registers s0-s15 (d0-d7, q0-q3) do not need to be preserved
+ @ registers d16-d31 (q8-q15), if present, do not need to be preserved.
+ realAccQ .req q0 @ d0-d1/s0-s3
+ compAccQ .req q1 @ d2-d3/s4-s7
+ realAccS .req s0 @ d0[0]
+ compAccS .req s4 @ d2[0]
+ tapsVal .req q2 @ d4-d5
+ outVal .req q3 @ d6-d7
+ realMul .req q8 @ d16-d17
+ compMul .req q9 @ d18-d19
+ inRealVal .req q10 @ d20-d21
+ inCompVal .req q11 @ d22-d23
+
+ stmfd sp!, {r7, r8, sl} @ prologue - save register states
+
+ pld [taps, #128] @ pre-load hint - this is implementation specific!
+ pld [input, #128] @ pre-load hint - this is implementation specific!
+
+ veor realAccQ, realAccQ @ zero out accumulators
+ veor compAccQ, compAccQ @ zero out accumulators
+ movs quarterPoints, num_points, lsr #2
+ beq .loop2 @ if zero into quarterPoints
+
+ @mov number, quarterPoints
+ mov number, #0
+ @ Optimizing for pipeline
+ vld1.32 {tapsVal}, [taps:128]! @ tapsVal
+ vld2.32 {inRealVal-inCompVal}, [input:128]! @ inRealVal, inCompVal
+ add number, number, #1
+
+.loop1:
+ @ do work here
+ pld [taps, #128] @ pre-load hint - this is implementation specific!
+ pld [input, #128] @ pre-load hint - this is implementation specific!
+ vmul.f32 realMul, tapsVal, inRealVal
+ vmul.f32 compMul, tapsVal, inCompVal
+ vadd.f32 realAccQ, realAccQ, realMul
+ vadd.f32 compAccQ, compAccQ, compMul
+ vld1.32 {tapsVal}, [taps:128]! @ tapsVal
+ vld2.32 {inRealVal-inCompVal}, [input:128]! @ inRealVal, inCompVal
+
+ @subs number, number, #1
+ @bls .loop1 @ first loop
+ add number, number, #1
+ cmp number, quarterPoints
+ blt .loop1
+
+ vmul.f32 realMul, tapsVal, inRealVal
+ vmul.f32 compMul, tapsVal, inCompVal
+ vadd.f32 realAccQ, realAccQ, realMul
+ vadd.f32 compAccQ, compAccQ, compMul
+
+ @ Sum up across realAccQ and compAccQ
+ vadd.f32 d0, d0, d1 @ realAccQ +-> d0
+ vadd.f32 d2, d2, d3 @ compAccQ +-> d2
+ vadd.f32 realAccS, s0, s1 @ sum the contents of d0 together (realAccQ)
+ vadd.f32 compAccS, s4, s5 @ sum the contents of d2 together (compAccQ)
+
+ @ critical values are now in s0 (realAccS), s4 (compAccS)
+ mov number, quarterPoints, asl #2
+ cmp num_points, number
+ beq .done
+
+.loop2:
+ vld1.32 {d4[0]}, [taps]! @ s8
+ vld2.32 {d5[0],d6[0]}, [input]! @ s10, s12
+ vmul.f32 s5, s8, s10
+ vmul.f32 s6, s8, s12
+ vadd.f32 realAccS, realAccS, s5
+ vadd.f32 compAccS, compAccS, s6
+
+ add number, number, #1
+ cmp number, num_points
+ blt .loop2
+
+.done:
+ vst1.32 {d0[0]}, [result]! @ realAccS
+ vst1.32 {d2[0]}, [result] @ compAccS
+
+ ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states
+ bx lr
--- /dev/null
+@ static inline void volk_32fc_32f_dot_prod_32fc_unrollasm ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points)
+.global volk_32fc_32f_dot_prod_32fc_unrollasm
+volk_32fc_32f_dot_prod_32fc_unrollasm:
+ @ r0 - result: pointer to output array (32fc)
+ @ r1 - input: pointer to input array 1 (32fc)
+ @ r2 - taps: pointer to input array 2 (32f)
+ @ r3 - num_points: number of items to process
+
+ push {r4, r5, r6, r7, r8, r9}
+ vpush {q4-q7}
+ sub r13, r13, #56 @ 0x38
+ add r12, r13, #8
+ lsrs r8, r3, #3
+ veor.32 q2, q5, q5
+ veor.32 q3, q5, q5
+ veor.32 q4, q5, q5
+ veor.32 q5, q5, q5
+ beq .smallvector
+ vld2.32 {d20-d23}, [r1]!
+ vld1.32 {d24-d25}, [r2]!
+ mov r5, #1
+
+
+
+.mainloop:
+ vld2.32 {d14-d17}, [r1]! @ q7,q8
+ vld1.32 {d18-d19}, [r2]! @ q9
+
+ vmul.f32 q0, q12, q10 @ real mult
+ vmul.f32 q1, q12, q11 @ imag mult
+
+ add r5, r5, #1
+ cmp r5, r8
+
+ vadd.f32 q4, q4, q0@ q4 accumulates real
+ vadd.f32 q5, q5, q1@ q5 accumulates imag
+
+ vld2.32 {d20-d23}, [r1]! @ q10-q11
+ vld1.32 {d24-d25}, [r2]! @ q12
+
+ vmul.f32 q13, q9, q7
+ vmul.f32 q14, q9, q8
+ vadd.f32 q2, q2, q13 @ q2 accumulates real
+ vadd.f32 q3, q3, q14 @ q3 accumulates imag
+
+
+
+ bne .mainloop
+
+ vmul.f32 q0, q12, q10 @ real mult
+ vmul.f32 q1, q12, q11 @ imag mult
+
+ vadd.f32 q4, q4, q0@ q4 accumulates real
+ vadd.f32 q5, q5, q1@ q5 accumulates imag
+
+
+.smallvector:
+ vadd.f32 q0, q2, q4
+ add r12, r13, #24
+ lsl r8, r8, #3
+ vadd.f32 q1, q3, q5
+ cmp r3, r8
+
+ vadd.f32 d0, d0, d1
+ vadd.f32 d1, d2, d3
+ vadd.f32 s14, s0, s1
+ vadd.f32 s15, s2, s3
+
+ vstr s14, [r13]
+ vstr s15, [r13, #4]
+ bls .D1
+ rsb r12, r8, r3
+ lsr r4, r12, #2
+ cmp r4, #0
+ cmpne r12, #3
+ lsl r5, r4, #2
+ movhi r6, #0
+ movls r6, #1
+ bls .L1
+ vmov.i32 q10, #0 @ 0x00000000
+ mov r9, r1
+ mov r7, r2
+ vorr q11, q10, q10
+
+.smallloop:
+ add r6, r6, #1
+ vld2.32 {d16-d19}, [r9]!
+ cmp r4, r6
+ vld1.32 {d24-d25}, [r7]!
+ vmla.f32 q11, q12, q8
+ vmla.f32 q10, q12, q9
+ bhi .smallloop
+ vmov.i32 q9, #0 @ 0x00000000
+ cmp r12, r5
+ vadd.f32 d20, d20, d21
+ add r8, r8, r5
+ vorr q8, q9, q9
+ add r1, r1, r5, lsl #3
+ vadd.f32 d22, d22, d23
+ add r2, r2, r5, lsl #2
+ vpadd.f32 d18, d20, d20
+ vpadd.f32 d16, d22, d22
+ vmov.32 r4, d18[0]
+ vmov.32 r12, d16[0]
+ vmov s13, r4
+ vadd.f32 s15, s13, s15
+ vmov s13, r12
+ vadd.f32 s14, s13, s14
+ beq .finishreduction
+ .L1:
+ add r12, r8, #1
+ vldr s13, [r2]
+ cmp r3, r12
+ vldr s11, [r1]
+ vldr s12, [r1, #4]
+ vmla.f32 s14, s13, s11
+ vmla.f32 s15, s13, s12
+ bls .finishreduction
+ add r8, r8, #2
+ vldr s13, [r2, #4]
+ cmp r3, r8
+ vldr s11, [r1, #8]
+ vldr s12, [r1, #12]
+ vmla.f32 s14, s13, s11
+ vmla.f32 s15, s13, s12
+ bls .finishreduction
+ vldr s13, [r2, #8]
+ vldr s11, [r1, #16]
+ vldr s12, [r1, #20]
+ vmla.f32 s14, s13, s11
+ vmla.f32 s15, s13, s12
+
+.finishreduction:
+ vstr s14, [r13]
+ vstr s15, [r13, #4]
+ .D1:
+ ldr r3, [r13]
+ str r3, [r0]
+ ldr r3, [r13, #4]
+ str r3, [r0, #4]
+ add r13, r13, #56 @ 0x38
+ vpop {q4-q7}
+ pop {r4, r5, r6, r7, r8, r9}
+ bx r14
+
+
--- /dev/null
+@ static inline void volk_32fc_x2_dot_prod_32fc_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+ .global volk_32fc_x2_dot_prod_32fc_neonasm
+volk_32fc_x2_dot_prod_32fc_neonasm:
+ push {r4, r5, r6, r7, r8, lr}
+ vpush {q0-q7}
+ vpush {q8-q15}
+ mov r8, r3 @ hold on to num_points (r8)
+ @ zero out accumulators -- leave 1 reg in alu
+ veor q8, q15, q15
+ mov r7, r0 @ (r7) is cVec
+ veor q9, q15, q15
+ mov r5, r1 @ (r5) is aVec
+ veor q10, q15, q15
+ mov r6, r2 @ (r6) is bVec
+ veor q11, q15, q15
+ lsrs r3, r3, #3 @ eighth_points (r3) = num_points/8
+ veor q12, q15, q15
+ mov r12, r2 @ (r12) is bVec
+ veor q13, q15, q15
+ mov r4, r1 @ (r4) is aVec
+ veor q14, q15, q15
+ veor q15, q15, q15
+ beq .smallvector @ nathan optimized this file based on an objdump
+ @ but I don't understand this jump. Seems like it should go to loop2
+ @ and smallvector (really vector reduction) shouldn't need to be a label
+ mov r2, #0 @ 0 out r2 (now number)
+.loop1:
+ add r2, r2, #1 @ increment number
+ vld4.32 {d0,d2,d4,d6}, [r12]! @ q0-q3
+ cmp r2, r3 @ is number < eighth_points
+ @pld [r12, #64]
+ vld4.32 {d8,d10,d12,d14}, [r4]! @ q4-q7
+ @pld [r4, #64]
+ vmla.f32 q12, q4, q0 @ real (re*re)
+ vmla.f32 q14, q4, q1 @ imag (re*im)
+ vmls.f32 q15, q5, q1 @ real (im*im)
+ vmla.f32 q13, q5, q0 @ imag (im*re)
+
+ vmla.f32 q8, q2, q6 @ real (re*re)
+ vmla.f32 q9, q2, q7 @ imag (re*im)
+ vmls.f32 q10, q3, q7 @ real (im*im)
+ vmla.f32 q11, q3, q6 @ imag (im*re)
+ bne .loop1
+ lsl r2, r3, #3 @ r2 = eighth_points * 8
+ add r6, r6, r2 @ bVec = bVec + eighth_points -- whyyyyy gcc?!?
+ add r5, r5, r2 @ aVec = aVec + eighth_points
+ @ q12-q13 were original real accumulators
+ @ q14-q15 were original imag accumulators
+ @ reduce 8 accumulators down to 2 (1 real, 1 imag)
+ vadd.f32 q8, q10, q8 @ real + real
+ vadd.f32 q11, q11, q9 @ imag + imag
+ vadd.f32 q12, q12, q15 @ real + real
+ vadd.f32 q14, q14, q13 @ imag + imag
+ vadd.f32 q8, q8, q12
+ vadd.f32 q9, q9, q14
+.smallvector:
+ lsl r4, r3, #3
+ cmp r8, r4
+ vst2.32 {d16-d19}, [sp :64] @ whaaaaat? no way this is necessary!
+ vldr s15, [sp, #8]
+ vldr s17, [sp]
+ vldr s16, [sp, #4]
+ vadd.f32 s17, s17, s15
+ vldr s11, [sp, #12]
+ vldr s12, [sp, #24]
+ vldr s13, [sp, #28]
+ vldr s14, [sp, #16]
+ vldr s15, [sp, #20]
+ vadd.f32 s16, s16, s11
+ vadd.f32 s17, s17, s12
+ vadd.f32 s16, s16, s13
+ vadd.f32 s17, s17, s14
+ vadd.f32 s16, s16, s15
+ vstr s17, [r7]
+ vstr s16, [r7, #4]
+ bls .done
+.loop2:
+ mov r3, r6
+ add r6, r6, #8
+ vldr s0, [r3]
+ vldr s1, [r6, #-4]
+ mov r3, r5
+ add r5, r5, #8
+ vldr s2, [r3]
+ vldr s3, [r5, #-4]
+ bl __mulsc3 @ GCC/Clang built-in. Portability?
+ add r4, r4, #1
+ cmp r4, r8
+ vadd.f32 s17, s17, s0
+ vadd.f32 s16, s16, s1
+ vstr s17, [r7]
+ vstr s16, [r7, #4]
+ bne .loop2
+.done:
+ vpop {q8-q15}
+ vpop {q0-q7}
+ pop {r4, r5, r6, r7, r8, pc}
+
--- /dev/null
+@ static inline void volk_32fc_x2_dot_prod_32fc_neonasm_opttests(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)@
+.global volk_32fc_x2_dot_prod_32fc_neonasm_opttests
+volk_32fc_x2_dot_prod_32fc_neonasm_opttests:
+ push {r4, r5, r6, r7, r8, r9, sl, fp, lr}
+ vpush {d8-d15}
+ lsrs fp, r3, #3
+ sub sp, sp, #52 @ 0x34
+ mov r9, r3
+ mov sl, r0
+ mov r7, r1
+ mov r8, r2
+ vorr q0, q7, q7
+ vorr q1, q7, q7
+ vorr q2, q7, q7
+ vorr q3, q7, q7
+ vorr q4, q7, q7
+ vorr q5, q7, q7
+ veor q6, q7, q7
+ vorr q7, q7, q7
+ beq .smallvector
+ mov r4, r1
+ mov ip, r2
+ mov r3, #0
+.mainloop:
+ @mov r6, ip
+ @mov r5, r4
+ vld4.32 {d24,d26,d28,d30}, [r6]!
+ @add ip, ip, #64 @ 0x40
+ @add r4, r4, #64 @ 0x40
+ vld4.32 {d16,d18,d20,d22}, [r5]!
+ add r3, r3, #1
+ vld4.32 {d25,d27,d29,d31}, [r6]!
+ vld4.32 {d17,d19,d21,d23}, [r5]!
+ vmla.f32 q6, q8, q12
+ vmla.f32 q0, q9, q12
+ cmp r3, fp
+ vmls.f32 q5, q13, q9
+ vmla.f32 q2, q13, q8
+ vmla.f32 q7, q10, q14
+ vmla.f32 q1, q11, q14
+ vmls.f32 q4, q15, q11
+ vmla.f32 q3, q15, q10
+ bne .mainloop
+ lsl r3, fp, #6
+ add r8, r8, r3
+ add r7, r7, r3
+.smallvector:
+ vadd.f32 q3, q2, q3
+ add r3, sp, #16
+ lsl r4, fp, #3
+ vadd.f32 q4, q5, q4
+ cmp r9, r4
+ vadd.f32 q6, q6, q7
+ vadd.f32 q1, q0, q1
+ vadd.f32 q8, q6, q4
+ vadd.f32 q9, q1, q3
+ vst2.32 {d16-d19}, [r3 :64]
+ vldr s15, [sp, #24]
+ vldr s16, [sp, #16]
+ vldr s17, [sp, #20]
+ vadd.f32 s16, s16, s15
+ vldr s11, [sp, #28]
+ vldr s12, [sp, #40] @ 0x28
+ vldr s13, [sp, #44] @ 0x2c
+ vldr s14, [sp, #32]
+ vldr s15, [sp, #36] @ 0x24
+ vadd.f32 s17, s17, s11
+ vadd.f32 s16, s16, s12
+ vadd.f32 s17, s17, s13
+ vadd.f32 s16, s16, s14
+ vadd.f32 s17, s17, s15
+ vstr s16, [sl]
+ vstr s17, [sl, #4]
+ bls .epilog
+ add r5, sp, #8
+.tailcase:
+ ldr r3, [r7], #8
+ mov r0, r5
+ ldr r1, [r8], #8
+ add r4, r4, #1
+ ldr ip, [r7, #-4]
+ ldr r2, [r8, #-4]
+ str ip, [sp]
+ bl __mulsc3
+ vldr s14, [sp, #8]
+ vldr s15, [sp, #12]
+ vadd.f32 s16, s16, s14
+ cmp r4, r9
+ vadd.f32 s17, s17, s15
+ vstr s16, [sl]
+ vstr s17, [sl, #4]
+ bne .tailcase
+.epilog:
+ add sp, sp, #52 @ 0x34
+ vpop {d8-d15}
+ pop {r4, r5, r6, r7, r8, r9, sl, fp, pc}
--- /dev/null
+@ static inline void volk_32fc_x2_multiply_32fc_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+ .global volk_32fc_x2_multiply_32fc_neonasm
+volk_32fc_x2_multiply_32fc_neonasm:
+ push {r4, r5, r6, r7, r8, r9, r14}
+ lsrs r7, r3, #2
+ @ r0 is c vector
+ @ r1 is a vector
+ @ r2 is b vector
+ @ r3 is num_points
+ @ r7 is quarter_points
+ beq .smallvector
+ mov r5, #0
+.mainloop:
+ vld2.32 {d24-d27}, [r1]! @ ar=q12, ai=q13
+ add r5, r5, #1
+ cmp r5, r7
+ vld2.32 {d20-d23}, [r2]! @ br=q10, bi=q11
+ pld [r1]
+ pld [r2]
+ vmul.f32 q0, q12, q10 @ q0 = ar*br
+ vmul.f32 q1, q13, q11 @ q1 = ai*bi
+ vmul.f32 q2, q12, q11 @ q2 = ar*bi
+ vmul.f32 q3, q13, q10 @ q3 = ai*br
+ vsub.f32 q9, q0, q1 @ real
+ vadd.f32 q10, q2, q3 @ imag
+ vst2.32 {q9-q10}, [r0]!
+ bne .mainloop
+
+.smallvector:
+ lsl r5, r7, #2 @ r5 = quarter_points * 4
+ cmp r3, r5 @ num_points == quarter_points?
+ bls .done
+.tailcase:
+ add r5, r5, #1 @ r5 +=1 <- number++
+ vld1.32 d1, [r1]! @ s2, s3 = ar, ai
+ vld1.32 d0, [r2]! @ s0, s1 = br, bi
+ vmul.f32 s4, s0, s2 @ s4 = ar*br
+ vmul.f32 s5, s0, s3 @ s5 = ar*bi
+ vmls.f32 s4, s1, s3 @ s4 = s4 - ai*bi
+ vmla.f32 s5, s1, s2 @ s5 = s5 + ai*br
+ @vst2.32 d2[0], [r0]!
+ vst1.32 {d2}, [r0]!
+ cmp r3, r5 @ r3 == r5? num_points == number?
+ bne .tailcase
+.done:
+ pop {r4, r5, r6, r7, r8, r9, r15}
+ bx lr
--- /dev/null
+.function volk_16ic_deinterleave_16i_x2_a_orc_impl
+.dest 2 idst
+.dest 2 qdst
+.source 4 src
+splitlw qdst, idst, src
--- /dev/null
+.function volk_16ic_deinterleave_real_8i_a_orc_impl
+.dest 1 dst
+.source 4 src
+.temp 2 iw
+select0lw iw, src
+convhwb dst, iw
--- /dev/null
+.function volk_16ic_magnitude_16i_a_orc_impl
+.source 4 src
+.dest 2 dst
+.floatparam 4 scalar
+.temp 8 iql
+.temp 8 iqf
+.temp 8 prodiqf
+.temp 4 qf
+.temp 4 if
+.temp 4 sumf
+.temp 4 rootf
+.temp 4 rootl
+
+x2 convswl iql, src
+x2 convlf iqf, iql
+x2 divf iqf, iqf, scalar
+x2 mulf prodiqf, iqf, iqf
+splitql qf, if, prodiqf
+addf sumf, if, qf
+sqrtf rootf, sumf
+mulf rootf, rootf, scalar
+convfl rootl, rootf
+convlw dst, rootl
--- /dev/null
+.function volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl
+.dest 4 idst
+.dest 4 qdst
+.source 4 src
+.floatparam 4 scalar
+.temp 8 iql
+.temp 8 iqf
+
+x2 convswl iql, src
+x2 convlf iqf, iql
+x2 divf iqf, iqf, scalar
+splitql qdst, idst, iqf
--- /dev/null
+.function volk_16ic_magnitude_32f_a_orc_impl
+.source 4 src
+.dest 4 dst
+.floatparam 4 scalar
+.temp 4 reall
+.temp 4 imagl
+.temp 2 reals
+.temp 2 imags
+.temp 4 realf
+.temp 4 imagf
+.temp 4 sumf
+
+
+
+splitlw reals, imags, src
+convswl reall, reals
+convswl imagl, imags
+convlf realf, reall
+convlf imagf, imagl
+divf realf, realf, scalar
+divf imagf, imagf, scalar
+mulf realf, realf, realf
+mulf imagf, imagf, imagf
+addf sumf, realf, imagf
+sqrtf dst, sumf
--- /dev/null
+.function volk_16u_byteswap_a_orc_impl
+.dest 2 dst
+swapw dst, dst
--- /dev/null
+.function volk_32f_s32f_multiply_32f_a_orc_impl
+.dest 4 dst
+.source 4 src1
+.floatparam 4 scalar
+mulf dst, src1, scalar
--- /dev/null
+.function volk_32f_s32f_normalize_a_orc_impl
+.source 4 src1
+.floatparam 4 invscalar
+.dest 4 dst
+mulf dst, src1, invscalar
--- /dev/null
+.function volk_32f_sqrt_32f_a_orc_impl
+.source 4 src
+.dest 4 dst
+sqrtf dst, src
--- /dev/null
+.function volk_32f_x2_add_32f_a_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+addf dst, src1, src2
--- /dev/null
+.function volk_32f_x2_divide_32f_a_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+divf dst, src1, src2
--- /dev/null
+.function volk_32f_x2_dot_prod_32f_a_orc_impl
+.source 4 src1
+.source 4 src2
+.dest 4 dst
+.accumulator 4 accum
+addf dst, src1, src2
--- /dev/null
+.function volk_32f_x2_max_32f_a_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+maxf dst, src1, src2
--- /dev/null
+.function volk_32f_x2_min_32f_a_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+minf dst, src1, src2
--- /dev/null
+.function volk_32f_x2_multiply_32f_a_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+mulf dst, src1, src2
--- /dev/null
+.function volk_32f_x2_subtract_32f_a_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+subf dst, src1, src2
--- /dev/null
+.function volk_32fc_32f_multiply_32fc_a_orc_impl
+.source 8 src1
+.source 4 src2
+.dest 8 dst
+.temp 8 tmp
+mergelq tmp, src2, src2
+x2 mulf dst, src1, tmp
--- /dev/null
+.function volk_32fc_magnitude_32f_a_orc_impl
+.source 8 src
+.dest 4 dst
+.temp 8 iqf
+.temp 8 prodiqf
+.temp 4 qf
+.temp 4 if
+.temp 4 sumf
+
+x2 mulf prodiqf, src, src
+splitql qf, if, prodiqf
+addf sumf, if, qf
+sqrtf dst, sumf
--- /dev/null
+.function volk_32fc_s32f_magnitude_16i_a_orc_impl
+.source 8 src
+.dest 2 dst
+.floatparam 4 scalar
+.temp 8 iqf
+.temp 8 prodiqf
+.temp 4 qf
+.temp 4 if
+.temp 4 sumf
+.temp 4 rootf
+.temp 4 rootl
+#.temp 4 maskl
+
+x2 mulf prodiqf, src, src
+splitql qf, if, prodiqf
+addf sumf, if, qf
+sqrtf rootf, sumf
+mulf rootf, rootf, scalar
+#cmpltf maskl, 32768.0, rootf
+#andl maskl, maskl, 0x80000000
+#orl rootf, rootf, maskl
+convfl rootl, rootf
+convsuslw dst, rootl
--- /dev/null
+.function volk_32fc_s32fc_multiply_32fc_a_orc_impl
+.source 8 src1
+.floatparam 8 scalar
+.dest 8 dst
+.temp 8 iqprod
+.temp 4 real
+.temp 4 imag
+.temp 4 ac
+.temp 4 bd
+.temp 8 swapped
+x2 mulf iqprod, src1, scalar
+splitql bd, ac, iqprod
+subf real, ac, bd
+swaplq swapped, src1
+x2 mulf iqprod, swapped, scalar
+splitql bd, ac, iqprod
+addf imag, ac, bd
+mergelq dst, real, imag
--- /dev/null
+.function volk_32fc_x2_multiply_32fc_a_orc_impl
+.source 8 src1
+.source 8 src2
+.dest 8 dst
+.temp 8 iqprod
+.temp 4 real
+.temp 4 imag
+.temp 4 ac
+.temp 4 bd
+.temp 8 swapped
+x2 mulf iqprod, src1, src2
+splitql bd, ac, iqprod
+subf real, ac, bd
+swaplq swapped, src1
+x2 mulf iqprod, swapped, src2
+splitql bd, ac, iqprod
+addf imag, ac, bd
+mergelq dst, real, imag
--- /dev/null
+.function volk_32i_x2_and_32i_a_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+andl dst, src1, src2
--- /dev/null
+.function volk_32i_x2_or_32i_a_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+orl dst, src1, src2
--- /dev/null
+.function volk_8i_convert_16i_a_orc_impl
+.source 1 src
+.dest 2 dst
+.temp 2 tmp
+convsbw tmp, src
+shlw dst, tmp, 8
--- /dev/null
+.function volk_8i_s32f_convert_32f_a_orc_impl
+.source 1 src
+.dest 4 dst
+.floatparam 4 scalar
+.temp 4 flsrc
+.temp 4 lsrc
+.temp 2 ssrc
+convsbw ssrc, src
+convswl lsrc, ssrc
+convlf flsrc, lsrc
+mulf dst, flsrc, scalar
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_16i_32fc_dot_prod_32fc
+ *
+ * \b Overview
+ *
+ * This block computes the dot product (or inner product) between two
+ * vectors, the \p input and \p taps vectors. Given a set of \p
+ * num_points taps, the result is the sum of products between the two
+ * vectors. The result is a single value stored in the \p result
+ * address and will be complex.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16i_32fc_dot_prod_32fc(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li input: vector of shorts.
+ * \li taps: complex taps.
+ * \li num_points: number of samples in both \p input and \p taps.
+ *
+ * \b Outputs
+ * \li result: pointer to a complex value to hold the dot product result.
+ *
+ * \b Example
+ * \code
+ *   int N = 10000;
+ *   unsigned int alignment = volk_get_alignment();
+ *   short* input = (short*)volk_malloc(sizeof(short)*N, alignment);
+ *   lv_32fc_t* taps = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ *   lv_32fc_t result;
+ *
+ *   // ... fill input and taps here ...
+ *
+ *   volk_16i_32fc_dot_prod_32fc(&result, input, taps, N);
+ *
+ *   volk_free(taps);
+ *   volk_free(input);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H
+#define INCLUDED_volk_16i_32fc_dot_prod_32fc_H
+
+#include <volk/volk_common.h>
+#include <stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) {
+
+ static const int N_UNROLL = 4;
+
+ lv_32fc_t acc0 = 0;
+ lv_32fc_t acc1 = 0;
+ lv_32fc_t acc2 = 0;
+ lv_32fc_t acc3 = 0;
+
+ unsigned i = 0;
+ unsigned n = (num_points / N_UNROLL) * N_UNROLL;
+
+ for(i = 0; i < n; i += N_UNROLL) {
+ acc0 += taps[i + 0] * (float)input[i + 0];
+ acc1 += taps[i + 1] * (float)input[i + 1];
+ acc2 += taps[i + 2] * (float)input[i + 2];
+ acc3 += taps[i + 3] * (float)input[i + 3];
+ }
+
+ for(; i < num_points; i++) {
+ acc0 += taps[i] * (float)input[i];
+ }
+
+ *result = acc0 + acc1 + acc2 + acc3;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) {
+
+ unsigned ii;
+ unsigned quarter_points = num_points / 4;
+ lv_32fc_t* tapsPtr = (lv_32fc_t*) taps;
+ short* inputPtr = (short*) input;
+ lv_32fc_t accumulator_vec[4];
+
+ float32x4x2_t tapsVal, accumulator_val;
+ int16x4_t input16;
+ int32x4_t input32;
+ float32x4_t input_float, prod_re, prod_im;
+
+ accumulator_val.val[0] = vdupq_n_f32(0.0);
+ accumulator_val.val[1] = vdupq_n_f32(0.0);
+
+ for(ii = 0; ii < quarter_points; ++ii) {
+ tapsVal = vld2q_f32((float*)tapsPtr);
+ input16 = vld1_s16(inputPtr);
+ // widen 16-bit int to 32-bit int
+ input32 = vmovl_s16(input16);
+ // convert 32-bit int to float
+ input_float = vcvtq_f32_s32(input32);
+
+ prod_re = vmulq_f32(input_float, tapsVal.val[0]);
+ prod_im = vmulq_f32(input_float, tapsVal.val[1]);
+
+ accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]);
+ accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]);
+
+ tapsPtr += 4;
+ inputPtr += 4;
+ }
+ vst2q_f32((float*)accumulator_vec, accumulator_val);
+ accumulator_vec[0] += accumulator_vec[1];
+ accumulator_vec[2] += accumulator_vec[3];
+ accumulator_vec[0] += accumulator_vec[2];
+
+ for(ii = quarter_points * 4; ii < num_points; ++ii) {
+ accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++));
+ }
+
+ *result = accumulator_vec[0];
+}
+
+#endif /*LV_HAVE_NEON*/
+
+#if LV_HAVE_SSE && LV_HAVE_MMX
+
+static inline void volk_16i_32fc_dot_prod_32fc_u_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 8;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const short* aPtr = input;
+ const float* bPtr = (float*)taps;
+
+ __m64 m0, m1;
+ __m128 f0, f1, f2, f3;
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+
+ m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0));
+ m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4));
+ f0 = _mm_cvtpi16_ps(m0);
+ f1 = _mm_cvtpi16_ps(m0);
+ f2 = _mm_cvtpi16_ps(m1);
+ f3 = _mm_cvtpi16_ps(m1);
+
+ a0Val = _mm_unpacklo_ps(f0, f1);
+ a1Val = _mm_unpackhi_ps(f0, f1);
+ a2Val = _mm_unpacklo_ps(f2, f3);
+ a3Val = _mm_unpackhi_ps(f2, f3);
+
+ b0Val = _mm_loadu_ps(bPtr);
+ b1Val = _mm_loadu_ps(bPtr+4);
+ b2Val = _mm_loadu_ps(bPtr+8);
+ b3Val = _mm_loadu_ps(bPtr+12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 8;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+ _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+
+ number = sixteenthPoints*8;
+ for(;number < num_points; number++){
+ *realpt += ((*aPtr) * (*bPtr++));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
+
+
+
+
+#if LV_HAVE_SSE && LV_HAVE_MMX
+
+
+static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 8;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const short* aPtr = input;
+ const float* bPtr = (float*)taps;
+
+ __m64 m0, m1;
+ __m128 f0, f1, f2, f3;
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+
+ m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0));
+ m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4));
+ f0 = _mm_cvtpi16_ps(m0);
+ f1 = _mm_cvtpi16_ps(m0);
+ f2 = _mm_cvtpi16_ps(m1);
+ f3 = _mm_cvtpi16_ps(m1);
+
+ a0Val = _mm_unpacklo_ps(f0, f1);
+ a1Val = _mm_unpackhi_ps(f0, f1);
+ a2Val = _mm_unpacklo_ps(f2, f3);
+ a3Val = _mm_unpackhi_ps(f2, f3);
+
+ b0Val = _mm_load_ps(bPtr);
+ b1Val = _mm_load_ps(bPtr+4);
+ b2Val = _mm_load_ps(bPtr+8);
+ b3Val = _mm_load_ps(bPtr+12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 8;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+ _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+
+ number = sixteenthPoints*8;
+ for(;number < num_points; number++){
+ *realpt += ((*aPtr) * (*bPtr++));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
+
+
+#endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_16i_branch_4_state_8
+ *
+ * \b Overview
+ *
+ * <FIXME>
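+ *
+ * For reference, the generic implementation below computes, for i = 0..3 and
+ * j = 0..7:
+ * \code
+ * target[i*8 + j] = src0[((char)permuters[i][j*2]) / 2]
+ *                 + ((i + 1) % 2) * scalars[0]
+ *                 + (((i >> 1) ^ 1) * scalars[1])
+ *                 + (cntl2[i*8 + j] & scalars[2])
+ *                 + (cntl3[i*8 + j] & scalars[3]);
+ * \endcode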
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16i_branch_4_state_8(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars)
+ * \endcode
+ *
+ * \b Inputs
+ * \li src0: <FIXME>
+ * \li permuters: <FIXME>
+ * \li cntl2: <FIXME>
+ * \li cntl3: <FIXME>
+ * \li scalars: <FIXME>
+ *
+ * \b Outputs
+ * \li target: <FIXME>
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ *
+ * volk_16i_branch_4_state_8();
+ *
+ * volk_free(x);
+ * volk_free(t);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_16i_branch_4_state_8_a_H
+#define INCLUDED_volk_16i_branch_4_state_8_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSSE3
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+static inline void
+volk_16i_branch_4_state_8_a_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars)
+{
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11;
+ __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars;
+
+ p_target = (__m128i*)target;
+ p_src0 = (__m128i*)src0;
+ p_cntl2 = (__m128i*)cntl2;
+ p_cntl3 = (__m128i*)cntl3;
+ p_scalars = (__m128i*)scalars;
+
+ int i = 0;
+
+ int bound = 1;
+
+ xmm0 = _mm_load_si128(p_scalars);
+
+ xmm1 = _mm_shufflelo_epi16(xmm0, 0);
+ xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
+ xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
+ xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
+
+ xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
+ xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
+ xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
+ xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
+
+ xmm0 = _mm_load_si128((__m128i*)permuters[0]);
+ xmm6 = _mm_load_si128((__m128i*)permuters[1]);
+ xmm8 = _mm_load_si128((__m128i*)permuters[2]);
+ xmm10 = _mm_load_si128((__m128i*)permuters[3]);
+
+ for(; i < bound; ++i) {
+ xmm5 = _mm_load_si128(p_src0);
+ xmm0 = _mm_shuffle_epi8(xmm5, xmm0);
+ xmm6 = _mm_shuffle_epi8(xmm5, xmm6);
+ xmm8 = _mm_shuffle_epi8(xmm5, xmm8);
+ xmm10 = _mm_shuffle_epi8(xmm5, xmm10);
+
+ p_src0 += 4;
+
+ xmm5 = _mm_add_epi16(xmm1, xmm2);
+
+ xmm6 = _mm_add_epi16(xmm2, xmm6);
+ xmm8 = _mm_add_epi16(xmm1, xmm8);
+
+ xmm7 = _mm_load_si128(p_cntl2);
+ xmm9 = _mm_load_si128(p_cntl3);
+
+ xmm0 = _mm_add_epi16(xmm5, xmm0);
+
+ xmm7 = _mm_and_si128(xmm7, xmm3);
+ xmm9 = _mm_and_si128(xmm9, xmm4);
+
+ xmm5 = _mm_load_si128(&p_cntl2[1]);
+ xmm11 = _mm_load_si128(&p_cntl3[1]);
+
+ xmm7 = _mm_add_epi16(xmm7, xmm9);
+
+ xmm5 = _mm_and_si128(xmm5, xmm3);
+ xmm11 = _mm_and_si128(xmm11, xmm4);
+
+ xmm0 = _mm_add_epi16(xmm0, xmm7);
+
+
+ xmm7 = _mm_load_si128(&p_cntl2[2]);
+ xmm9 = _mm_load_si128(&p_cntl3[2]);
+
+ xmm5 = _mm_add_epi16(xmm5, xmm11);
+
+ xmm7 = _mm_and_si128(xmm7, xmm3);
+ xmm9 = _mm_and_si128(xmm9, xmm4);
+
+ xmm6 = _mm_add_epi16(xmm6, xmm5);
+
+
+ xmm5 = _mm_load_si128(&p_cntl2[3]);
+ xmm11 = _mm_load_si128(&p_cntl3[3]);
+
+ xmm7 = _mm_add_epi16(xmm7, xmm9);
+
+ xmm5 = _mm_and_si128(xmm5, xmm3);
+ xmm11 = _mm_and_si128(xmm11, xmm4);
+
+ xmm8 = _mm_add_epi16(xmm8, xmm7);
+
+ xmm5 = _mm_add_epi16(xmm5, xmm11);
+
+ _mm_store_si128(p_target, xmm0);
+ _mm_store_si128(&p_target[1], xmm6);
+
+ xmm10 = _mm_add_epi16(xmm5, xmm10);
+
+ _mm_store_si128(&p_target[2], xmm8);
+
+ _mm_store_si128(&p_target[3], xmm10);
+
+ p_target += 3;
+ }
+}
+
+
+#endif /*LV_HAVE_SSSE3*/
+
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_16i_branch_4_state_8_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars)
+{
+ int i = 0;
+
+ int bound = 4;
+
+ for(; i < bound; ++i) {
+ target[i* 8] = src0[((char)permuters[i][0])/2]
+ + ((i + 1)%2 * scalars[0])
+ + (((i >> 1)^1) * scalars[1])
+ + (cntl2[i * 8] & scalars[2])
+ + (cntl3[i * 8] & scalars[3]);
+ target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2]
+ + ((i + 1)%2 * scalars[0])
+ + (((i >> 1)^1) * scalars[1])
+ + (cntl2[i * 8 + 1] & scalars[2])
+ + (cntl3[i * 8 + 1] & scalars[3]);
+ target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2]
+ + ((i + 1)%2 * scalars[0])
+ + (((i >> 1)^1) * scalars[1])
+ + (cntl2[i * 8 + 2] & scalars[2])
+ + (cntl3[i * 8 + 2] & scalars[3]);
+ target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2]
+ + ((i + 1)%2 * scalars[0])
+ + (((i >> 1)^1) * scalars[1])
+ + (cntl2[i * 8 + 3] & scalars[2])
+ + (cntl3[i * 8 + 3] & scalars[3]);
+ target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2]
+ + ((i + 1)%2 * scalars[0])
+ + (((i >> 1)^1) * scalars[1])
+ + (cntl2[i * 8 + 4] & scalars[2])
+ + (cntl3[i * 8 + 4] & scalars[3]);
+ target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2]
+ + ((i + 1)%2 * scalars[0])
+ + (((i >> 1)^1) * scalars[1])
+ + (cntl2[i * 8 + 5] & scalars[2])
+ + (cntl3[i * 8 + 5] & scalars[3]);
+ target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2]
+ + ((i + 1)%2 * scalars[0])
+ + (((i >> 1)^1) * scalars[1])
+ + (cntl2[i * 8 + 6] & scalars[2])
+ + (cntl3[i * 8 + 6] & scalars[3]);
+ target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2]
+ + ((i + 1)%2 * scalars[0])
+ + (((i >> 1)^1) * scalars[1])
+ + (cntl2[i * 8 + 7] & scalars[2])
+ + (cntl3[i * 8 + 7] & scalars[3]);
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_16i_branch_4_state_8_a_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_16i_convert_8i
+ *
+ * \b Overview
+ *
+ * Converts 16-bit shorts to 8-bit chars.
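+ * Each output sample keeps the most significant byte of the corresponding
+ * input sample:
+ * \code
+ * outputVector[n] = (int8_t)(inputVector[n] >> 8);
+ * \endcode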
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16i_convert_8i(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The input vector of 16-bit shorts.
+ * \li num_points: The number of data points to convert.
+ *
+ * \b Outputs
+ * \li outputVector: The output vector of 8-bit chars.
+ *
+ * \b Example
+ * \code
+ * // minimal usage sketch; buffer names are illustrative
+ * unsigned int N = 10000;
+ * int16_t* in = (int16_t*)volk_malloc(sizeof(int16_t)*N, volk_get_alignment());
+ * int8_t* out = (int8_t*)volk_malloc(sizeof(int8_t)*N, volk_get_alignment());
+ * // ... fill in with data ...
+ * volk_16i_convert_8i(out, in, N);
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_16i_convert_8i_u_H
+#define INCLUDED_volk_16i_convert_8i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void
+volk_16i_convert_8i_u_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ int8_t* outputVectorPtr = outputVector;
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal1;
+ __m128i inputVal2;
+ __m128i ret;
+
+ for(;number < sixteenthPoints; number++){
+
+ // Load the 16 values
+ inputVal1 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8;
+ inputVal2 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8;
+
+ inputVal1 = _mm_srai_epi16(inputVal1, 8);
+ inputVal2 = _mm_srai_epi16(inputVal2, 8);
+
+ ret = _mm_packs_epi16(inputVal1, inputVal2);
+
+ _mm_storeu_si128((__m128i*)outputVectorPtr, ret);
+
+ outputVectorPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for(; number < num_points; number++){
+ outputVector[number] =(int8_t)(inputVector[number] >> 8);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_16i_convert_8i_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
+{
+ int8_t* outputVectorPtr = outputVector;
+ const int16_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16i_convert_8i_u_H */
+#ifndef INCLUDED_volk_16i_convert_8i_a_H
+#define INCLUDED_volk_16i_convert_8i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void
+volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ int8_t* outputVectorPtr = outputVector;
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal1;
+ __m128i inputVal2;
+ __m128i ret;
+
+ for(;number < sixteenthPoints; number++){
+
+ // Load the 16 values
+ inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
+ inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
+
+ inputVal1 = _mm_srai_epi16(inputVal1, 8);
+ inputVal2 = _mm_srai_epi16(inputVal2, 8);
+
+ ret = _mm_packs_epi16(inputVal1, inputVal2);
+
+ _mm_store_si128((__m128i*)outputVectorPtr, ret);
+
+ outputVectorPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for(; number < num_points; number++){
+ outputVector[number] =(int8_t)(inputVector[number] >> 8);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_16i_convert_8i_neon(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
+{
+ int8_t* outputVectorPtr = outputVector;
+ const int16_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ unsigned int sixteenth_points = num_points / 16;
+
+ int16x8_t inputVal0;
+ int16x8_t inputVal1;
+ int8x8_t outputVal0;
+ int8x8_t outputVal1;
+ int8x16_t outputVal;
+
+ for(number = 0; number < sixteenth_points; number++){
+ // load two input vectors
+ inputVal0 = vld1q_s16(inputVectorPtr);
+ inputVal1 = vld1q_s16(inputVectorPtr+8);
+ // shift right
+ outputVal0 = vshrn_n_s16(inputVal0, 8);
+ outputVal1 = vshrn_n_s16(inputVal1, 8);
+ // squash two vectors and write output
+ outputVal = vcombine_s8(outputVal0, outputVal1);
+ vst1q_s8(outputVectorPtr, outputVal);
+ inputVectorPtr += 16;
+ outputVectorPtr += 16;
+ }
+
+ for(number = sixteenth_points * 16; number < num_points; number++){
+ *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_16i_convert_8i_a_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
+{
+ int8_t* outputVectorPtr = outputVector;
+ const int16_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_16i_convert_8i_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_16i_max_star_16i
+ *
+ * \b Overview
+ *
+ * Reduces the input vector \p src0 to a single value: the largest short it
+ * contains (the max* operation implemented here is a plain maximum). The
+ * result is written to \p target[0].
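+ *
+ * Equivalently (this mirrors the generic implementation below):
+ * \code
+ * short m = src0[0];
+ * unsigned int n;
+ * for (n = 1; n < num_points; n++)
+ *     m = ((short)(m - src0[n]) > 0) ? m : src0[n];
+ * target[0] = m;
+ * \endcode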
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16i_max_star_16i(short* target, short* src0, unsigned int num_points);
+ * \endcode
+ *
+ * \b Inputs
+ * \li src0: The input vector.
+ * \li num_points: The number of shorts in \p src0.
+ *
+ * \b Outputs
+ * \li target: The output value of the max* operation.
+ *
+ * \b Example
+ * \code
+ * // minimal usage sketch; buffer names are illustrative
+ * unsigned int N = 10000;
+ * short* in = (short*)volk_malloc(sizeof(short)*N, volk_get_alignment());
+ * short max_value;
+ * // ... fill in with data ...
+ * volk_16i_max_star_16i(&max_value, in, N);
+ * volk_free(in);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_16i_max_star_16i_a_H
+#define INCLUDED_volk_16i_max_star_16i_a_H
+
+#include<inttypes.h>
+#include<stdio.h>
+
+#ifdef LV_HAVE_SSSE3
+
+#include<xmmintrin.h>
+#include<emmintrin.h>
+#include<tmmintrin.h>
+
+static inline void
+volk_16i_max_star_16i_a_ssse3(short* target, short* src0, unsigned int num_points)
+{
+ const unsigned int num_bytes = num_points*2;
+
+ short candidate = src0[0];
+ short cands[8];
+ __m128i xmm0, xmm1, xmm3, xmm4, xmm5, xmm6;
+
+ __m128i *p_src0;
+
+ p_src0 = (__m128i*)src0;
+
+ int bound = num_bytes >> 4;
+ int leftovers = (num_bytes >> 1) & 7;
+
+ int i = 0;
+
+ xmm1 = _mm_setzero_si128();
+ xmm0 = _mm_setzero_si128();
+ //_mm_insert_epi16(xmm0, candidate, 0);
+
+ xmm0 = _mm_shuffle_epi8(xmm0, xmm1);
+
+ for(i = 0; i < bound; ++i) {
+ xmm1 = _mm_load_si128(p_src0);
+ p_src0 += 1;
+ //xmm2 = _mm_sub_epi16(xmm1, xmm0);
+
+ xmm3 = _mm_cmpgt_epi16(xmm0, xmm1);
+ xmm4 = _mm_cmpeq_epi16(xmm0, xmm1);
+ xmm5 = _mm_cmpgt_epi16(xmm1, xmm0);
+
+ xmm6 = _mm_xor_si128(xmm4, xmm5);
+
+ xmm3 = _mm_and_si128(xmm3, xmm0);
+ xmm4 = _mm_and_si128(xmm6, xmm1);
+
+ xmm0 = _mm_add_epi16(xmm3, xmm4);
+ }
+
+ _mm_store_si128((__m128i*)cands, xmm0);
+
+ for(i = 0; i < 8; ++i) {
+ candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i];
+ }
+
+ for(i = 0; i < leftovers; ++i) {
+ candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i];
+ }
+
+ target[0] = candidate;
+}
+
+#endif /*LV_HAVE_SSSE3*/
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_16i_max_star_16i_neon(short* target, short* src0, unsigned int num_points)
+{
+ const unsigned int eighth_points = num_points / 8;
+ unsigned number;
+ int16x8_t input_vec;
+ int16x8_t diff, zeros;
+ uint16x8_t comp1, comp2;
+ zeros = vdupq_n_s16(0);
+
+ int16x8x2_t tmpvec;
+
+ int16x8_t candidate_vec = vld1q_dup_s16(src0);
+ // per-lane maxima are collected here and reduced after the vector loop
+ short candidates[8];
+ short candidate;
+
+ for(number=0; number < eighth_points; ++number) {
+ input_vec = vld1q_s16(src0);
+ __VOLK_PREFETCH(src0+16);
+ diff = vsubq_s16(candidate_vec, input_vec);
+ comp1 = vcgeq_s16(diff, zeros);
+ comp2 = vcltq_s16(diff, zeros);
+
+ tmpvec.val[0] = vandq_s16(candidate_vec, (int16x8_t)comp1);
+ tmpvec.val[1] = vandq_s16(input_vec, (int16x8_t)comp2);
+
+ candidate_vec = vaddq_s16(tmpvec.val[0], tmpvec.val[1]);
+ src0 += 8;
+ }
+ // reduce the eight per-lane maxima to a single candidate
+ vst1q_s16(candidates, candidate_vec);
+ candidate = candidates[0];
+ for(number = 1; number < 8; ++number) {
+ candidate = ((int16_t)(candidate - candidates[number]) > 0) ? candidate : candidates[number];
+ }
+
+ for(number=0; number < num_points%8; number++) {
+ candidate = ((int16_t)(candidate - src0[number]) > 0) ? candidate : src0[number];
+ }
+ target[0] = candidate;
+}
+#endif /*LV_HAVE_NEON*/
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_16i_max_star_16i_generic(short* target, short* src0, unsigned int num_points)
+{
+ const unsigned int num_bytes = num_points*2;
+
+ int i = 0;
+
+ int bound = num_bytes >> 1;
+
+ short candidate = src0[0];
+ for(i = 1; i < bound; ++i) {
+ candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i];
+ }
+ target[0] = candidate;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_16i_max_star_16i_a_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_16i_max_star_horizontal_16i
+ *
+ * \b Overview
+ *
+ * Performs the pairwise max* operation on \p src0: each adjacent pair of
+ * shorts is reduced to the larger of the two, so the output vector is half
+ * as long as the input.
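+ *
+ * Equivalently (mirroring the generic implementation below), for each pair n:
+ * \code
+ * target[n] = ((int16_t)(src0[2*n] - src0[2*n+1]) > 0) ? src0[2*n] : src0[2*n+1];
+ * \endcode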
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16i_max_star_horizontal_16i(short* target, short* src0, unsigned int num_points);
+ * \endcode
+ *
+ * \b Inputs
+ * \li src0: The input vector.
+ * \li num_points: The number of shorts in \p src0.
+ *
+ * \b Outputs
+ * \li target: The output value of the max* operation.
+ *
+ * \b Example
+ * \code
+ * // minimal usage sketch; buffer names are illustrative
+ * unsigned int N = 10000;
+ * short* in = (short*)volk_malloc(sizeof(short)*N, volk_get_alignment());
+ * short* out = (short*)volk_malloc(sizeof(short)*(N/2), volk_get_alignment());
+ * // ... fill in with data ...
+ * volk_16i_max_star_horizontal_16i(out, in, N);
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a_H
+#define INCLUDED_volk_16i_max_star_horizontal_16i_a_H
+
+#include <volk/volk_common.h>
+
+#include<inttypes.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_SSSE3
+
+#include<xmmintrin.h>
+#include<emmintrin.h>
+#include<tmmintrin.h>
+
+static inline void
+volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, int16_t* src0, unsigned int num_points)
+{
+ const unsigned int num_bytes = num_points*2;
+
+ static const uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+ static const uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00,
+ 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d};
+ static const uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+ static const uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02,
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02};
+
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
+ __m128i xmm5, xmm6, xmm7, xmm8;
+
+ xmm4 = _mm_load_si128((__m128i*)shufmask0);
+ xmm5 = _mm_load_si128((__m128i*)shufmask1);
+ xmm6 = _mm_load_si128((__m128i*)andmask0);
+ xmm7 = _mm_load_si128((__m128i*)andmask1);
+
+ __m128i *p_target, *p_src0;
+
+ p_target = (__m128i*)target;
+ p_src0 = (__m128i*)src0;
+
+ int bound = num_bytes >> 5;
+ int intermediate = (num_bytes >> 4) & 1;
+ int leftovers = (num_bytes >> 1) & 7;
+
+ int i = 0;
+
+ for(i = 0; i < bound; ++i) {
+ xmm0 = _mm_load_si128(p_src0);
+ xmm1 = _mm_load_si128(&p_src0[1]);
+
+ xmm2 = _mm_xor_si128(xmm2, xmm2);
+ p_src0 += 2;
+
+ xmm3 = _mm_hsub_epi16(xmm0, xmm1);
+
+ xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
+
+ xmm8 = _mm_and_si128(xmm2, xmm6);
+ xmm3 = _mm_and_si128(xmm2, xmm7);
+
+
+ xmm8 = _mm_add_epi8(xmm8, xmm4);
+ xmm3 = _mm_add_epi8(xmm3, xmm5);
+
+ xmm0 = _mm_shuffle_epi8(xmm0, xmm8);
+ xmm1 = _mm_shuffle_epi8(xmm1, xmm3);
+
+
+ xmm3 = _mm_add_epi16(xmm0, xmm1);
+
+
+ _mm_store_si128(p_target, xmm3);
+
+ p_target += 1;
+ }
+
+ for(i = 0; i < intermediate; ++i) {
+ xmm0 = _mm_load_si128(p_src0);
+
+ xmm2 = _mm_xor_si128(xmm2, xmm2);
+ p_src0 += 1;
+
+ xmm3 = _mm_hsub_epi16(xmm0, xmm1);
+ xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
+
+ xmm8 = _mm_and_si128(xmm2, xmm6);
+
+ xmm3 = _mm_add_epi8(xmm8, xmm4);
+
+ xmm0 = _mm_shuffle_epi8(xmm0, xmm3);
+
+ _mm_storel_pd((double*)p_target, bit128_p(&xmm0)->double_vec);
+
+ p_target = (__m128i*)((int8_t*)p_target + 8);
+ }
+
+ for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) {
+ target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
+ }
+}
+
+#endif /*LV_HAVE_SSSE3*/
+
+#ifdef LV_HAVE_NEON
+
+#include <arm_neon.h>
+static inline void
+volk_16i_max_star_horizontal_16i_neon(int16_t* target, int16_t* src0, unsigned int num_points)
+{
+ const unsigned int eighth_points = num_points / 16;
+ unsigned number;
+ int16x8x2_t input_vec;
+ int16x8_t diff, max_vec, zeros;
+ uint16x8_t comp1, comp2;
+ zeros = vdupq_n_s16(0);
+ for(number=0; number < eighth_points; ++number) {
+ input_vec = vld2q_s16(src0);
+ //__VOLK_PREFETCH(src0+16);
+ diff = vsubq_s16(input_vec.val[0], input_vec.val[1]);
+ comp1 = vcgeq_s16(diff, zeros);
+ comp2 = vcltq_s16(diff, zeros);
+
+ input_vec.val[0] = vandq_s16(input_vec.val[0], (int16x8_t)comp1);
+ input_vec.val[1] = vandq_s16(input_vec.val[1], (int16x8_t)comp2);
+
+ max_vec = vaddq_s16(input_vec.val[0], input_vec.val[1]);
+ vst1q_s16(target, max_vec);
+ src0 += 16;
+ target += 8;
+ }
+ for(number=0; number < num_points%16; number+=2) {
+ target[number >> 1] = ((int16_t)(src0[number] - src0[number + 1]) > 0) ? src0[number] : src0[number+1];
+ }
+
+}
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_NEON
+extern void volk_16i_max_star_horizontal_16i_neonasm(int16_t* target, int16_t* src0, unsigned int num_points);
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_16i_max_star_horizontal_16i_generic(int16_t* target, int16_t* src0, unsigned int num_points)
+{
+ const unsigned int num_bytes = num_points*2;
+
+ int i = 0;
+
+ int bound = num_bytes >> 1;
+
+ for(i = 0; i < bound; i += 2) {
+ target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1];
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_16i_max_star_horizontal_16i_a_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_16i_permute_and_scalar_add
+ *
+ * \b Overview
+ *
+ * Builds the output by permuting \p src0 according to \p permute_indexes and
+ * adding to each permuted element the four scalars, each masked by its
+ * corresponding control vector.
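+ *
+ * Equivalently (mirroring the generic implementation below), for each index n:
+ * \code
+ * target[n] = src0[permute_indexes[n]]
+ *           + (cntl0[n] & scalars[0]) + (cntl1[n] & scalars[1])
+ *           + (cntl2[n] & scalars[2]) + (cntl3[n] & scalars[3]);
+ * \endcode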
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16i_permute_and_scalar_add(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li src0: The input vector.
+ * \li permute_indexes: For each output element, the index into \p src0 to read from.
+ * \li cntl0: Mask ANDed with \p scalars[0] before the add.
+ * \li cntl1: Mask ANDed with \p scalars[1] before the add.
+ * \li cntl2: Mask ANDed with \p scalars[2] before the add.
+ * \li cntl3: Mask ANDed with \p scalars[3] before the add.
+ * \li scalars: The four scalar values.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li target: The output value.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ *
+ * volk_16i_permute_and_scalar_add();
+ *
+ * volk_free(x);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_16i_permute_and_scalar_add_a_H
+#define INCLUDED_volk_16i_permute_and_scalar_add_a_H
+
+#include<inttypes.h>
+#include<stdio.h>
+
+#ifdef LV_HAVE_SSE2
+
+#include<xmmintrin.h>
+#include<emmintrin.h>
+
+static inline void
+volk_16i_permute_and_scalar_add_a_sse2(short* target, short* src0, short* permute_indexes,
+ short* cntl0, short* cntl1, short* cntl2, short* cntl3,
+ short* scalars, unsigned int num_points)
+{
+
+ const unsigned int num_bytes = num_points*2;
+
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+
+ __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars;
+
+ short* p_permute_indexes = permute_indexes;
+
+ p_target = (__m128i*)target;
+ p_cntl0 = (__m128i*)cntl0;
+ p_cntl1 = (__m128i*)cntl1;
+ p_cntl2 = (__m128i*)cntl2;
+ p_cntl3 = (__m128i*)cntl3;
+ p_scalars = (__m128i*)scalars;
+
+ int i = 0;
+
+ int bound = (num_bytes >> 4);
+ int leftovers = (num_bytes >> 1) & 7;
+
+ xmm0 = _mm_load_si128(p_scalars);
+
+ xmm1 = _mm_shufflelo_epi16(xmm0, 0);
+ xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
+ xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
+ xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
+
+ xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
+ xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
+ xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
+ xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
+
+
+ for(; i < bound; ++i) {
+ xmm0 = _mm_setzero_si128();
+ xmm5 = _mm_setzero_si128();
+ xmm6 = _mm_setzero_si128();
+ xmm7 = _mm_setzero_si128();
+
+ xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0);
+ xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1);
+ xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2);
+ xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3);
+ xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4);
+ xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5);
+ xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6);
+ xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7);
+
+ xmm0 = _mm_add_epi16(xmm0, xmm5);
+ xmm6 = _mm_add_epi16(xmm6, xmm7);
+
+ p_permute_indexes += 8;
+
+ xmm0 = _mm_add_epi16(xmm0, xmm6);
+
+ xmm5 = _mm_load_si128(p_cntl0);
+ xmm6 = _mm_load_si128(p_cntl1);
+ xmm7 = _mm_load_si128(p_cntl2);
+
+ xmm5 = _mm_and_si128(xmm5, xmm1);
+ xmm6 = _mm_and_si128(xmm6, xmm2);
+ xmm7 = _mm_and_si128(xmm7, xmm3);
+
+ xmm0 = _mm_add_epi16(xmm0, xmm5);
+
+ xmm5 = _mm_load_si128(p_cntl3);
+
+ xmm6 = _mm_add_epi16(xmm6, xmm7);
+
+ p_cntl0 += 1;
+
+ xmm5 = _mm_and_si128(xmm5, xmm4);
+
+ xmm0 = _mm_add_epi16(xmm0, xmm6);
+
+ p_cntl1 += 1;
+ p_cntl2 += 1;
+
+ xmm0 = _mm_add_epi16(xmm0, xmm5);
+
+ p_cntl3 += 1;
+
+ _mm_store_si128(p_target, xmm0);
+
+ p_target += 1;
+ }
+
+ for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+ target[i] = src0[permute_indexes[i]]
+ + (cntl0[i] & scalars[0])
+ + (cntl1[i] & scalars[1])
+ + (cntl2[i] & scalars[2])
+ + (cntl3[i] & scalars[3]);
+ }
+}
+#endif /*LV_HAVE_SSE2*/
+
+
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_16i_permute_and_scalar_add_generic(short* target, short* src0, short* permute_indexes,
+ short* cntl0, short* cntl1, short* cntl2, short* cntl3,
+ short* scalars, unsigned int num_points)
+{
+ const unsigned int num_bytes = num_points*2;
+
+ int i = 0;
+
+ int bound = num_bytes >> 1;
+
+ for(i = 0; i < bound; ++i) {
+ target[i] = src0[permute_indexes[i]]
+ + (cntl0[i] & scalars[0])
+ + (cntl1[i] & scalars[1])
+ + (cntl2[i] & scalars[2])
+ + (cntl3[i] & scalars[3]);
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_16i_permute_and_scalar_add_a_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_16i_s32f_convert_32f
+ *
+ * \b Overview
+ *
+ * Converts 16-bit shorts to scaled 32-bit floating point values.
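+ * Each output value is the corresponding input sample divided by \p scalar:
+ * \code
+ * outputVector[n] = (float)inputVector[n] / scalar;
+ * \endcode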
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16i_s32f_convert_32f(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points);
+ * \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The input vector of 16-bit shorts.
+ * \li scalar: Each converted value is divided by this scalar.
+ * \li num_points: The number of data points to convert.
+ *
+ * \b Outputs
+ * \li outputVector: The output vector of 32-bit floats.
+ *
+ * \b Example
+ * \code
+ * // minimal usage sketch; buffer names and scale value are illustrative
+ * unsigned int N = 10000;
+ * int16_t* in = (int16_t*)volk_malloc(sizeof(int16_t)*N, volk_get_alignment());
+ * float* out = (float*)volk_malloc(sizeof(float)*N, volk_get_alignment());
+ * // ... fill in with data ...
+ * volk_16i_s32f_convert_32f(out, in, 32768.f, N);
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_16i_s32f_convert_32f_u_H
+#define INCLUDED_volk_16i_s32f_convert_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_16i_s32f_convert_32f_u_avx(float* outputVector, const int16_t* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0/scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal, inputVal2;
+ __m128 ret;
+ __m256 output;
+ __m256 dummy = _mm256_setzero_ps();
+
+ for(;number < eighthPoints; number++){
+
+ // Load the 8 values
+ //inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+ // Shift the input data to the right by 64 bits ( 8 bytes )
+ inputVal2 = _mm_srli_si128(inputVal, 8);
+
+ // Convert the lower 4 values into 32 bit words
+ inputVal = _mm_cvtepi16_epi32(inputVal);
+ inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+
+ ret = _mm_cvtepi32_ps(inputVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ output = _mm256_insertf128_ps(dummy, ret, 0);
+
+ ret = _mm_cvtepi32_ps(inputVal2);
+ ret = _mm_mul_ps(ret, invScalar);
+ output = _mm256_insertf128_ps(output, ret, 1);
+
+ _mm256_storeu_ps(outputVectorPtr, output);
+
+ outputVectorPtr += 8;
+
+ inputPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ outputVector[number] =((float)(inputVector[number])) / scalar;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const int16_t* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0/scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal;
+ __m128i inputVal2;
+ __m128 ret;
+
+ for(;number < eighthPoints; number++){
+
+ // Load the 8 values
+ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+ // Shift the input data to the right by 64 bits ( 8 bytes )
+ inputVal2 = _mm_srli_si128(inputVal, 8);
+
+ // Convert the lower 4 values into 32 bit words
+ inputVal = _mm_cvtepi16_epi32(inputVal);
+ inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+
+ ret = _mm_cvtepi32_ps(inputVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ ret = _mm_cvtepi32_ps(inputVal2);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+
+ outputVectorPtr += 4;
+
+ inputPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ outputVector[number] =((float)(inputVector[number])) / scalar;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_16i_s32f_convert_32f_u_sse(float* outputVector, const int16_t* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0/scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128 ret;
+
+ for(;number < quarterPoints; number++){
+ ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));
+
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+
+ inputPtr += 4;
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ outputVector[number] = (float)(inputVector[number]) / scalar;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_16i_s32f_convert_32f_generic(float* outputVector, const int16_t* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ float* outputVectorPtr = outputVector;
+ const int16_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_16i_s32f_convert_32f_neon(float* outputVector, const int16_t* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ float* outputPtr = outputVector;
+ const int16_t* inputPtr = inputVector;
+ unsigned int number = 0;
+ unsigned int eighth_points = num_points / 8;
+
+ int16x4x2_t input16;
+ int32x4_t input32_0, input32_1;
+ float32x4_t input_float_0, input_float_1;
+ float32x4x2_t output_float;
+ float32x4_t inv_scale;
+
+ inv_scale = vdupq_n_f32(1.0/scalar);
+
+ // the generic disassembles to a 128-bit load
+ // and duplicates every instruction to operate on 64-bits
+ // at a time. This is only possible with lanes, which is faster
+ // than just doing a vld1_s16, but still slower.
+ for(number = 0; number < eighth_points; number++){
+ input16 = vld2_s16(inputPtr);
+ // widen 16-bit int to 32-bit int
+ input32_0 = vmovl_s16(input16.val[0]);
+ input32_1 = vmovl_s16(input16.val[1]);
+ // convert 32-bit int to float with scale
+ input_float_0 = vcvtq_f32_s32(input32_0);
+ input_float_1 = vcvtq_f32_s32(input32_1);
+ output_float.val[0] = vmulq_f32(input_float_0, inv_scale);
+ output_float.val[1] = vmulq_f32(input_float_1, inv_scale);
+ vst2q_f32(outputPtr, output_float);
+ inputPtr += 8;
+ outputPtr += 8;
+ }
+
+ for(number = eighth_points*8; number < num_points; number++){
+ *outputPtr++ = ((float)(*inputPtr++)) / scalar;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#endif /* INCLUDED_volk_16i_s32f_convert_32f_u_H */
+#ifndef INCLUDED_volk_16i_s32f_convert_32f_a_H
+#define INCLUDED_volk_16i_s32f_convert_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_16i_s32f_convert_32f_a_avx(float* outputVector, const int16_t* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0/scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal, inputVal2;
+ __m128 ret;
+ __m256 output;
+ __m256 dummy = _mm256_setzero_ps();
+
+ for(;number < eighthPoints; number++){
+
+ // Load the 8 values
+ //inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+ inputVal = _mm_load_si128((__m128i*)inputPtr);
+
+ // Shift the input data to the right by 64 bits ( 8 bytes )
+ inputVal2 = _mm_srli_si128(inputVal, 8);
+
+ // Convert the lower 4 values into 32 bit words
+ inputVal = _mm_cvtepi16_epi32(inputVal);
+ inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+
+ ret = _mm_cvtepi32_ps(inputVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ output = _mm256_insertf128_ps(dummy, ret, 0);
+
+ ret = _mm_cvtepi32_ps(inputVal2);
+ ret = _mm_mul_ps(ret, invScalar);
+ output = _mm256_insertf128_ps(output, ret, 1);
+
+ _mm256_store_ps(outputVectorPtr, output);
+
+ outputVectorPtr += 8;
+
+ inputPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ outputVector[number] =((float)(inputVector[number])) / scalar;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0/scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal;
+ __m128i inputVal2;
+ __m128 ret;
+
+ for(;number < eighthPoints; number++){
+
+ // Load the 8 values
+ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+ // Shift the input data to the right by 64 bits ( 8 bytes )
+ inputVal2 = _mm_srli_si128(inputVal, 8);
+
+ // Convert the lower 4 values into 32 bit words
+ inputVal = _mm_cvtepi16_epi32(inputVal);
+ inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+
+ ret = _mm_cvtepi32_ps(inputVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ ret = _mm_cvtepi32_ps(inputVal2);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+
+ outputVectorPtr += 4;
+
+ inputPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ outputVector[number] =((float)(inputVector[number])) / scalar;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0/scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128 ret;
+
+ for(;number < quarterPoints; number++){
+ ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));
+
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+
+ inputPtr += 4;
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ outputVector[number] = (float)(inputVector[number]) / scalar;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ float* outputVectorPtr = outputVector;
+ const int16_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_16i_s32f_convert_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_16i_x4_quad_max_star_16i
+ *
+ * \b Overview
+ *
+ * For each element index, takes the larger of \p src0 and \p src1 and the
+ * larger of \p src2 and \p src3, then writes the larger of those two
+ * intermediate results to \p target.
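+ *
+ * Equivalently (mirroring the generic implementation below), for each index n:
+ * \code
+ * short m01 = ((short)(src0[n] - src1[n]) > 0) ? src0[n] : src1[n];
+ * short m23 = ((short)(src2[n] - src3[n]) > 0) ? src2[n] : src3[n];
+ * target[n] = ((short)(m01 - m23) > 0) ? m01 : m23;
+ * \endcode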
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16i_x4_quad_max_star_16i(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li src0: The input vector 0.
+ * \li src1: The input vector 1.
+ * \li src2: The input vector 2.
+ * \li src3: The input vector 3.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li target: The output value.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ *
+ * volk_16i_x4_quad_max_star_16i();
+ *
+ * volk_free(x);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
+#define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
+
+#include<inttypes.h>
+#include<stdio.h>
+
+#ifdef LV_HAVE_SSE2
+
+#include<emmintrin.h>
+
+static inline void
+volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* src0, short* src1,
+ short* src2, short* src3, unsigned int num_points)
+{
+ const unsigned int num_bytes = num_points*2;
+
+ int i = 0;
+
+ int bound = (num_bytes >> 4);
+ int bound_copy = bound;
+ int leftovers = (num_bytes >> 1) & 7;
+
+ __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3;
+ p_target = (__m128i*) target;
+ p_src0 = (__m128i*)src0;
+ p_src1 = (__m128i*)src1;
+ p_src2 = (__m128i*)src2;
+ p_src3 = (__m128i*)src3;
+
+ __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
+
+ while(bound_copy > 0) {
+ xmm1 = _mm_load_si128(p_src0);
+ xmm2 = _mm_load_si128(p_src1);
+ xmm3 = _mm_load_si128(p_src2);
+ xmm4 = _mm_load_si128(p_src3);
+
+ xmm5 = _mm_setzero_si128();
+ xmm6 = _mm_setzero_si128();
+ xmm7 = xmm1;
+ xmm8 = xmm3;
+
+ xmm1 = _mm_sub_epi16(xmm2, xmm1);
+
+ xmm3 = _mm_sub_epi16(xmm4, xmm3);
+
+ xmm5 = _mm_cmpgt_epi16(xmm1, xmm5);
+ xmm6 = _mm_cmpgt_epi16(xmm3, xmm6);
+
+ xmm2 = _mm_and_si128(xmm5, xmm2);
+ xmm4 = _mm_and_si128(xmm6, xmm4);
+ xmm5 = _mm_andnot_si128(xmm5, xmm7);
+ xmm6 = _mm_andnot_si128(xmm6, xmm8);
+
+ xmm5 = _mm_add_epi16(xmm2, xmm5);
+ xmm6 = _mm_add_epi16(xmm4, xmm6);
+
+ xmm1 = _mm_xor_si128(xmm1, xmm1);
+ xmm2 = xmm5;
+ xmm5 = _mm_sub_epi16(xmm6, xmm5);
+ p_src0 += 1;
+ bound_copy -= 1;
+
+ xmm1 = _mm_cmpgt_epi16(xmm5, xmm1);
+ p_src1 += 1;
+
+ xmm6 = _mm_and_si128(xmm1, xmm6);
+
+ xmm1 = _mm_andnot_si128(xmm1, xmm2);
+ p_src2 += 1;
+
+ xmm1 = _mm_add_epi16(xmm6, xmm1);
+ p_src3 += 1;
+
+ _mm_store_si128(p_target, xmm1);
+ p_target += 1;
+
+ }
+
+
+ /*asm volatile
+ (
+ "volk_16i_x4_quad_max_star_16i_a_sse2_L1:\n\t"
+ "cmp $0, %[bound]\n\t"
+ "je volk_16i_x4_quad_max_star_16i_a_sse2_END\n\t"
+
+ "movaps (%[src0]), %%xmm1\n\t"
+ "movaps (%[src1]), %%xmm2\n\t"
+ "movaps (%[src2]), %%xmm3\n\t"
+ "movaps (%[src3]), %%xmm4\n\t"
+
+ "pxor %%xmm5, %%xmm5\n\t"
+ "pxor %%xmm6, %%xmm6\n\t"
+ "movaps %%xmm1, %%xmm7\n\t"
+ "movaps %%xmm3, %%xmm8\n\t"
+ "psubw %%xmm2, %%xmm1\n\t"
+ "psubw %%xmm4, %%xmm3\n\t"
+
+ "pcmpgtw %%xmm1, %%xmm5\n\t"
+ "pcmpgtw %%xmm3, %%xmm6\n\t"
+
+ "pand %%xmm5, %%xmm2\n\t"
+ "pand %%xmm6, %%xmm4\n\t"
+ "pandn %%xmm7, %%xmm5\n\t"
+ "pandn %%xmm8, %%xmm6\n\t"
+
+ "paddw %%xmm2, %%xmm5\n\t"
+ "paddw %%xmm4, %%xmm6\n\t"
+
+ "pxor %%xmm1, %%xmm1\n\t"
+ "movaps %%xmm5, %%xmm2\n\t"
+
+ "psubw %%xmm6, %%xmm5\n\t"
+ "add $16, %[src0]\n\t"
+ "add $-1, %[bound]\n\t"
+
+ "pcmpgtw %%xmm5, %%xmm1\n\t"
+ "add $16, %[src1]\n\t"
+
+ "pand %%xmm1, %%xmm6\n\t"
+
+ "pandn %%xmm2, %%xmm1\n\t"
+ "add $16, %[src2]\n\t"
+
+ "paddw %%xmm6, %%xmm1\n\t"
+ "add $16, %[src3]\n\t"
+
+ "movaps %%xmm1, (%[target])\n\t"
+ "addw $16, %[target]\n\t"
+ "jmp volk_16i_x4_quad_max_star_16i_a_sse2_L1\n\t"
+
+ "volk_16i_x4_quad_max_star_16i_a_sse2_END:\n\t"
+ :
+ :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target)
+ :
+ );
+ */
+
+ short temp0 = 0;
+ short temp1 = 0;
+ for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+ temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
+ temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i];
+ target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
+ }
+ return;
+}
+
+#endif /*LV_HAVE_SSE2*/
+
+#ifdef LV_HAVE_NEON
+
+#include <arm_neon.h>
+
+static inline void
+volk_16i_x4_quad_max_star_16i_neon(short* target, short* src0, short* src1,
+ short* src2, short* src3, unsigned int num_points)
+{
+ const unsigned int eighth_points = num_points / 8;
+ unsigned i;
+
+ int16x8_t src0_vec, src1_vec, src2_vec, src3_vec;
+ int16x8_t diff12, diff34;
+ int16x8_t comp0, comp1, comp2, comp3;
+ int16x8_t result1_vec, result2_vec;
+ int16x8_t zeros;
+ zeros = vdupq_n_s16(0);
+ for(i=0; i < eighth_points; ++i) {
+ src0_vec = vld1q_s16(src0);
+ src1_vec = vld1q_s16(src1);
+ src2_vec = vld1q_s16(src2);
+ src3_vec = vld1q_s16(src3);
+ diff12 = vsubq_s16(src0_vec, src1_vec);
+ diff34 = vsubq_s16(src2_vec, src3_vec);
+ comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
+ comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
+ comp2 = (int16x8_t)vcgeq_s16(diff34, zeros);
+ comp3 = (int16x8_t)vcltq_s16(diff34, zeros);
+ comp0 = vandq_s16(src0_vec, comp0);
+ comp1 = vandq_s16(src1_vec, comp1);
+ comp2 = vandq_s16(src2_vec, comp2);
+ comp3 = vandq_s16(src3_vec, comp3);
+
+ result1_vec = vaddq_s16(comp0, comp1);
+ result2_vec = vaddq_s16(comp2, comp3);
+
+ diff12 = vsubq_s16(result1_vec, result2_vec);
+ comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
+ comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
+ comp0 = vandq_s16(result1_vec, comp0);
+ comp1 = vandq_s16(result2_vec, comp1);
+ result1_vec = vaddq_s16(comp0, comp1);
+ vst1q_s16(target, result1_vec);
+ src0 += 8;
+ src1 += 8;
+ src2 += 8;
+ src3 += 8;
+ target += 8;
+ }
+
+ short temp0 = 0;
+ short temp1 = 0;
+ for(i=eighth_points*8; i < num_points; ++i) {
+ temp0 = ((short)(*src0 - *src1) > 0) ? *src0 : *src1;
+ temp1 = ((short)(*src2 - *src3) > 0) ? *src2 : *src3;
+ *target++ = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
+ src0++;
+ src1++;
+ src2++;
+ src3++;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_16i_x4_quad_max_star_16i_generic(short* target, short* src0, short* src1,
+ short* src2, short* src3, unsigned int num_points)
+{
+ const unsigned int num_bytes = num_points*2;
+
+ int i = 0;
+
+ int bound = num_bytes >> 1;
+
+ short temp0 = 0;
+ short temp1 = 0;
+ for(i = 0; i < bound; ++i) {
+ temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
+ temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i];
+ target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_16i_x4_quad_max_star_16i_a_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_16i_x5_add_quad_16i_x4
+ *
+ * \b Overview
+ *
+ * Adds the common input vector \p src0 element-wise to each of \p src1
+ * through \p src4, producing the four output vectors \p target0 through
+ * \p target3.
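+ *
+ * Equivalently (mirroring the generic implementation below), for each index n:
+ * \code
+ * target0[n] = src0[n] + src1[n];
+ * target1[n] = src0[n] + src2[n];
+ * target2[n] = src0[n] + src3[n];
+ * target3[n] = src0[n] + src4[n];
+ * \endcode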
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16i_x5_add_quad_16i_x4(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_points);
+ * \endcode
+ *
+ * \b Inputs
+ * \li src0: The input vector 0.
+ * \li src1: The input vector 1.
+ * \li src2: The input vector 2.
+ * \li src3: The input vector 3.
+ * \li src4: The input vector 4.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li target0: The output value 0.
+ * \li target1: The output value 1.
+ * \li target2: The output value 2.
+ * \li target3: The output value 3.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ *
+ * volk_16i_x5_add_quad_16i_x4();
+ *
+ * volk_free(x);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
+#define INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
+
+#include<inttypes.h>
+#include<stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include<xmmintrin.h>
+#include<emmintrin.h>
+
+static inline void
+volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* target1, short* target2, short* target3,
+ short* src0, short* src1, short* src2, short* src3, short* src4,
+ unsigned int num_points)
+{
+ const unsigned int num_bytes = num_points*2;
+
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
+ __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, *p_src3, *p_src4;
+ p_target0 = (__m128i*)target0;
+ p_target1 = (__m128i*)target1;
+ p_target2 = (__m128i*)target2;
+ p_target3 = (__m128i*)target3;
+
+ p_src0 = (__m128i*)src0;
+ p_src1 = (__m128i*)src1;
+ p_src2 = (__m128i*)src2;
+ p_src3 = (__m128i*)src3;
+ p_src4 = (__m128i*)src4;
+
+ int i = 0;
+
+ int bound = (num_bytes >> 4);
+ int leftovers = (num_bytes >> 1) & 7;
+
+ for(; i < bound; ++i) {
+ xmm0 = _mm_load_si128(p_src0);
+ xmm1 = _mm_load_si128(p_src1);
+ xmm2 = _mm_load_si128(p_src2);
+ xmm3 = _mm_load_si128(p_src3);
+ xmm4 = _mm_load_si128(p_src4);
+
+ p_src0 += 1;
+ p_src1 += 1;
+
+ xmm1 = _mm_add_epi16(xmm0, xmm1);
+ xmm2 = _mm_add_epi16(xmm0, xmm2);
+ xmm3 = _mm_add_epi16(xmm0, xmm3);
+ xmm4 = _mm_add_epi16(xmm0, xmm4);
+
+
+ p_src2 += 1;
+ p_src3 += 1;
+ p_src4 += 1;
+
+ _mm_store_si128(p_target0, xmm1);
+ _mm_store_si128(p_target1, xmm2);
+ _mm_store_si128(p_target2, xmm3);
+ _mm_store_si128(p_target3, xmm4);
+
+ p_target0 += 1;
+ p_target1 += 1;
+ p_target2 += 1;
+ p_target3 += 1;
+ }
+ /*asm volatile
+ (
+ ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1:\n\t"
+ "cmp $0, %[bound]\n\t"
+ "je .%=volk_16i_x5_add_quad_16i_x4_a_sse2_END\n\t"
+ "movaps (%[src0]), %%xmm1\n\t"
+ "movaps (%[src1]), %%xmm2\n\t"
+ "movaps (%[src2]), %%xmm3\n\t"
+ "movaps (%[src3]), %%xmm4\n\t"
+ "movaps (%[src4]), %%xmm5\n\t"
+ "add $16, %[src0]\n\t"
+ "add $16, %[src1]\n\t"
+ "add $16, %[src2]\n\t"
+ "add $16, %[src3]\n\t"
+ "add $16, %[src4]\n\t"
+ "paddw %%xmm1, %%xmm2\n\t"
+ "paddw %%xmm1, %%xmm3\n\t"
+ "paddw %%xmm1, %%xmm4\n\t"
+ "paddw %%xmm1, %%xmm5\n\t"
+ "add $-1, %[bound]\n\t"
+ "movaps %%xmm2, (%[target0])\n\t"
+ "movaps %%xmm3, (%[target1])\n\t"
+ "movaps %%xmm4, (%[target2])\n\t"
+ "movaps %%xmm5, (%[target3])\n\t"
+ "add $16, %[target0]\n\t"
+ "add $16, %[target1]\n\t"
+ "add $16, %[target2]\n\t"
+ "add $16, %[target3]\n\t"
+ "jmp .%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1\n\t"
+ ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_END:\n\t"
+ :
+ :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3)
+ :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ */
+
+ for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+ target0[i] = src0[i] + src1[i];
+ target1[i] = src0[i] + src2[i];
+ target2[i] = src0[i] + src3[i];
+ target3[i] = src0[i] + src4[i];
+ }
+}
+#endif /*LV_HAVE_SSE2*/
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_16i_x5_add_quad_16i_x4_neon(short* target0, short* target1, short* target2, short* target3,
+ short* src0, short* src1, short* src2, short* src3, short* src4,
+ unsigned int num_points)
+{
+ const unsigned int eighth_points = num_points / 8;
+ unsigned int number = 0;
+
+ int16x8_t src0_vec, src1_vec, src2_vec, src3_vec, src4_vec;
+ int16x8_t target0_vec, target1_vec, target2_vec, target3_vec;
+ for(number = 0; number < eighth_points; ++number) {
+ src0_vec = vld1q_s16(src0);
+ src1_vec = vld1q_s16(src1);
+ src2_vec = vld1q_s16(src2);
+ src3_vec = vld1q_s16(src3);
+ src4_vec = vld1q_s16(src4);
+
+ target0_vec = vaddq_s16(src0_vec , src1_vec);
+ target1_vec = vaddq_s16(src0_vec , src2_vec);
+ target2_vec = vaddq_s16(src0_vec , src3_vec);
+ target3_vec = vaddq_s16(src0_vec , src4_vec);
+
+ vst1q_s16(target0, target0_vec);
+ vst1q_s16(target1, target1_vec);
+ vst1q_s16(target2, target2_vec);
+ vst1q_s16(target3, target3_vec);
+ src0 += 8;
+ src1 += 8;
+ src2 += 8;
+ src3 += 8;
+ src4 += 8;
+ target0 += 8;
+ target1 += 8;
+ target2 += 8;
+ target3 += 8;
+ }
+
+ for(number = eighth_points * 8; number < num_points; ++number) {
+ *target0++ = *src0 + *src1++;
+ *target1++ = *src0 + *src2++;
+ *target2++ = *src0 + *src3++;
+ *target3++ = *src0++ + *src4++;
+ }
+}
+
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_16i_x5_add_quad_16i_x4_generic(short* target0, short* target1, short* target2, short* target3,
+ short* src0, short* src1, short* src2, short* src3, short* src4,
+ unsigned int num_points)
+{
+ const unsigned int num_bytes = num_points*2;
+
+ int i = 0;
+
+ int bound = num_bytes >> 1;
+
+ for(i = 0; i < bound; ++i) {
+ target0[i] = src0[i] + src1[i];
+ target1[i] = src0[i] + src2[i];
+ target2[i] = src0[i] + src3[i];
+ target3[i] = src0[i] + src4[i];
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+#endif /*INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2016 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_16ic_convert_32fc
+ *
+ * \b Overview
+ *
+ * Converts a complex vector whose components are 16-bit integers
+ * into a complex vector whose components are 32-bit floats.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16ic_convert_32fc(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The complex 16-bit integer input data buffer.
+ * \li num_points: The number of data values to be converted.
+ *
+ * \b Outputs
+ * \li outputVector: The complex 32-bit float output data buffer.
+ *
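+ * \b Example
+ * A minimal usage sketch; buffer management uses the VOLK helpers volk_get_alignment/volk_malloc/volk_free:
+ * \code
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_16sc_t* in = (lv_16sc_t*)volk_malloc(N*sizeof(lv_16sc_t), alignment);
+ * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(N*sizeof(lv_32fc_t), alignment);
+ *
+ * // ... fill in with complex 16-bit samples ...
+ *
+ * volk_16ic_convert_32fc(out, in, N);
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode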
+ */
+
+
+#ifndef INCLUDED_volk_16ic_convert_32fc_H
+#define INCLUDED_volk_16ic_convert_32fc_H
+
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_16ic_convert_32fc_generic(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
+{
+ unsigned int i;
+ for(i = 0; i < num_points; i++)
+ {
+ outputVector[i] = lv_cmake((float)lv_creal(inputVector[i]), (float)lv_cimag(inputVector[i]));
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 2;
+
+ const lv_16sc_t* _in = inputVector;
+ lv_32fc_t* _out = outputVector;
+ __m128 a;
+ unsigned int i, number;
+
+ for(number = 0; number < sse_iters; number++)
+ {
+ a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // load (2-byte imag, 2-byte real) x 2 into a 128-bit register
+ _mm_store_ps((float*)_out, a);
+ _in += 2;
+ _out += 2;
+ }
+ for (i = 0; i < (num_points % 2); ++i)
+ {
+ *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
+ _in++;
+ }
+}
+
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 2;
+
+ const lv_16sc_t* _in = inputVector;
+ lv_32fc_t* _out = outputVector;
+ __m128 a;
+ unsigned int i, number;
+
+ for(number = 0; number < sse_iters; number++)
+ {
+ a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // load (2-byte imag, 2-byte real) x 2 into a 128-bit register
+ _mm_storeu_ps((float*)_out, a);
+ _in += 2;
+ _out += 2;
+ }
+ for (i = 0; i < (num_points % 2); ++i)
+ {
+ *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
+ _in++;
+ }
+}
+
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_16ic_convert_32fc_u_axv(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 4;
+
+ const lv_16sc_t* _in = inputVector;
+ lv_32fc_t* _out = outputVector;
+ __m256 a;
+ unsigned int i, number;
+
+ for(number = 0; number < sse_iters; number++)
+ {
+ a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // load (2-byte imag, 2-byte real) x 4 into a 256-bit register
+ _mm256_storeu_ps((float*)_out, a);
+ _in += 4;
+ _out += 4;
+ }
+ _mm256_zeroupper();
+ for (i = 0; i < (num_points % 4); ++i)
+ {
+ *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
+ _in++;
+ }
+}
+
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_16ic_convert_32fc_a_axv(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 4;
+
+ const lv_16sc_t* _in = inputVector;
+ lv_32fc_t* _out = outputVector;
+ __m256 a;
+ unsigned int i, number;
+
+ for(number = 0; number < sse_iters; number++)
+ {
+ a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // load (2-byte imag, 2-byte real) x 4 into a 256-bit register
+ _mm256_store_ps((float*)_out, a);
+ _in += 4;
+ _out += 4;
+ }
+ _mm256_zeroupper();
+ for (i = 0; i < (num_points % 4); ++i)
+ {
+ *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
+ _in++;
+ }
+}
+
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 2;
+
+ const lv_16sc_t* _in = inputVector;
+ lv_32fc_t* _out = outputVector;
+
+ int16x4_t a16x4;
+ int32x4_t a32x4;
+ float32x4_t f32x4;
+ unsigned int i, number;
+
+ for(number = 0; number < sse_iters; number++)
+ {
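+ // load two complex int16 samples (4 lanes), widen to int32, convert to float, and store two complex floats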
+ a16x4 = vld1_s16((const int16_t*)_in);
+ __VOLK_PREFETCH(_in + 4);
+ a32x4 = vmovl_s16(a16x4);
+ f32x4 = vcvtq_f32_s32(a32x4);
+ vst1q_f32((float32_t*)_out, f32x4);
+ _in += 2;
+ _out += 2;
+ }
+ for (i = 0; i < (num_points % 2); ++i)
+ {
+ *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
+ _in++;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+#endif /* INCLUDED_volk_16ic_convert_32fc_H */
\ No newline at end of file
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_16ic_deinterleave_16i_x2
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex 16 bit vector into I & Q vector data.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li iBuffer: The I buffer output data.
+ * \li qBuffer: The Q buffer output data.
+ *
+ * \b Example
+ * \code
+ * // minimal sketch: allocate aligned buffers with the VOLK helpers, run the kernel, then free
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_16sc_t* x = (lv_16sc_t*)volk_malloc(N*sizeof(lv_16sc_t), alignment);
+ * int16_t* i_out = (int16_t*)volk_malloc(N*sizeof(int16_t), alignment);
+ * int16_t* q_out = (int16_t*)volk_malloc(N*sizeof(int16_t), alignment);
+ *
+ * volk_16ic_deinterleave_16i_x2(i_out, q_out, x, N);
+ *
+ * volk_free(x);
+ * volk_free(i_out);
+ * volk_free(q_out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_a_H
+#define INCLUDED_volk_16ic_deinterleave_16i_x2_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSSE3
+#include <tmmintrin.h>
+
+static inline void
+volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+
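+ // pshufb control masks: gather the I (even) and Q (odd) 16-bit lanes of each register; a 0x80 control byte writes zero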
+ __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+ __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+ __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2);
+ __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+ __m128i complexVal1, complexVal2, iOutputVal, qOutputVal;
+
+ unsigned int eighthPoints = num_points / 8;
+
+ for(number = 0; number < eighthPoints; number++){
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+
+ iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2));
+ qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2));
+
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+ _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *int16ComplexVectorPtr++;
+ *qBufferPtr++ = *int16ComplexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void
+volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+ __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal;
+ __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
+ __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
+
+ unsigned int eighthPoints = num_points / 8;
+
+ for(number = 0; number < eighthPoints; number++){
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+
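+ // sort the I and Q samples of each register into separate 64-bit halves, then blend halves from both registers to form one vector of 8 I samples and one of 8 Q samples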
+ iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+
+ iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0));
+
+ iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1));
+
+ iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask));
+
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+ qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1));
+
+ qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1));
+
+ qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1));
+
+ qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
+
+ qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
+
+ qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask));
+
+ _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_16ic_deinterleave_16i_x2_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+{
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+ unsigned int number;
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+
+extern void
+volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
+static inline void
+volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+{
+ volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, qBuffer, complexVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+#endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_16ic_deinterleave_real_16i
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex 16 bit vector and returns the real (inphase) part of the signal.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16ic_deinterleave_real_16i(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li iBuffer: The I buffer output data.
+ *
+ * \b Example
+ * \code
+ * // minimal sketch: allocate aligned buffers with the VOLK helpers, run the kernel, then free
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_16sc_t* x = (lv_16sc_t*)volk_malloc(N*sizeof(lv_16sc_t), alignment);
+ * int16_t* i_out = (int16_t*)volk_malloc(N*sizeof(int16_t), alignment);
+ *
+ * volk_16ic_deinterleave_real_16i(i_out, x, N);
+ *
+ * volk_free(x);
+ * volk_free(i_out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_16ic_deinterleave_real_16i_a_H
+#define INCLUDED_volk_16ic_deinterleave_real_16i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSSE3
+#include <tmmintrin.h>
+
+static inline void
+volk_16ic_deinterleave_real_16i_a_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+
+ __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+ __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+ __m128i complexVal1, complexVal2, iOutputVal;
+
+ unsigned int eighthPoints = num_points / 8;
+
+ for(number = 0; number < eighthPoints; number++){
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+
+ complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
+ complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
+
+ iOutputVal = _mm_or_si128(complexVal1, complexVal2);
+
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+ iBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void
+volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ __m128i complexVal1, complexVal2, iOutputVal;
+ __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
+ __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
+
+ unsigned int eighthPoints = num_points / 8;
+
+ for(number = 0; number < eighthPoints; number++){
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+
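+ // gather the four I samples of each register into one 64-bit half, then blend the halves into 8 consecutive I samples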
+ complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+ complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+
+ complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+
+ complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1));
+
+ iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask));
+
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+ iBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_16ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_16ic_deinterleave_real_16i_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_16ic_deinterleave_real_8i
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex 16 bit vector and returns the real
+ * (inphase) part of the signal as an 8-bit value.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16ic_deinterleave_real_8i(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li iBuffer: The I buffer output data with 8-bit precision.
+ *
+ * \b Example
+ * \code
+ * // minimal sketch: allocate aligned buffers with the VOLK helpers, run the kernel, then free
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_16sc_t* x = (lv_16sc_t*)volk_malloc(N*sizeof(lv_16sc_t), alignment);
+ * int8_t* i_out = (int8_t*)volk_malloc(N*sizeof(int8_t), alignment);
+ *
+ * volk_16ic_deinterleave_real_8i(i_out, x, N);
+ *
+ * volk_free(x);
+ * volk_free(i_out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_16ic_deinterleave_real_8i_a_H
+#define INCLUDED_volk_16ic_deinterleave_real_8i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSSE3
+#include <tmmintrin.h>
+
+static inline void
+volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+ __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+ __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
+
+ unsigned int sixteenthPoints = num_points / 16;
+
+ for(number = 0; number < sixteenthPoints; number++){
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+
+ complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+ complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+
+ complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
+ complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
+
+ complexVal1 = _mm_or_si128(complexVal1, complexVal2);
+
+ complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1);
+ complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2);
+
+ complexVal3 = _mm_or_si128(complexVal3, complexVal4);
+
+
+ complexVal1 = _mm_srai_epi16(complexVal1, 8);
+ complexVal3 = _mm_srai_epi16(complexVal3, 8);
+
+ iOutputVal = _mm_packs_epi16(complexVal1, complexVal3);
+
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+ iBufferPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
+ int16ComplexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_16ic_deinterleave_real_8i_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_16ic_deinterleave_real_8i_neon(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+{
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ unsigned int eighth_points = num_points / 8;
+ unsigned int number;
+
+ int16x8x2_t complexInput;
+ int8x8_t realOutput;
+ for(number = 0; number < eighth_points; number++){
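+ // vld2 deinterleaves 8 complex samples; the narrowing shift keeps the top 8 bits of each I sample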
+ complexInput = vld2q_s16(complexVectorPtr);
+ realOutput = vshrn_n_s16(complexInput.val[0], 8);
+ vst1_s8(iBufferPtr, realOutput);
+ complexVectorPtr += 16;
+ iBufferPtr += 8;
+ }
+
+ for(number = eighth_points*8; number < num_points; number++){
+ *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
+ complexVectorPtr++;
+ }
+}
+#endif
+
+#ifdef LV_HAVE_ORC
+
+extern void
+volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
+
+static inline void
+volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+{
+ volk_16ic_deinterleave_real_8i_a_orc_impl(iBuffer, complexVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16ic_deinterleave_real_8i_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_16ic_magnitude_16i
+ *
+ * \b Overview
+ *
+ * Computes the magnitude of the complexVector and stores the results
+ * in the magnitudeVector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16ic_magnitude_16i(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li num_points: The number of samples.
+ *
+ * \b Outputs
+ * \li magnitudeVector: The magnitude of the complex values.
+ *
+ * \b Example
+ * \code
+ * // minimal sketch: allocate aligned buffers with the VOLK helpers, run the kernel, then free
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_16sc_t* x = (lv_16sc_t*)volk_malloc(N*sizeof(lv_16sc_t), alignment);
+ * int16_t* mag = (int16_t*)volk_malloc(N*sizeof(int16_t), alignment);
+ *
+ * volk_16ic_magnitude_16i(mag, x, N);
+ *
+ * volk_free(x);
+ * volk_free(mag);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_16ic_magnitude_16i_a_H
+#define INCLUDED_volk_16ic_magnitude_16i_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+static inline void
+volk_16ic_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
+
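+ // work in floats: scale the int16 samples down to roughly [-1, 1), take the magnitude, then scale back up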
+ __m128 vScalar = _mm_set_ps1(32768.0);
+ __m128 invScalar = _mm_set_ps1(1.0/32768.0);
+
+ __m128 cplxValue1, cplxValue2, result;
+
+ __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
+ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+ for(;number < quarterPoints; number++){
+
+ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+ inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+ inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+ inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+ inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+
+ cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+ cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+
+ complexVectorPtr += 8;
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+ result = _mm_sqrt_ps(result); // Square root the values
+
+ result = _mm_mul_ps(result, vScalar); // Scale the results
+
+ _mm_store_ps(outputFloatBuffer, result);
+ *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
+ *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
+ *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
+ *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ magnitudeVectorPtr = &magnitudeVector[number];
+ complexVectorPtr = (const int16_t*)&complexVector[number];
+ for(; number < num_points; number++){
+ const float val1Real = (float)(*complexVectorPtr++) / 32768.0;
+ const float val1Imag = (float)(*complexVectorPtr++) / 32768.0;
+ const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0;
+ *magnitudeVectorPtr++ = (int16_t)(val1Result);
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_16ic_magnitude_16i_a_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 vScalar = _mm_set_ps1(32768.0);
+ __m128 invScalar = _mm_set_ps1(1.0/32768.0);
+
+ __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+
+ __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+ for(;number < quarterPoints; number++){
+
+ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+ cplxValue1 = _mm_load_ps(inputFloatBuffer);
+ complexVectorPtr += 4;
+
+ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+ cplxValue2 = _mm_load_ps(inputFloatBuffer);
+ complexVectorPtr += 4;
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+ // Arrange in q1q2q3q4 format
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+ iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+ qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+
+ result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+
+ result = _mm_sqrt_ps(result); // Square root the values
+
+ result = _mm_mul_ps(result, vScalar); // Scale the results
+
+ _mm_store_ps(outputFloatBuffer, result);
+ *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
+ *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
+ *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
+ *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ magnitudeVectorPtr = &magnitudeVector[number];
+ complexVectorPtr = (const int16_t*)&complexVector[number];
+ for(; number < num_points; number++){
+ const float val1Real = (float)(*complexVectorPtr++) / 32768.0;
+ const float val1Imag = (float)(*complexVectorPtr++) / 32768.0;
+ const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0;
+ *magnitudeVectorPtr++ = (int16_t)(val1Result);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_16ic_magnitude_16i_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
+{
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
+ unsigned int number = 0;
+ const float scalar = 32768.0;
+ for(number = 0; number < num_points; number++){
+ float real = ((float)(*complexVectorPtr++)) / scalar;
+ float imag = ((float)(*complexVectorPtr++)) / scalar;
+ *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC_DISABLED
+extern void
+volk_16ic_magnitude_16i_a_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points);
+
+static inline void
+volk_16ic_magnitude_16i_u_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
+{
+ volk_16ic_magnitude_16i_a_orc_impl(magnitudeVector, complexVector, 32768.0, num_points);
+}
+#endif /* LV_HAVE_ORC_DISABLED */
+
+
+#endif /* INCLUDED_volk_16ic_magnitude_16i_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_16ic_s32f_deinterleave_32f_x2
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex 16 bit vector into I & Q vector data and
+ * returns the result as two vectors of floats that have been scaled.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector of 16-bit shorts.
+ * \li scalar: The value each sample of the input complex vector is divided by.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li iBuffer: The floating point I buffer output data.
+ * \li qBuffer: The floating point Q buffer output data.
+ *
+ * \b Example
+ * \code
+ * // minimal sketch: a scalar of 32768 maps the full int16 range onto roughly [-1, 1)
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_16sc_t* x = (lv_16sc_t*)volk_malloc(N*sizeof(lv_16sc_t), alignment);
+ * float* i_out = (float*)volk_malloc(N*sizeof(float), alignment);
+ * float* q_out = (float*)volk_malloc(N*sizeof(float), alignment);
+ *
+ * volk_16ic_s32f_deinterleave_32f_x2(i_out, q_out, x, 32768.f, N);
+ *
+ * volk_free(x);
+ * volk_free(i_out);
+ * volk_free(q_out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H
+#define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline
+void volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
+ const float scalar, unsigned int num_points)
+{
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+
+ uint64_t number = 0;
+ const uint64_t quarterPoints = num_points / 4;
+ __m128 cplxValue1, cplxValue2, iValue, qValue;
+
+ __m128 invScalar = _mm_set_ps1(1.0/scalar);
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+ __VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
+
+ for(;number < quarterPoints; number++){
+
+ floatBuffer[0] = (float)(complexVectorPtr[0]);
+ floatBuffer[1] = (float)(complexVectorPtr[1]);
+ floatBuffer[2] = (float)(complexVectorPtr[2]);
+ floatBuffer[3] = (float)(complexVectorPtr[3]);
+
+ floatBuffer[4] = (float)(complexVectorPtr[4]);
+ floatBuffer[5] = (float)(complexVectorPtr[5]);
+ floatBuffer[6] = (float)(complexVectorPtr[6]);
+ floatBuffer[7] = (float)(complexVectorPtr[7]);
+
+ cplxValue1 = _mm_load_ps(&floatBuffer[0]);
+ cplxValue2 = _mm_load_ps(&floatBuffer[4]);
+
+ complexVectorPtr += 8;
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+ // Arrange in q1q2q3q4 format
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+ _mm_store_ps(iBufferPtr, iValue);
+ _mm_store_ps(qBufferPtr, qValue);
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ complexVectorPtr = (int16_t*)&complexVector[number];
+ for(; number < num_points; number++){
+ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_16ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
+ const float scalar, unsigned int num_points)
+{
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+ unsigned int number;
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+static inline void
+volk_16ic_s32f_deinterleave_32f_x2_neon(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
+ const float scalar, unsigned int num_points)
+{
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+ unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+ float iScalar = 1.f/scalar;
+ float32x4_t invScalar;
+ invScalar = vld1q_dup_f32(&iScalar);
+
+ int16x4x2_t complexInput_s16;
+ int32x4x2_t complexInput_s32;
+ float32x4x2_t complexFloat;
+
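+ // vld2 deinterleaves 4 complex samples; widen to int32, convert to float, then apply the reciprocal scale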
+ for(number = 0; number < quarter_points; number++){
+ complexInput_s16 = vld2_s16(complexVectorPtr);
+ complexInput_s32.val[0] = vmovl_s16(complexInput_s16.val[0]);
+ complexInput_s32.val[1] = vmovl_s16(complexInput_s16.val[1]);
+ complexFloat.val[0] = vcvtq_f32_s32(complexInput_s32.val[0]);
+ complexFloat.val[1] = vcvtq_f32_s32(complexInput_s32.val[1]);
+ complexFloat.val[0] = vmulq_f32(complexFloat.val[0], invScalar);
+ complexFloat.val[1] = vmulq_f32(complexFloat.val[1], invScalar);
+ vst1q_f32(iBufferPtr, complexFloat.val[0]);
+ vst1q_f32(qBufferPtr, complexFloat.val[1]);
+ complexVectorPtr += 8;
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ for(number = quarter_points*4; number < num_points; number++){
+ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_ORC
+extern void
+volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
+ const float scalar, unsigned int num_points);
+
+static inline void
+volk_16ic_s32f_deinterleave_32f_x2_u_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
+ const float scalar, unsigned int num_points)
+{
+ volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_16ic_s32f_deinterleave_real_32f
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex 16 bit vector and returns just the real
+ * part (inphase) of the data as a vector of floats that have been
+ * scaled.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16ic_s32f_deinterleave_real_32f(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector of 16-bit shorts.
+ * \li scalar: The value each sample of the input complex vector is divided by.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li iBuffer: The floating point I buffer output data.
+ *
+ * \b Example
+ * \code
+ * // minimal sketch: a scalar of 32768 maps the full int16 range onto roughly [-1, 1)
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_16sc_t* x = (lv_16sc_t*)volk_malloc(N*sizeof(lv_16sc_t), alignment);
+ * float* i_out = (float*)volk_malloc(N*sizeof(float), alignment);
+ *
+ * volk_16ic_s32f_deinterleave_real_32f(i_out, x, 32768.f, N);
+ *
+ * volk_free(x);
+ * volk_free(i_out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H
+#define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_16ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_16sc_t* complexVector,
+ const float scalar, unsigned int num_points)
+{
+ float* iBufferPtr = iBuffer;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 iFloatValue;
+
+ const float iScalar= 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ __m128i complexVal, iIntVal;
+ int8_t* complexVectorPtr = (int8_t*)complexVector;
+
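+ // pshufb mask that packs the four real (I) 16-bit lanes into the low 64 bits; a 0x80 control byte writes zero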
+ __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+
+ for(;number < quarterPoints; number++){
+ complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+ complexVal = _mm_shuffle_epi8(complexVal, moveMask);
+
+ iIntVal = _mm_cvtepi16_epi32(complexVal);
+ iFloatValue = _mm_cvtepi32_ps(iIntVal);
+
+ iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+
+ _mm_store_ps(iBufferPtr, iFloatValue);
+
+ iBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
+ for(; number < num_points; number++){
+ *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
+ sixteenTComplexVectorPtr++;
+ }
+
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_16ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, const lv_16sc_t* complexVector,
+ const float scalar, unsigned int num_points)
+{
+ float* iBufferPtr = iBuffer;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 iValue;
+
+ const float iScalar = 1.0/scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+
+ for(;number < quarterPoints; number++){
+ floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+ floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+ floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+ floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+
+ iValue = _mm_load_ps(floatBuffer);
+
+ iValue = _mm_mul_ps(iValue, invScalar);
+
+ _mm_store_ps(iBufferPtr, iValue);
+
+ iBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ complexVectorPtr = (int16_t*)&complexVector[number];
+ for(; number < num_points; number++){
+ *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar;
+ complexVectorPtr++;
+ }
+
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_16ic_s32f_deinterleave_real_32f_generic(float* iBuffer, const lv_16sc_t* complexVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* iBufferPtr = iBuffer;
+ const float invScalar = 1.0 / scalar;
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_16ic_s32f_magnitude_32f
+ *
+ * \b Overview
+ *
+ * Computes the magnitude of the complexVector and stores the results
+ * in the magnitudeVector as a scaled floating point number.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16ic_s32f_magnitude_32f(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector of complex 16-bit shorts.
+ * \li scalar: The value each sample of the input complex vector is divided by.
+ * \li num_points: The number of samples.
+ *
+ * \b Outputs
+ * \li magnitudeVector: The magnitude of the complex values.
+ *
+ * \b Example
+ * \code
+ * // minimal sketch: a scalar of 32768 maps the full int16 range onto roughly [-1, 1)
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_16sc_t* x = (lv_16sc_t*)volk_malloc(N*sizeof(lv_16sc_t), alignment);
+ * float* mag = (float*)volk_malloc(N*sizeof(float), alignment);
+ *
+ * volk_16ic_s32f_magnitude_32f(mag, x, 32768.f, N);
+ *
+ * volk_free(x);
+ * volk_free(mag);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a_H
+#define INCLUDED_volk_16ic_s32f_magnitude_32f_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+static inline void
+volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector, const lv_16sc_t* complexVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 invScalar = _mm_set_ps1(1.0/scalar);
+
+ __m128 cplxValue1, cplxValue2, result;
+
+ __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
+
+ for(;number < quarterPoints; number++){
+
+ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+ inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+ inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+ inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+ inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+
+ cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+ cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+
+ complexVectorPtr += 8;
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+ result = _mm_sqrt_ps(result); // Square root the values
+
+ _mm_store_ps(magnitudeVectorPtr, result);
+
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ magnitudeVectorPtr = &magnitudeVector[number];
+ complexVectorPtr = (const int16_t*)&complexVector[number];
+ for(; number < num_points; number++){
+ float val1Real = (float)(*complexVectorPtr++) / scalar;
+ float val1Imag = (float)(*complexVectorPtr++) / scalar;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, const lv_16sc_t* complexVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ const float iScalar = 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+
+ __m128 cplxValue1, cplxValue2, result, re, im;
+
+ __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
+
+ for(;number < quarterPoints; number++){
+ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+ inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+ inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+ inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+ inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+
+ cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+ cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+
+ re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88);
+ im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd);
+
+ complexVectorPtr += 8;
+
+ cplxValue1 = _mm_mul_ps(re, invScalar);
+ cplxValue2 = _mm_mul_ps(im, invScalar);
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+ result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+ result = _mm_sqrt_ps(result); // Square root the values
+
+ _mm_store_ps(magnitudeVectorPtr, result);
+
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ magnitudeVectorPtr = &magnitudeVector[number];
+ complexVectorPtr = (const int16_t*)&complexVector[number];
+ for(; number < num_points; number++){
+ float val1Real = (float)(*complexVectorPtr++) * iScalar;
+ float val1Imag = (float)(*complexVectorPtr++) * iScalar;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
+}
+
+
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_16ic_s32f_magnitude_32f_generic(float* magnitudeVector, const lv_16sc_t* complexVector,
+ const float scalar, unsigned int num_points)
+{
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+ unsigned int number = 0;
+ const float invScalar = 1.0 / scalar;
+ for(number = 0; number < num_points; number++){
+ float real = ( (float) (*complexVectorPtr++)) * invScalar;
+ float imag = ( (float) (*complexVectorPtr++)) * invScalar;
+ *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC_DISABLED
+
+extern void
+volk_16ic_s32f_magnitude_32f_a_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector,
+ const float scalar, unsigned int num_points);
+
+static inline void
+volk_16ic_s32f_magnitude_32f_u_orc(float* magnitudeVector, const lv_16sc_t* complexVector,
+ const float scalar, unsigned int num_points)
+{
+ volk_16ic_s32f_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC_DISABLED */
+
+
+#endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2016 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_16ic_x2_dot_prod_16ic
+ *
+ * \b Overview
+ *
+ * Multiplies two input complex vectors (with 16-bit integer components) element by element,
+ * accumulates the products, and stores the result. Results are saturated so they never exceed the limits of the data type.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16ic_x2_dot_prod_16ic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points);
+ * \endcode
+ *
+ * \b Inputs
+ * \li in_a: One of the vectors to be multiplied and accumulated.
+ * \li in_b: The other vector to be multiplied and accumulated.
+ * \li num_points: Number of complex values to be multiplied together, accumulated and stored into \p result
+ *
+ * \b Outputs
+ * \li result: Value of the accumulated result.
+ *
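+ * \b Example
+ * A minimal usage sketch; buffer management uses the VOLK helpers volk_get_alignment/volk_malloc/volk_free:
+ * \code
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_16sc_t* a = (lv_16sc_t*)volk_malloc(N*sizeof(lv_16sc_t), alignment);
+ * lv_16sc_t* b = (lv_16sc_t*)volk_malloc(N*sizeof(lv_16sc_t), alignment);
+ * lv_16sc_t result;
+ *
+ * // ... fill a and b with complex 16-bit samples ...
+ *
+ * volk_16ic_x2_dot_prod_16ic(&result, a, b, N);
+ *
+ * volk_free(a);
+ * volk_free(b);
+ * \endcode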
+ */
+
+#ifndef INCLUDED_volk_16ic_x2_dot_prod_16ic_H
+#define INCLUDED_volk_16ic_x2_dot_prod_16ic_H
+
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
+#include <volk/saturation_arithmetic.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+{
+ result[0] = lv_cmake((int16_t)0, (int16_t)0);
+ unsigned int n;
+ for (n = 0; n < num_points; n++)
+ {
+ lv_16sc_t tmp = in_a[n] * in_b[n];
+ result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp) ));
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+{
+ lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
+
+ const unsigned int sse_iters = num_points / 4;
+ unsigned int number;
+
+ const lv_16sc_t* _in_a = in_a;
+ const lv_16sc_t* _in_b = in_b;
+ lv_16sc_t* _out = out;
+
+ if (sse_iters > 0)
+ {
+ __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc;
+ __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
+
+ realcacc = _mm_setzero_si128();
+ imagcacc = _mm_setzero_si128();
+
+ mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
+ mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
+
+ for(number = 0; number < sse_iters; number++)
+ {
+ // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
+ a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+ __VOLK_PREFETCH(_in_a + 8);
+ b = _mm_load_si128((__m128i*)_in_b);
+ __VOLK_PREFETCH(_in_b + 8);
+ c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
+
+ c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
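+ // even lanes of 'real' now hold a.r*b.r - a.i*b.i (the real parts), computed with saturating subtraction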
+ real = _mm_subs_epi16(c, c_sr);
+
+ b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
+ a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
+
+ imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+ imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+
+ imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic!
+
+ realcacc = _mm_adds_epi16(realcacc, real);
+ imagcacc = _mm_adds_epi16(imagcacc, imag);
+
+ _in_a += 4;
+ _in_b += 4;
+ }
+
+ realcacc = _mm_and_si128(realcacc, mask_real);
+ imagcacc = _mm_and_si128(imagcacc, mask_imag);
+
+ a = _mm_or_si128(realcacc, imagcacc);
+
+ _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector
+
+ for (number = 0; number < 4; ++number)
+ {
+ dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
+ }
+ }
+
+ for (number = 0; number < (num_points % 4); ++number)
+ {
+ lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
+ dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
+ }
+
+ *_out = dotProduct;
+}
+
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+{
+ lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
+
+ const unsigned int sse_iters = num_points / 4;
+
+ const lv_16sc_t* _in_a = in_a;
+ const lv_16sc_t* _in_b = in_b;
+ lv_16sc_t* _out = out;
+ unsigned int number;
+
+ if (sse_iters > 0)
+ {
+ __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result;
+ __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
+
+ realcacc = _mm_setzero_si128();
+ imagcacc = _mm_setzero_si128();
+
+ mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
+ mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
+
+ for(number = 0; number < sse_iters; number++)
+ {
+ // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
+ a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+ __VOLK_PREFETCH(_in_a + 8);
+ b = _mm_loadu_si128((__m128i*)_in_b);
+ __VOLK_PREFETCH(_in_b + 8);
+ c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
+
+ c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
+ real = _mm_subs_epi16(c, c_sr);
+
+ b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
+ a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
+
+ imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+ imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+
+ imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic!
+
+ realcacc = _mm_adds_epi16(realcacc, real);
+ imagcacc = _mm_adds_epi16(imagcacc, imag);
+
+ _in_a += 4;
+ _in_b += 4;
+ }
+
+ realcacc = _mm_and_si128(realcacc, mask_real);
+ imagcacc = _mm_and_si128(imagcacc, mask_imag);
+
+ result = _mm_or_si128(realcacc, imagcacc);
+
+ _mm_storeu_si128((__m128i*)dotProductVector, result); // Store the results back into the dot product vector
+
+ for (number = 0; number < 4; ++number)
+ {
+ dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
+ }
+ }
+
+ for (number = 0; number < (num_points % 4); ++number)
+ {
+ lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
+ dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
+ }
+
+ *_out = dotProduct;
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_16ic_x2_dot_prod_16ic_u_avx2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+{
+ lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
+
+ const unsigned int avx_iters = num_points / 8;
+
+ const lv_16sc_t* _in_a = in_a;
+ const lv_16sc_t* _in_b = in_b;
+ lv_16sc_t* _out = out;
+ unsigned int number;
+
+ if (avx_iters > 0)
+ {
+ __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result;
+ __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];
+
+ realcacc = _mm256_setzero_si256();
+ imagcacc = _mm256_setzero_si256();
+
+ mask_imag = _mm256_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
+ mask_real = _mm256_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
+
+ for(number = 0; number < avx_iters; number++)
+ {
+ a = _mm256_loadu_si256((__m256i*)_in_a);
+ __VOLK_PREFETCH(_in_a + 16);
+ b = _mm256_loadu_si256((__m256i*)_in_b);
+ __VOLK_PREFETCH(_in_b + 16);
+ c = _mm256_mullo_epi16(a, b);
+
+ c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
+ real = _mm256_subs_epi16(c, c_sr);
+
+ b_sl = _mm256_slli_si256(b, 2);
+ a_sl = _mm256_slli_si256(a, 2);
+
+ imag1 = _mm256_mullo_epi16(a, b_sl);
+ imag2 = _mm256_mullo_epi16(b, a_sl);
+
+ imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic!
+
+ realcacc = _mm256_adds_epi16(realcacc, real);
+ imagcacc = _mm256_adds_epi16(imagcacc, imag);
+
+ _in_a += 8;
+ _in_b += 8;
+ }
+
+ realcacc = _mm256_and_si256(realcacc, mask_real);
+ imagcacc = _mm256_and_si256(imagcacc, mask_imag);
+
+ result = _mm256_or_si256(realcacc, imagcacc);
+
+ _mm256_storeu_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector
+ _mm256_zeroupper();
+
+ for (number = 0; number < 8; ++number)
+ {
+ dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
+ }
+ }
+
+ for (number = 0; number < (num_points % 8); ++number)
+ {
+ lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
+ dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
+ }
+
+ *_out = dotProduct;
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_16ic_x2_dot_prod_16ic_a_avx2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+{
+ lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
+
+ const unsigned int avx_iters = num_points / 8;
+
+ const lv_16sc_t* _in_a = in_a;
+ const lv_16sc_t* _in_b = in_b;
+ lv_16sc_t* _out = out;
+ unsigned int number;
+
+ if (avx_iters > 0)
+ {
+ __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result;
+ __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];
+
+ realcacc = _mm256_setzero_si256();
+ imagcacc = _mm256_setzero_si256();
+
+ mask_imag = _mm256_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
+ mask_real = _mm256_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
+
+ for(number = 0; number < avx_iters; number++)
+ {
+ a = _mm256_load_si256((__m256i*)_in_a);
+ __VOLK_PREFETCH(_in_a + 16);
+ b = _mm256_load_si256((__m256i*)_in_b);
+ __VOLK_PREFETCH(_in_b + 16);
+ c = _mm256_mullo_epi16(a, b);
+
+ c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
+ real = _mm256_subs_epi16(c, c_sr);
+
+ b_sl = _mm256_slli_si256(b, 2);
+ a_sl = _mm256_slli_si256(a, 2);
+
+ imag1 = _mm256_mullo_epi16(a, b_sl);
+ imag2 = _mm256_mullo_epi16(b, a_sl);
+
+ imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic!
+
+ realcacc = _mm256_adds_epi16(realcacc, real);
+ imagcacc = _mm256_adds_epi16(imagcacc, imag);
+
+ _in_a += 8;
+ _in_b += 8;
+ }
+
+ realcacc = _mm256_and_si256(realcacc, mask_real);
+ imagcacc = _mm256_and_si256(imagcacc, mask_imag);
+
+ result = _mm256_or_si256(realcacc, imagcacc);
+
+ _mm256_store_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector
+ _mm256_zeroupper();
+
+ for (number = 0; number < 8; ++number)
+ {
+ dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
+ }
+ }
+
+ for (number = 0; number < (num_points % 8); ++number)
+ {
+ lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
+ dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
+ }
+
+ *_out = dotProduct;
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+{
+ unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+
+ lv_16sc_t* a_ptr = (lv_16sc_t*) in_a;
+ lv_16sc_t* b_ptr = (lv_16sc_t*) in_b;
+ *out = lv_cmake((int16_t)0, (int16_t)0);
+
+ if (quarter_points > 0)
+ {
+ // for 2-lane vectors, 1st lane holds the real part,
+ // 2nd lane holds the imaginary part
+ int16x4x2_t a_val, b_val, c_val, accumulator;
+ int16x4x2_t tmp_real, tmp_imag;
+ __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
+ accumulator.val[0] = vdup_n_s16(0);
+ accumulator.val[1] = vdup_n_s16(0);
+ lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
+
+ for(number = 0; number < quarter_points; ++number)
+ {
+ a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ __VOLK_PREFETCH(a_ptr + 8);
+ __VOLK_PREFETCH(b_ptr + 8);
+
+ // multiply the real*real and imag*imag to get real result
+ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
+ tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
+ // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
+ tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);
+
+ // Multiply cross terms to get the imaginary result
+ // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
+ tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
+ // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
+ tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
+
+ c_val.val[0] = vqsub_s16(tmp_real.val[0], tmp_real.val[1]);
+ c_val.val[1] = vqadd_s16(tmp_imag.val[0], tmp_imag.val[1]);
+
+ accumulator.val[0] = vqadd_s16(accumulator.val[0], c_val.val[0]);
+ accumulator.val[1] = vqadd_s16(accumulator.val[1], c_val.val[1]);
+
+ a_ptr += 4;
+ b_ptr += 4;
+ }
+
+ vst2_s16((int16_t*)accum_result, accumulator);
+ for (number = 0; number < 4; ++number)
+ {
+ dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(accum_result[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(accum_result[number])));
+ }
+
+ *out = dotProduct;
+ }
+
+ // tail case
+ for(number = quarter_points * 4; number < num_points; ++number)
+ {
+ *out += (*a_ptr++) * (*b_ptr++);
+ }
+}
+
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+{
+ unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+
+ lv_16sc_t* a_ptr = (lv_16sc_t*) in_a;
+ lv_16sc_t* b_ptr = (lv_16sc_t*) in_b;
+ // for 2-lane vectors, 1st lane holds the real part,
+ // 2nd lane holds the imaginary part
+ int16x4x2_t a_val, b_val, accumulator;
+ int16x4x2_t tmp;
+ __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
+ accumulator.val[0] = vdup_n_s16(0);
+ accumulator.val[1] = vdup_n_s16(0);
+
+ for(number = 0; number < quarter_points; ++number)
+ {
+ a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ __VOLK_PREFETCH(a_ptr + 8);
+ __VOLK_PREFETCH(b_ptr + 8);
+
+ tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
+ tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
+
+ // use multiply accumulate/subtract to get result
+ tmp.val[0] = vmls_s16(tmp.val[0], a_val.val[1], b_val.val[1]);
+ tmp.val[1] = vmla_s16(tmp.val[1], a_val.val[0], b_val.val[1]);
+
+ accumulator.val[0] = vqadd_s16(accumulator.val[0], tmp.val[0]);
+ accumulator.val[1] = vqadd_s16(accumulator.val[1], tmp.val[1]);
+
+ a_ptr += 4;
+ b_ptr += 4;
+ }
+
+ vst2_s16((int16_t*)accum_result, accumulator);
+ *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
+
+ // tail case
+ for(number = quarter_points * 4; number < num_points; ++number)
+ {
+ *out += (*a_ptr++) * (*b_ptr++);
+ }
+}
+
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+{
+ unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+
+ lv_16sc_t* a_ptr = (lv_16sc_t*) in_a;
+ lv_16sc_t* b_ptr = (lv_16sc_t*) in_b;
+ // for 2-lane vectors, 1st lane holds the real part,
+ // 2nd lane holds the imaginary part
+ int16x4x2_t a_val, b_val, accumulator1, accumulator2;
+
+ __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
+ accumulator1.val[0] = vdup_n_s16(0);
+ accumulator1.val[1] = vdup_n_s16(0);
+ accumulator2.val[0] = vdup_n_s16(0);
+ accumulator2.val[1] = vdup_n_s16(0);
+
+ for(number = 0; number < quarter_points; ++number)
+ {
+ a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ __VOLK_PREFETCH(a_ptr + 8);
+ __VOLK_PREFETCH(b_ptr + 8);
+
+ // use 2 accumulators to remove inter-instruction data dependencies
+ accumulator1.val[0] = vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]);
+ accumulator2.val[0] = vmls_s16(accumulator2.val[0], a_val.val[1], b_val.val[1]);
+ accumulator1.val[1] = vmla_s16(accumulator1.val[1], a_val.val[0], b_val.val[1]);
+ accumulator2.val[1] = vmla_s16(accumulator2.val[1], a_val.val[1], b_val.val[0]);
+
+ a_ptr += 4;
+ b_ptr += 4;
+ }
+
+ accumulator1.val[0] = vqadd_s16(accumulator1.val[0], accumulator2.val[0]);
+ accumulator1.val[1] = vqadd_s16(accumulator1.val[1], accumulator2.val[1]);
+
+ vst2_s16((int16_t*)accum_result, accumulator1);
+ *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
+
+ // tail case
+ for(number = quarter_points * 4; number < num_points; ++number)
+ {
+ *out += (*a_ptr++) * (*b_ptr++);
+ }
+}
+
+#endif /* LV_HAVE_NEON */
+
+#endif /*INCLUDED_volk_16ic_x2_dot_prod_16ic_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2016 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_16ic_x2_multiply_16ic
+ *
+ * \b Overview
+ *
+ * Multiplies two input complex vectors, point-by-point, storing the result in the third vector.
+ * WARNING: Saturation is not checked.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16ic_x2_multiply_16ic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points);
+ * \endcode
+ *
+ * \b Inputs
+ * \li in_a: One of the vectors to be multiplied.
+ * \li in_b: The other vector to be multiplied.
+ * \li num_points: The number of complex data points to be multiplied from both input vectors.
+ *
+ * \b Outputs
+ * \li result: The vector where the results will be stored.
+ *
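+ * \b Example
+ * A minimal sketch that multiplies two short test vectors (values chosen arbitrarily):
+ * \code
+ * int N = 16;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_16sc_t* in_a = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t)*N, alignment);
+ * lv_16sc_t* in_b = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t)*N, alignment);
+ * lv_16sc_t* result = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ *     in_a[ii] = lv_cmake((int16_t)ii, (int16_t)1);
+ *     in_b[ii] = lv_cmake((int16_t)1, (int16_t)ii);
+ * }
+ *
+ * volk_16ic_x2_multiply_16ic(result, in_a, in_b, N);
+ *
+ * volk_free(in_a);
+ * volk_free(in_b);
+ * volk_free(result);
+ * \endcode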
+ */
+
+#ifndef INCLUDED_volk_16ic_x2_multiply_16ic_H
+#define INCLUDED_volk_16ic_x2_multiply_16ic_H
+
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+{
+ unsigned int n;
+ for (n = 0; n < num_points; n++)
+ {
+ result[n] = in_a[n] * in_b[n];
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 4;
+ __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, result;
+
+ mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
+ mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
+
+ const lv_16sc_t* _in_a = in_a;
+ const lv_16sc_t* _in_b = in_b;
+ lv_16sc_t* _out = out;
+ unsigned int number;
+
+ for(number = 0; number < sse_iters; number++)
+ {
+ a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+ b = _mm_load_si128((__m128i*)_in_b);
+ c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
+
+ c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
+ real = _mm_subs_epi16 (c, c_sr);
+ real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
+
+ b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
+ a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
+
+ imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+ imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+
+ imag = _mm_adds_epi16(imag1, imag2);
+ imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
+
+ result = _mm_or_si128 (real, imag);
+
+ _mm_store_si128((__m128i*)_out, result);
+
+ _in_a += 4;
+ _in_b += 4;
+ _out += 4;
+ }
+
+ for (number = sse_iters * 4; number < num_points; ++number)
+ {
+ *_out++ = (*_in_a++) * (*_in_b++);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 4;
+ __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result;
+
+ mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
+ mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
+
+ const lv_16sc_t* _in_a = in_a;
+ const lv_16sc_t* _in_b = in_b;
+ lv_16sc_t* _out = out;
+ unsigned int number;
+
+ for(number = 0; number < sse_iters; number++)
+ {
+ a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+ b = _mm_loadu_si128((__m128i*)_in_b);
+ c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
+
+ c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
+ real = _mm_subs_epi16 (c, c_sr);
+ real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
+
+ b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
+ a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
+
+ imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+ imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+
+ imag = _mm_adds_epi16(imag1, imag2);
+ imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
+
+ result = _mm_or_si128 (real, imag);
+
+ _mm_storeu_si128((__m128i*)_out, result);
+
+ _in_a += 4;
+ _in_b += 4;
+ _out += 4;
+ }
+
+ for (number = sse_iters * 4; number < num_points; ++number)
+ {
+ *_out++ = (*_in_a++) * (*_in_b++);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int avx2_points = num_points / 8;
+
+ const lv_16sc_t* _in_a = in_a;
+ const lv_16sc_t* _in_b = in_b;
+ lv_16sc_t* _out = out;
+
+ __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;
+
+ const __m256i mask_imag = _mm256_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
+ const __m256i mask_real = _mm256_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
+
+ for(;number < avx2_points; number++)
+ {
+ a = _mm256_loadu_si256((__m256i*)_in_a); // load 8 complex values (interleaved 2-byte real, 2-byte imag) from in_a
+ b = _mm256_loadu_si256((__m256i*)_in_b); // load 8 complex values from in_b
+ c = _mm256_mullo_epi16(a, b);
+
+ c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
+ real = _mm256_subs_epi16(c, c_sr);
+ real = _mm256_and_si256(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
+
+ b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
+ a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
+
+ imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+ imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+
+ imag = _mm256_adds_epi16(imag1, imag2);
+ imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
+
+ result = _mm256_or_si256(real, imag);
+
+ _mm256_storeu_si256((__m256i*)_out, result);
+
+ _in_a += 8;
+ _in_b += 8;
+ _out += 8;
+ }
+ _mm256_zeroupper();
+ number = avx2_points * 8;
+ for(;number < num_points; number++)
+ {
+ *_out++ = (*_in_a++) * (*_in_b++);
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int avx2_points = num_points / 8;
+
+ const lv_16sc_t* _in_a = in_a;
+ const lv_16sc_t* _in_b = in_b;
+ lv_16sc_t* _out = out;
+
+ __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;
+
+ const __m256i mask_imag = _mm256_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
+ const __m256i mask_real = _mm256_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
+
+ for(;number < avx2_points; number++)
+ {
+ a = _mm256_load_si256((__m256i*)_in_a); // load 8 complex values (interleaved 2-byte real, 2-byte imag) from in_a
+ b = _mm256_load_si256((__m256i*)_in_b); // load 8 complex values from in_b
+ c = _mm256_mullo_epi16(a, b);
+
+ c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
+ real = _mm256_subs_epi16(c, c_sr);
+ real = _mm256_and_si256(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
+
+ b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
+ a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
+
+ imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+ imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+
+ imag = _mm256_adds_epi16(imag1, imag2);
+ imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
+
+ result = _mm256_or_si256(real, imag);
+
+ _mm256_store_si256((__m256i*)_out, result);
+
+ _in_a += 8;
+ _in_b += 8;
+ _out += 8;
+ }
+ _mm256_zeroupper();
+ number = avx2_points * 8;
+ for(;number < num_points; number++)
+ {
+ *_out++ = (*_in_a++) * (*_in_b++);
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+{
+ lv_16sc_t *a_ptr = (lv_16sc_t*) in_a;
+ lv_16sc_t *b_ptr = (lv_16sc_t*) in_b;
+ unsigned int quarter_points = num_points / 4;
+ int16x4x2_t a_val, b_val, c_val;
+ int16x4x2_t tmp_real, tmp_imag;
+ unsigned int number = 0;
+
+ for(number = 0; number < quarter_points; ++number)
+ {
+ a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ __VOLK_PREFETCH(a_ptr + 4);
+ __VOLK_PREFETCH(b_ptr + 4);
+
+ // multiply the real*real and imag*imag to get real result
+ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
+ tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
+ // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
+ tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);
+
+ // Multiply cross terms to get the imaginary result
+ // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
+ tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
+ // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
+ tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
+
+ // store the results
+ c_val.val[0] = vsub_s16(tmp_real.val[0], tmp_real.val[1]);
+ c_val.val[1] = vadd_s16(tmp_imag.val[0], tmp_imag.val[1]);
+ vst2_s16((int16_t*)out, c_val);
+
+ a_ptr += 4;
+ b_ptr += 4;
+ out += 4;
+ }
+
+ for(number = quarter_points * 4; number < num_points; number++)
+ {
+ *out++ = (*a_ptr++) * (*b_ptr++);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+#endif /*INCLUDED_volk_16ic_x2_multiply_16ic_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_16u_byteswap
+ *
+ * \b Overview
+ *
+ * Byteswaps (in place) a vector of uint16_t's.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16u_byteswap(uint16_t* intsToSwap, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li intsToSwap: The vector of data to byte swap.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li intsToSwap: The byte-swapped data, returned in place.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * uint16_t* x = (uint16_t*)volk_malloc(sizeof(uint16_t)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ *     x[ii] = (uint16_t)ii;
+ * }
+ *
+ * volk_16u_byteswap(x, N);
+ *
+ * volk_free(x);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_16u_byteswap_u_H
+#define INCLUDED_volk_16u_byteswap_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_AVX2
+#include <immintrin.h>
+static inline void volk_16u_byteswap_a_avx2(uint16_t* intsToSwap, unsigned int num_points){
+ unsigned int number;
+
+ const unsigned int nPerSet = 16;
+ const uint64_t nSets = num_points / nPerSet;
+
+ uint16_t* inputPtr = (uint16_t*) intsToSwap;
+
+ const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30};
+
+ const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector[0]);
+
+ for(number = 0; number < nSets; number++) {
+ // Load 16 16-bit values; inputPtr is incremented afterwards since the swap is done in-place.
+ const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
+ const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
+
+ // Store the results
+ _mm256_store_si256((__m256i*)inputPtr, output);
+ inputPtr += nPerSet;
+ }
+
+ _mm256_zeroupper();
+
+ // Byteswap any remaining points:
+ for(number = nPerSet * nSets; number < num_points; number++) {
+ uint16_t outputVal = *inputPtr;
+ outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
+ *inputPtr = outputVal;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#if LV_HAVE_AVX2
+#include <immintrin.h>
+static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int num_points){
+ unsigned int number;
+
+ const unsigned int nPerSet = 16;
+ const uint64_t nSets = num_points / nPerSet;
+
+ uint16_t* inputPtr = (uint16_t*) intsToSwap;
+
+ const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30};
+
+ const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector[0]);
+
+ for (number = 0; number < nSets; number++) {
+ // Load 16 16-bit values; inputPtr is incremented afterwards since the swap is done in-place.
+ const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
+ const __m256i output = _mm256_shuffle_epi8(input,myShuffle);
+
+ // Store the results
+ _mm256_storeu_si256((__m256i*)inputPtr, output);
+ inputPtr += nPerSet;
+ }
+
+ _mm256_zeroupper();
+
+ // Byteswap any remaining points:
+ for(number = nPerSet * nSets; number < num_points; number++) {
+ uint16_t outputVal = *inputPtr;
+ outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
+ *inputPtr = outputVal;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points){
+ unsigned int number = 0;
+ uint16_t* inputPtr = intsToSwap;
+ __m128i input, left, right, output;
+
+ const unsigned int eighthPoints = num_points / 8;
+ for(;number < eighthPoints; number++){
+ // Load 8 16-bit values; inputPtr is incremented afterwards since the swap is done in-place.
+ input = _mm_loadu_si128((__m128i*)inputPtr);
+ // Do the two shifts
+ left = _mm_slli_epi16(input, 8);
+ right = _mm_srli_epi16(input, 8);
+ // Or the left and right halves together
+ output = _mm_or_si128(left, right);
+ // Store the results
+ _mm_storeu_si128((__m128i*)inputPtr, output);
+ inputPtr += 8;
+ }
+
+ // Byteswap any remaining points:
+ number = eighthPoints*8;
+ for(; number < num_points; number++){
+ uint16_t outputVal = *inputPtr;
+ outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
+ *inputPtr = outputVal;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap, unsigned int num_points){
+ unsigned int point;
+ uint16_t* inputPtr = intsToSwap;
+ for(point = 0; point < num_points; point++){
+ uint16_t output = *inputPtr;
+ output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
+ *inputPtr = output;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_16u_byteswap_u_H */
+#ifndef INCLUDED_volk_16u_byteswap_a_H
+#define INCLUDED_volk_16u_byteswap_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int num_points){
+ unsigned int number = 0;
+ uint16_t* inputPtr = intsToSwap;
+ __m128i input, left, right, output;
+
+ const unsigned int eighthPoints = num_points / 8;
+ for(;number < eighthPoints; number++){
+ // Load 8 16-bit values; inputPtr is incremented afterwards since the swap is done in-place.
+ input = _mm_load_si128((__m128i*)inputPtr);
+ // Do the two shifts
+ left = _mm_slli_epi16(input, 8);
+ right = _mm_srli_epi16(input, 8);
+ // Or the left and right halves together
+ output = _mm_or_si128(left, right);
+ // Store the results
+ _mm_store_si128((__m128i*)inputPtr, output);
+ inputPtr += 8;
+ }
+
+
+ // Byteswap any remaining points:
+ number = eighthPoints*8;
+ for(; number < num_points; number++){
+ uint16_t outputVal = *inputPtr;
+ outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
+ *inputPtr = outputVal;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num_points){
+ unsigned int number;
+ unsigned int eighth_points = num_points / 8;
+ uint16x8_t input, output;
+ uint16_t* inputPtr = intsToSwap;
+
+ for(number = 0; number < eighth_points; number++) {
+ input = vld1q_u16(inputPtr);
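+ // The vsri/vsli pair below overwrites every bit of 'output' (the right-shift insert
+ // writes the low byte, the left-shift insert the high byte of each 16-bit lane),
+ // so 'output' needs no prior initialization.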
+ output = vsriq_n_u16(output, input, 8);
+ output = vsliq_n_u16(output, input, 8);
+ vst1q_u16(inputPtr, output);
+ inputPtr += 8;
+ }
+
+ for(number = eighth_points * 8; number < num_points; number++){
+ uint16_t output = *inputPtr;
+ output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
+ *inputPtr = output;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_16u_byteswap_neon_table(uint16_t* intsToSwap, unsigned int num_points){
+ uint16_t* inputPtr = intsToSwap;
+ unsigned int number = 0;
+ unsigned int n16points = num_points / 16;
+
+ uint8x8x4_t input_table;
+ uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
+ uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
+
+ /* these magic numbers are used as byte-indices in the LUT.
+ they are pre-computed to save time. A simple C program
+ can calculate them; for example for lookup01:
+ uint64_t index = 0;
+ uint8_t chars[8] = {8, 0, 24, 16, 9, 1, 25, 17};
+ int ii;
+ for(ii=0; ii < 8; ++ii) {
+ index += ((uint64_t)(*(chars+ii))) << (ii*8);
+ }
+ */
+ int_lookup01 = vcreate_u8(1232017111498883080);
+ int_lookup23 = vcreate_u8(1376697457175036426);
+ int_lookup45 = vcreate_u8(1521377802851189772);
+ int_lookup67 = vcreate_u8(1666058148527343118);
+
+ for(number = 0; number < n16points; ++number){
+ input_table = vld4_u8((uint8_t*) inputPtr);
+ swapped_int01 = vtbl4_u8(input_table, int_lookup01);
+ swapped_int23 = vtbl4_u8(input_table, int_lookup23);
+ swapped_int45 = vtbl4_u8(input_table, int_lookup45);
+ swapped_int67 = vtbl4_u8(input_table, int_lookup67);
+ vst1_u8((uint8_t*)inputPtr, swapped_int01);
+ vst1_u8((uint8_t*)(inputPtr+4), swapped_int23);
+ vst1_u8((uint8_t*)(inputPtr+8), swapped_int45);
+ vst1_u8((uint8_t*)(inputPtr+12), swapped_int67);
+
+ inputPtr += 16;
+ }
+
+ for(number = n16points * 16; number < num_points; ++number){
+ uint16_t output = *inputPtr;
+ output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
+ *inputPtr = output;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_16u_byteswap_a_generic(uint16_t* intsToSwap, unsigned int num_points){
+ unsigned int point;
+ uint16_t* inputPtr = intsToSwap;
+ for(point = 0; point < num_points; point++){
+ uint16_t output = *inputPtr;
+ output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
+ *inputPtr = output;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+
+extern void volk_16u_byteswap_a_orc_impl(uint16_t* intsToSwap, unsigned int num_points);
+static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int num_points){
+ volk_16u_byteswap_a_orc_impl(intsToSwap, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16u_byteswap_a_H */
--- /dev/null
+#ifndef INCLUDED_volk_16u_byteswappuppet_16u_H
+#define INCLUDED_volk_16u_byteswappuppet_16u_H
+
+
+#include <stdint.h>
+#include <volk/volk_16u_byteswap.h>
+#include <string.h>
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_16u_byteswappuppet_16u_generic(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){
+
+ volk_16u_byteswap_generic((uint16_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_NEON
+static inline void volk_16u_byteswappuppet_16u_neon(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){
+
+ volk_16u_byteswap_neon((uint16_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_NEON
+static inline void volk_16u_byteswappuppet_16u_neon_table(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){
+
+ volk_16u_byteswap_neon_table((uint16_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_SSE2
+static inline void volk_16u_byteswappuppet_16u_u_sse2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){
+
+ volk_16u_byteswap_u_sse2((uint16_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_SSE2
+static inline void volk_16u_byteswappuppet_16u_a_sse2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){
+
+ volk_16u_byteswap_a_sse2((uint16_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_AVX2
+static inline void volk_16u_byteswappuppet_16u_u_avx2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){
+
+ volk_16u_byteswap_u_avx2((uint16_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_AVX2
+static inline void volk_16u_byteswappuppet_16u_a_avx2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){
+
+ volk_16u_byteswap_a_avx2((uint16_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+
+}
+#endif
+
+#endif
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2015 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_8u_polarbutterfly_32f
+ *
+ * \b Overview
+ *
+ * Decodes the butterfly for one bit in the polar decoder graph.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * volk_32f_8u_polarbutterfly_32f(float* llrs, unsigned char* u,
+ * const int frame_size, const int frame_exp,
+ * const int stage, const int u_num, const int row)
+ * \endcode
+ *
+ * \b Inputs
+ * \li llrs: Buffer of LLRs; holds the received LLRs as well as LLRs already computed for decoded bits.
+ * \li u: Previously decoded bits.
+ * \li frame_size: Equal to 2 ^ frame_exp.
+ * \li frame_exp: Power-of-2 exponent of the frame size.
+ * \li stage: Value in the range [0, frame_exp); the stage at which the algorithm starts before descending deeper.
+ * \li u_num: Index of the bit currently being decoded.
+ * \li row: Row in the graph at which decoding starts.
+ *
+ * \b Outputs
+ * \li llrs: The LLRs necessary for bit [u_num] to be decoded.
+ *
+ * \b Example
+ * \code
+ * int frame_exp = 10;
+ * int frame_size = 0x01 << frame_exp;
+ *
+ * float* llrs = (float*) volk_malloc(sizeof(float) * frame_size * (frame_exp + 1), volk_get_alignment());
+ * unsigned char* u = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size * (frame_exp + 1), volk_get_alignment());
+ *
+ * {some_function_to_write_encoded_bits_to_float_llrs(llrs + frame_size * frame_exp, data)};
+ *
+ * unsigned int u_num;
+ * for(u_num = 0; u_num < frame_size; u_num++){
+ * volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_size, frame_exp, 0, u_num, u_num);
+ * // next line could first search for frozen bit value and then do bit decision.
+ * u[u_num] = llrs[u_num] > 0 ? 0 : 1;
+ * }
+ *
+ * volk_free(llrs);
+ * volk_free(u);
+ * \endcode
+ */
+
+#ifndef VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_
+#define VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_
+#include <math.h>
+#include <stdbool.h>
+#include <string.h>
+#include <volk/volk_8u_x2_encodeframepolar_8u.h>
+
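+// llr_odd is the min-sum approximation of the polar decoder's 'f' function:
+// f(la, lb) = sign(la) * sign(lb) * min(|la|, |lb|)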
+static inline float
+llr_odd(const float la, const float lb)
+{
+ const float ala = fabs(la);
+ const float alb = fabs(lb);
+ return copysignf(1.0f, la) * copysignf(1.0f, lb) * (ala > alb ? alb : ala);
+}
+
+static inline void
+llr_odd_stages(float* llrs, int min_stage, const int depth, const int frame_size, const int row)
+{
+ int loop_stage = depth - 1;
+ float* dst_llr_ptr;
+ float* src_llr_ptr;
+ int stage_size = 0x01 << loop_stage;
+
+ int el;
+ while(min_stage <= loop_stage){
+ dst_llr_ptr = llrs + loop_stage * frame_size + row;
+ src_llr_ptr = dst_llr_ptr + frame_size;
+ for(el = 0; el < stage_size; el++){
+ *dst_llr_ptr++ = llr_odd(*src_llr_ptr, *(src_llr_ptr + 1));
+ src_llr_ptr += 2;
+ }
+
+ --loop_stage;
+ stage_size >>= 1;
+ }
+}
+
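+// llr_even is the polar decoder's 'g' function: g(la, lb, u) = lb + (1 - 2u) * la,
+// i.e. lb + la when the previously decoded bit u is 0 and lb - la when u is 1.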
+static inline float
+llr_even(const float la, const float lb, const unsigned char f)
+{
+ switch(f){
+ case 0:
+ return lb + la;
+ default:
+ return lb - la;
+ }
+}
+
+static inline void
+even_u_values(unsigned char* u_even, const unsigned char* u, const int u_num)
+{
+ u++;
+ int i;
+ for(i = 1; i < u_num; i += 2){
+ *u_even++ = *u;
+ u += 2;
+ }
+}
+
+static inline void
+odd_xor_even_values(unsigned char* u_xor, const unsigned char* u, const int u_num)
+{
+ int i;
+ for(i = 1; i < u_num; i += 2){
+ *u_xor++ = *u ^ *(u + 1);
+ u += 2;
+ }
+}
+
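+// Determines how many stages the butterfly can descend for this row, i.e. for how
+// many successive stage sizes the row still lies in the upper half of its section.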
+static inline int
+calculate_max_stage_depth_for_row(const int frame_exp, const int row)
+{
+ int max_stage_depth = 0;
+ int half_stage_size = 0x01;
+ int stage_size = half_stage_size << 1;
+ while(max_stage_depth < (frame_exp - 1)){ // last stage holds received values.
+ if(!(row % stage_size < half_stage_size)){
+ break;
+ }
+ half_stage_size <<= 1;
+ stage_size <<= 1;
+ max_stage_depth++;
+ }
+ return max_stage_depth;
+}
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_8u_polarbutterfly_32f_generic(float* llrs, unsigned char* u,
+ const int frame_size, const int frame_exp,
+ const int stage, const int u_num, const int row)
+{
+ const int next_stage = stage + 1;
+
+ const int half_stage_size = 0x01 << stage;
+ const int stage_size = half_stage_size << 1;
+
+ const bool is_upper_stage_half = row % stage_size < half_stage_size;
+
+ // this is a natural bit order implementation
+ float* next_llrs = llrs + frame_size;// LLRs are stored in a consecutive array.
+ float* call_row_llr = llrs + row;
+
+ const int section = row - (row % stage_size);
+ const int jump_size = ((row % half_stage_size) << 1) % stage_size;
+
+ const int next_upper_row = section + jump_size;
+ const int next_lower_row = next_upper_row + 1;
+
+ const float* upper_right_llr_ptr = next_llrs + next_upper_row;
+ const float* lower_right_llr_ptr = next_llrs + next_lower_row;
+
+ if(!is_upper_stage_half){
+ const int u_pos = u_num >> stage;
+ const unsigned char f = u[u_pos - 1];
+ *call_row_llr = llr_even(*upper_right_llr_ptr, *lower_right_llr_ptr, f);
+ return;
+ }
+
+ if(frame_exp > next_stage){
+ unsigned char* u_half = u + frame_size;
+ odd_xor_even_values(u_half, u, u_num);
+ volk_32f_8u_polarbutterfly_32f_generic(next_llrs, u_half, frame_size, frame_exp, next_stage, u_num, next_upper_row);
+
+ even_u_values(u_half, u, u_num);
+ volk_32f_8u_polarbutterfly_32f_generic(next_llrs, u_half, frame_size, frame_exp, next_stage, u_num, next_lower_row);
+ }
+
+ *call_row_llr = llr_odd(*upper_right_llr_ptr, *lower_right_llr_ptr);
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+/*
+ * https://software.intel.com/sites/landingpage/IntrinsicsGuide/#
+ * lists '__m256 _mm256_loadu2_m128 (float const* hiaddr, float const* loaddr)'.
+ * But GCC 4.8.4 doesn't know about it. Or headers are missing or something. Anyway, it doesn't compile :(
+ * This is what I want: llr0 = _mm256_loadu2_m128(src_llr_ptr, src_llr_ptr + 8);
+ * also useful but missing: _mm256_set_m128(hi, lo)
+ */
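+
+/*
+ * A minimal sketch (illustration only; the kernel below deinterleaves with
+ * _mm256_permute2f128_ps/_mm256_shuffle_ps instead) of how the missing
+ * intrinsic could be emulated with plain AVX:
+ *
+ *   static inline __m256 loadu2_m128_emulated(const float* hiaddr, const float* loaddr)
+ *   {
+ *       __m256 v = _mm256_castps128_ps256(_mm_loadu_ps(loaddr));
+ *       return _mm256_insertf128_ps(v, _mm_loadu_ps(hiaddr), 1);
+ *   }
+ */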
+
+static inline void
+volk_32f_8u_polarbutterfly_32f_u_avx(float* llrs, unsigned char* u,
+ const int frame_size, const int frame_exp,
+ const int stage, const int u_num, const int row)
+{
+ if(row % 2){ // for odd rows just do the only necessary calculation and return.
+ const float* next_llrs = llrs + frame_size + row;
+ *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
+ return;
+ }
+
+ const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
+ if(max_stage_depth < 3){ // vectorized version needs larger vectors.
+ volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_size, frame_exp, stage, u_num, row);
+ return;
+ }
+
+ int loop_stage = max_stage_depth;
+ int stage_size = 0x01 << loop_stage;
+
+ float* src_llr_ptr;
+ float* dst_llr_ptr;
+
+ __m256 src0, src1, dst;
+ __m256 part0, part1;
+ __m256 llr0, llr1;
+
+ if(row){ // not necessary for row ZERO, i.e. the first bit to be decoded.
+ // first do the bit combination for all stages:
+ // effectively re-encode some already decoded bits.
+ unsigned char* u_target = u + frame_size;
+ unsigned char* u_temp = u + 2* frame_size;
+ memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);
+
+ if(stage_size > 15){
+ _mm256_zeroupper();
+ volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size);
+ }
+ else{
+ volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size);
+ }
+
+ src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
+ dst_llr_ptr = llrs + max_stage_depth * frame_size + row;
+
+ const __m128i zeros = _mm_set1_epi8(0x00);
+ const __m128i sign_extract = _mm_set1_epi8(0x80);
+ const __m128i shuffle_mask0 = _mm_setr_epi8(0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x01, 0xff, 0xff, 0xff, 0x02, 0xff, 0xff, 0xff, 0x03);
+ const __m128i shuffle_mask1 = _mm_setr_epi8(0xff, 0xff, 0xff, 0x04, 0xff, 0xff, 0xff, 0x05, 0xff, 0xff, 0xff, 0x06, 0xff, 0xff, 0xff, 0x07);
+ __m128i fbits, sign_bits0, sign_bits1;
+
+ __m256 sign_mask = _mm256_setzero_ps(); // initialized so the first _mm256_insertf128_ps below does not read an undefined value
+
+ int p;
+ for(p = 0; p < stage_size; p += 8){
+ _mm256_zeroupper();
+ fbits = _mm_loadu_si128((__m128i*) u_target);
+ u_target += 8;
+
+ // prepare sign mask for correct +-
+ fbits = _mm_cmpgt_epi8(fbits, zeros);
+ fbits = _mm_and_si128(fbits, sign_extract);
+ sign_bits0 = _mm_shuffle_epi8(fbits, shuffle_mask0);
+ sign_bits1 = _mm_shuffle_epi8(fbits, shuffle_mask1);
+
+
+ src0 = _mm256_loadu_ps(src_llr_ptr);
+ src1 = _mm256_loadu_ps(src_llr_ptr + 8);
+ src_llr_ptr += 16;
+
+ sign_mask = _mm256_insertf128_ps(sign_mask, _mm_castsi128_ps(sign_bits0), 0x0);
+ sign_mask = _mm256_insertf128_ps(sign_mask, _mm_castsi128_ps(sign_bits1), 0x1);
+
+ // deinterleave values
+ part0 = _mm256_permute2f128_ps(src0, src1, 0x20);
+ part1 = _mm256_permute2f128_ps(src0, src1, 0x31);
+ llr0 = _mm256_shuffle_ps(part0, part1, 0x88);
+ llr1 = _mm256_shuffle_ps(part0, part1, 0xdd);
+
+ // calculate result
+ llr0 = _mm256_xor_ps(llr0, sign_mask);
+ dst = _mm256_add_ps(llr0, llr1);
+
+ _mm256_storeu_ps(dst_llr_ptr, dst);
+ dst_llr_ptr += 8;
+ }
+
+ --loop_stage;
+ stage_size >>= 1;
+ }
+
+ const int min_stage = stage > 2 ? stage : 2;
+ const __m256 sign_mask = _mm256_set1_ps(-0.0);
+ const __m256 abs_mask = _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff)));
+ __m256 sign;
+
+ int el;
+ while(min_stage < loop_stage){
+ dst_llr_ptr = llrs + loop_stage * frame_size + row;
+ src_llr_ptr = dst_llr_ptr + frame_size;
+ for(el = 0; el < stage_size; el += 8){
+ src0 = _mm256_loadu_ps(src_llr_ptr);
+ src_llr_ptr += 8;
+ src1 = _mm256_loadu_ps(src_llr_ptr);
+ src_llr_ptr += 8;
+
+ // deinterleave values
+ part0 = _mm256_permute2f128_ps(src0, src1, 0x20);
+ part1 = _mm256_permute2f128_ps(src0, src1, 0x31);
+ llr0 = _mm256_shuffle_ps(part0, part1, 0x88);
+ llr1 = _mm256_shuffle_ps(part0, part1, 0xdd);
+
+ // calculate result
+ sign = _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask));
+ dst = _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask));
+ dst = _mm256_or_ps(dst, sign);
+
+ _mm256_storeu_ps(dst_llr_ptr, dst);
+ dst_llr_ptr += 8;
+ }
+
+ --loop_stage;
+ stage_size >>= 1;
+
+ }
+
+ // for stages < 3 the vectors are too small for the SIMD path.
+ llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row);
+}
+
+#endif /* LV_HAVE_AVX */
+
+#endif /* VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_ */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2015 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*
+ * This puppet is for VOLK tests only.
+ * For documentation see 'kernels/volk/volk_32f_8u_polarbutterfly_32f.h'
+ */
+
+#ifndef VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLYPUPPET_32F_H_
+#define VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLYPUPPET_32F_H_
+
+#include <stdio.h>
+#include <string.h>
+#include <volk/volk_32f_8u_polarbutterfly_32f.h>
+#include <volk/volk_8u_x3_encodepolar_8u_x2.h>
+#include <volk/volk_8u_x3_encodepolarpuppet_8u.h>
+
+
+static inline void
+sanitize_bytes(unsigned char* u, const int elements)
+{
+ int i;
+ unsigned char* u_ptr = u;
+ for(i = 0; i < elements; i++){
+ *u_ptr = (*u_ptr & 0x01);
+ u_ptr++;
+ }
+}
+
+static inline void
+clean_up_intermediate_values(float* llrs, unsigned char* u, const int frame_size, const int elements)
+{
+ memset(u + frame_size, 0, sizeof(unsigned char) * (elements - frame_size));
+ memset(llrs + frame_size, 0, sizeof(float) * (elements - frame_size));
+}
+
+static inline void
+generate_error_free_input_vector(float* llrs, unsigned char* u, const int frame_size)
+{
+ memset(u, 0, frame_size);
+ unsigned char* target = u + frame_size;
+ volk_8u_x2_encodeframepolar_8u_generic(target, u + 2 * frame_size, frame_size);
+ float* ft = llrs;
+ int i;
+ for(i = 0; i < frame_size; i++){
+ *ft = (-2 * ((float) *target++)) + 1.0f;
+ ft++;
+ }
+}
+
+static inline void
+print_llr_tree(const float* llrs, const int frame_size, const int frame_exp)
+{
+ int s, e;
+ for(s = 0; s < frame_size; s++){
+ for(e = 0; e < frame_exp + 1; e++){
+ printf("%+4.2f ", llrs[e * frame_size + s]);
+ }
+ printf("\n");
+ if((s + 1) % 8 == 0){
+ printf("\n");
+ }
+ }
+}
+
+static inline int
+maximum_frame_size(const int elements)
+{
+ unsigned int frame_size = next_lower_power_of_two(elements);
+ unsigned int frame_exp = log2_of_power_of_2(frame_size);
+ return next_lower_power_of_two(frame_size / frame_exp);
+}
+
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_32f_8u_polarbutterflypuppet_32f_generic(float* llrs, const float* input, unsigned char* u, const int elements)
+{
+ unsigned int frame_size = maximum_frame_size(elements);
+ unsigned int frame_exp = log2_of_power_of_2(frame_size);
+
+ sanitize_bytes(u, elements);
+ generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size);
+
+ unsigned int u_num = 0;
+ for(; u_num < frame_size; u_num++){
+ volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_size, frame_exp, 0, u_num, u_num);
+ u[u_num] = llrs[u_num] > 0 ? 0 : 1;
+ }
+
+ clean_up_intermediate_values(llrs, u, frame_size, elements);
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_AVX
+static inline void
+volk_32f_8u_polarbutterflypuppet_32f_u_avx(float* llrs, const float* input, unsigned char* u, const int elements)
+{
+ unsigned int frame_size = maximum_frame_size(elements);
+ unsigned int frame_exp = log2_of_power_of_2(frame_size);
+
+ sanitize_bytes(u, elements);
+ generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size);
+
+ unsigned int u_num = 0;
+ for(; u_num < frame_size; u_num++){
+ volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_size, frame_exp, 0, u_num, u_num);
+ u[u_num] = llrs[u_num] > 0 ? 0 : 1;
+ }
+
+ clean_up_intermediate_values(llrs, u, frame_size, elements);
+}
+#endif /* LV_HAVE_AVX */
+
+
+
+#endif /* VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLYPUPPET_32F_H_ */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_accumulator_s32f
+ *
+ * \b Overview
+ *
+ * Accumulates the values in the input buffer.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_accumulator_s32f(float* result, const float* inputBuffer, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li inputBuffer: The buffer of data to be accumulated.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li result: The accumulated result.
+ *
+ * \b Example
+ * Calculate the sum of numbers 0 through 99
+ * \code
+ * int N = 100;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float), alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = (float)ii;
+ * }
+ *
+ * volk_32f_accumulator_s32f(out, increasing, N);
+ *
+ * printf("sum(1..100) = %1.2f\n", out[0]);
+ *
+ * volk_free(increasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_accumulator_s32f_a_H
+#define INCLUDED_volk_32f_accumulator_s32f_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_accumulator_s32f_a_sse(float* result, const float* inputBuffer, unsigned int num_points)
+{
+ float returnValue = 0;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* aPtr = inputBuffer;
+ __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
+
+ __m128 accumulator = _mm_setzero_ps();
+ __m128 aVal = _mm_setzero_ps();
+
+ for(;number < quarterPoints; number++){
+ aVal = _mm_load_ps(aPtr);
+ accumulator = _mm_add_ps(accumulator, aVal);
+ aPtr += 4;
+ }
+
+ _mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container
+
+ returnValue = tempBuffer[0];
+ returnValue += tempBuffer[1];
+ returnValue += tempBuffer[2];
+ returnValue += tempBuffer[3];
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ returnValue += (*aPtr++);
+ }
+ *result = returnValue;
+}
+
+#endif /* LV_HAVE_SSE */
+
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_accumulator_s32f_generic(float* result, const float* inputBuffer, unsigned int num_points)
+{
+ const float* aPtr = inputBuffer;
+ unsigned int number = 0;
+ float returnValue = 0;
+
+ for(;number < num_points; number++){
+ returnValue += (*aPtr++);
+ }
+ *result = returnValue;
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_accumulator_s32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_acos_32f
+ *
+ * \b Overview
+ *
+ * Computes arccosine of the input vector and stores results in the output vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_acos_32f(float* bVector, const float* aVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li bVector: The vector where results will be stored.
+ *
+ * \b Example
+ * Calculate common angles around the top half of the unit circle.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * in[0] = 1;
+ * in[1] = std::sqrt(3.f)/2.f;
+ * in[2] = std::sqrt(2.f)/2.f;
+ * in[3] = 0.5;
+ * in[4] = in[5] = 0;
+ * for(unsigned int ii = 6; ii < N; ++ii){
+ * in[ii] = - in[N-ii-1];
+ * }
+ *
+ * volk_32f_acos_32f(out, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("acos(%1.3f) = %1.3f\n", in[ii], out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#include <stdio.h>
+#include <math.h>
+#include <inttypes.h>
+
+/* This is the number of terms of the Taylor series to evaluate; increase this for more accuracy. */
+#define ACOS_TERMS 2
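+
+/* Note (based on the kernels below): the inner Horner loop evaluates the arctangent
+   Taylor series sum_{j=0}^{ACOS_TERMS-1} (-1)^j * x^(2j+1) / (2j+1); the final
+   multiply by 4 compensates for the two argument-reduction steps applied beforehand. */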
+
+#ifndef INCLUDED_volk_32f_acos_32f_a_H
+#define INCLUDED_volk_32f_acos_32f_a_H
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ int i, j;
+
+ __m128 aVal, d, pi, pio2, x, y, z, arccosine;
+ __m128 fzeroes, fones, ftwos, ffours, condition;
+
+ pi = _mm_set1_ps(3.14159265358979323846);
+ pio2 = _mm_set1_ps(3.14159265358979323846/2);
+ fzeroes = _mm_setzero_ps();
+ fones = _mm_set1_ps(1.0);
+ ftwos = _mm_set1_ps(2.0);
+ ffours = _mm_set1_ps(4.0);
+
+ for(;number < quarterPoints; number++){
+ aVal = _mm_load_ps(aPtr);
+ d = aVal;
+ aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), aVal);
+ z = aVal;
+ condition = _mm_cmplt_ps(z, fzeroes);
+ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+ x = z;
+ condition = _mm_cmplt_ps(z, fones);
+ x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
+
+ for(i = 0; i < 2; i++)
+ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+ x = _mm_div_ps(fones, x);
+ y = fzeroes;
+ for(j = ACOS_TERMS - 1; j >=0 ; j--)
+ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
+
+ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+ condition = _mm_cmpgt_ps(z, fones);
+
+ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
+ arccosine = y;
+ condition = _mm_cmplt_ps(aVal, fzeroes);
+ arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
+ condition = _mm_cmplt_ps(d, fzeroes);
+ arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
+
+ _mm_store_ps(bPtr, arccosine);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *bPtr++ = acos(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
+
+#endif /* INCLUDED_volk_32f_acos_32f_a_H */
+
+
+#ifndef INCLUDED_volk_32f_acos_32f_u_H
+#define INCLUDED_volk_32f_acos_32f_u_H
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ int i, j;
+
+ __m128 aVal, d, pi, pio2, x, y, z, arccosine;
+ __m128 fzeroes, fones, ftwos, ffours, condition;
+
+ pi = _mm_set1_ps(3.14159265358979323846);
+ pio2 = _mm_set1_ps(3.14159265358979323846/2);
+ fzeroes = _mm_setzero_ps();
+ fones = _mm_set1_ps(1.0);
+ ftwos = _mm_set1_ps(2.0);
+ ffours = _mm_set1_ps(4.0);
+
+ for(;number < quarterPoints; number++){
+ aVal = _mm_loadu_ps(aPtr);
+ d = aVal;
+ aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), aVal);
+ z = aVal;
+ condition = _mm_cmplt_ps(z, fzeroes);
+ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+ x = z;
+ condition = _mm_cmplt_ps(z, fones);
+ x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
+
+ for(i = 0; i < 2; i++)
+ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+ x = _mm_div_ps(fones, x);
+ y = fzeroes;
+
+ for(j = ACOS_TERMS - 1; j >=0 ; j--)
+ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
+
+ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+ condition = _mm_cmpgt_ps(z, fones);
+
+ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
+ arccosine = y;
+ condition = _mm_cmplt_ps(aVal, fzeroes);
+ arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
+ condition = _mm_cmplt_ps(d, fzeroes);
+ arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
+
+ _mm_storeu_ps(bPtr, arccosine);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *bPtr++ = acos(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for unaligned */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_acos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *bPtr++ = acos(*aPtr++);
+ }
+
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_acos_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_asin_32f
+ *
+ * \b Overview
+ *
+ * Computes arcsine of the input vector and stores the results in the output vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_asin_32f(float* bVector, const float* aVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li bVector: The vector where results will be stored.
+ *
+ * \b Example
+ * Calculate common angles around the top half of the unit circle.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * in[0] = 0;
+ * in[1] = 0.5;
+ * in[2] = std::sqrt(2.f)/2.f;
+ * in[3] = std::sqrt(3.f)/2.f;
+ * in[4] = in[5] = 1;
+ * for(unsigned int ii = 6; ii < N; ++ii){
+ * in[ii] = - in[N-ii-1];
+ * }
+ *
+ * volk_32f_asin_32f(out, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("asin(%1.3f) = %1.3f\n", in[ii], out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#include <stdio.h>
+#include <math.h>
+#include <inttypes.h>
+
+/* This is the number of Taylor-series terms to evaluate; increase it for more accuracy. */
+#define ASIN_TERMS 2
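+
+/*
+ * Method used by the kernels below (a reading of the code): asin(x) is
+ * evaluated as atan(x / sqrt(1 - x*x)), with the arctangent approximated by an
+ * ASIN_TERMS-term Taylor series after argument reduction and the sign of the
+ * input restored at the end.
+ */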
+
+#ifndef INCLUDED_volk_32f_asin_32f_a_H
+#define INCLUDED_volk_32f_asin_32f_a_H
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_asin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ int i, j;
+
+ __m128 aVal, pio2, x, y, z, arcsine;
+ __m128 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm_set1_ps(3.14159265358979323846/2);
+ fzeroes = _mm_setzero_ps();
+ fones = _mm_set1_ps(1.0);
+ ftwos = _mm_set1_ps(2.0);
+ ffours = _mm_set1_ps(4.0);
+
+ for(;number < quarterPoints; number++){
+ aVal = _mm_load_ps(aPtr);
+ aVal = _mm_div_ps(aVal, _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
+ z = aVal;
+ condition = _mm_cmplt_ps(z, fzeroes);
+ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+ x = z;
+ condition = _mm_cmplt_ps(z, fones);
+ x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
+
+ for(i = 0; i < 2; i++){
+ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+ }
+ x = _mm_div_ps(fones, x);
+ y = fzeroes;
+ for(j = ASIN_TERMS - 1; j >=0 ; j--){
+ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
+ }
+
+ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+ condition = _mm_cmpgt_ps(z, fones);
+
+ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
+ arcsine = y;
+ condition = _mm_cmplt_ps(aVal, fzeroes);
+ arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
+
+ _mm_store_ps(bPtr, arcsine);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *bPtr++ = asin(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
+
+#endif /* INCLUDED_volk_32f_asin_32f_a_H */
+
+#ifndef INCLUDED_volk_32f_asin_32f_u_H
+#define INCLUDED_volk_32f_asin_32f_u_H
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_asin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ int i, j;
+
+ __m128 aVal, pio2, x, y, z, arcsine;
+ __m128 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm_set1_ps(3.14159265358979323846/2);
+ fzeroes = _mm_setzero_ps();
+ fones = _mm_set1_ps(1.0);
+ ftwos = _mm_set1_ps(2.0);
+ ffours = _mm_set1_ps(4.0);
+
+ for(;number < quarterPoints; number++){
+ aVal = _mm_loadu_ps(aPtr);
+ aVal = _mm_div_ps(aVal, _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
+ z = aVal;
+ condition = _mm_cmplt_ps(z, fzeroes);
+ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+ x = z;
+ condition = _mm_cmplt_ps(z, fones);
+ x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
+
+ for(i = 0; i < 2; i++){
+ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+ }
+ x = _mm_div_ps(fones, x);
+ y = fzeroes;
+ for(j = ASIN_TERMS - 1; j >=0 ; j--){
+ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
+ }
+
+ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+ condition = _mm_cmpgt_ps(z, fones);
+
+ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
+ arcsine = y;
+ condition = _mm_cmplt_ps(aVal, fzeroes);
+ arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
+
+ _mm_storeu_ps(bPtr, arcsine);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *bPtr++ = asin(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for unaligned */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_asin_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *bPtr++ = asin(*aPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_asin_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_atan_32f
+ *
+ * \b Overview
+ *
+ * Computes arctangent of the input vector and stores the results in the output vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_atan_32f(float* bVector, const float* aVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li bVector: The vector where results will be stored.
+ *
+ * \b Example
+ * Calculate common angles around the top half of the unit circle.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * in[0] = 0.f;
+ * in[1] = 1.f/std::sqrt(3.f);
+ * in[2] = 1.f;
+ * in[3] = std::sqrt(3.f);
+ * in[4] = in[5] = 1e99;
+ * for(unsigned int ii = 6; ii < N; ++ii){
+ * in[ii] = - in[N-ii-1];
+ * }
+ *
+ * volk_32f_atan_32f(out, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("atan(%1.3f) = %1.3f\n", in[ii], out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#include <stdio.h>
+#include <math.h>
+#include <inttypes.h>
+
+/* This is the number of Taylor-series terms to evaluate; increase it for more accuracy. */
+#define TERMS 2
+
+#ifndef INCLUDED_volk_32f_atan_32f_a_H
+#define INCLUDED_volk_32f_atan_32f_a_H
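+
+/*
+ * Sketch of the approach used by the SSE4.1 kernels below, written as a plain
+ * scalar helper for illustration only (it is not one of the exported VOLK
+ * protokernels): |x| is folded into w >= 1, w is grown twice via
+ * w -> w + sqrt(1 + w*w) so that u = 1/w becomes small, a TERMS-term Taylor
+ * series for atan(u) is evaluated with Horner's rule, and the quadrant and
+ * sign are then restored.
+ */
+static inline float
+atan_32f_scalar_reduction_sketch(float in)
+{
+  float z = fabsf(in);
+  float w = (z < 1.0f) ? 1.0f / z : z;  /* w >= 1 */
+  float u, y = 0.0f;
+  int i, j;
+
+  for(i = 0; i < 2; i++)
+    w += sqrtf(1.0f + w * w);           /* maps tan(t) to tan(pi/4 + t/2) */
+  u = 1.0f / w;
+
+  for(j = TERMS - 1; j >= 0; j--)       /* Taylor series of atan(u) */
+    y = y * (u * u) + ((j & 1) ? -1.0f : 1.0f) / (float)(2 * j + 1);
+  y *= u * 4.0f;
+
+  if(z > 1.0f)                          /* the series produced pi/2 - atan(z) */
+    y = 1.5707963267948966f - y;
+  return (in < 0.0f) ? -y : y;
+}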
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_atan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ int i, j;
+
+ __m128 aVal, pio2, x, y, z, arctangent;
+ __m128 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm_set1_ps(3.14159265358979323846/2);
+ fzeroes = _mm_setzero_ps();
+ fones = _mm_set1_ps(1.0);
+ ftwos = _mm_set1_ps(2.0);
+ ffours = _mm_set1_ps(4.0);
+
+ for(;number < quarterPoints; number++){
+ aVal = _mm_load_ps(aPtr);
+ z = aVal;
+ condition = _mm_cmplt_ps(z, fzeroes);
+ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+ x = z;
+ condition = _mm_cmplt_ps(z, fones);
+ x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
+
+ for(i = 0; i < 2; i++){
+ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+ }
+ x = _mm_div_ps(fones, x);
+ y = fzeroes;
+ for(j = TERMS - 1; j >=0 ; j--){
+ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
+ }
+
+ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+ condition = _mm_cmpgt_ps(z, fones);
+
+ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
+ arctangent = y;
+ condition = _mm_cmplt_ps(aVal, fzeroes);
+ arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
+
+ _mm_store_ps(bPtr, arctangent);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *bPtr++ = atan(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
+
+#endif /* INCLUDED_volk_32f_atan_32f_a_H */
+
+#ifndef INCLUDED_volk_32f_atan_32f_u_H
+#define INCLUDED_volk_32f_atan_32f_u_H
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_atan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ int i, j;
+
+ __m128 aVal, pio2, x, y, z, arctangent;
+ __m128 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm_set1_ps(3.14159265358979323846/2);
+ fzeroes = _mm_setzero_ps();
+ fones = _mm_set1_ps(1.0);
+ ftwos = _mm_set1_ps(2.0);
+ ffours = _mm_set1_ps(4.0);
+
+ for(;number < quarterPoints; number++){
+ aVal = _mm_loadu_ps(aPtr);
+ z = aVal;
+ condition = _mm_cmplt_ps(z, fzeroes);
+ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+ x = z;
+ condition = _mm_cmplt_ps(z, fones);
+ x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
+
+ for(i = 0; i < 2; i++)
+ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+ x = _mm_div_ps(fones, x);
+ y = fzeroes;
+ for(j = TERMS - 1; j >= 0; j--)
+ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
+
+ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+ condition = _mm_cmpgt_ps(z, fones);
+
+ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
+ arctangent = y;
+ condition = _mm_cmplt_ps(aVal, fzeroes);
+ arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
+
+ _mm_storeu_ps(bPtr, arctangent);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *bPtr++ = atan(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for unaligned */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_atan_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *bPtr++ = atan(*aPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_atan_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_binary_slicer_32i
+ *
+ * \b Overview
+ *
+ * Slices input floats and returns 1 when the input >= 0 and 0
+ * when it is < 0. Results are returned as 32-bit ints.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_binary_slicer_32i(int* cVector, const float* aVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li cVector: The output vector of 32-bit ints.
+ *
+ * \b Example
+ * Generate ints of a 7-bit Barker code from floats.
+ * \code
+ * int N = 7;
+ * unsigned int alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * int32_t* out = (int32_t*)volk_malloc(sizeof(int32_t)*N, alignment);
+ *
+ * in[0] = 0.9f;
+ * in[1] = 1.1f;
+ * in[2] = 0.4f;
+ * in[3] = -0.7f;
+ * in[4] = -1.2f;
+ * in[5] = 0.2f;
+ * in[6] = -0.8f;
+ *
+ * volk_32f_binary_slicer_32i(out, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out(%i) = %i\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_binary_slicer_32i_H
+#define INCLUDED_volk_32f_binary_slicer_32i_H
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_binary_slicer_32i_generic(int* cVector, const float* aVector, unsigned int num_points)
+{
+ int* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ if( *aPtr++ >= 0) {
+ *cPtr++ = 1;
+ }
+ else {
+ *cPtr++ = 0;
+ }
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_binary_slicer_32i_generic_branchless(int* cVector, const float* aVector, unsigned int num_points)
+{
+ int* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++ >= 0);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void
+volk_32f_binary_slicer_32i_a_sse2(int* cVector, const float* aVector, unsigned int num_points)
+{
+ int* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ unsigned int quarter_points = num_points / 4;
+ __m128 a_val, res_f;
+ __m128i res_i, binary_i;
+ __m128 zero_val;
+ zero_val = _mm_set1_ps (0.0f);
+
+ for(number = 0; number < quarter_points; number++){
+ a_val = _mm_load_ps(aPtr);
+
+ res_f = _mm_cmpge_ps (a_val, zero_val);
+ res_i = _mm_cvtps_epi32 (res_f);
+ binary_i = _mm_srli_epi32 (res_i, 31);
+
+ _mm_store_si128((__m128i*)cPtr, binary_i);
+
+ cPtr += 4;
+ aPtr += 4;
+ }
+
+ for(number = quarter_points * 4; number < num_points; number++){
+ if( *aPtr++ >= 0) {
+ *cPtr++ = 1;
+ }
+ else {
+ *cPtr++ = 0;
+ }
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_binary_slicer_32i_a_avx(int* cVector, const float* aVector, unsigned int num_points)
+{
+ int* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ unsigned int quarter_points = num_points / 8;
+ __m256 a_val, res_f, binary_f;
+ __m256i binary_i;
+ __m256 zero_val, one_val;
+ zero_val = _mm256_set1_ps (0.0f);
+ one_val = _mm256_set1_ps (1.0f);
+
+ for(number = 0; number < quarter_points; number++){
+ a_val = _mm256_load_ps(aPtr);
+
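+ // predicate 13 is _CMP_GE_OS (ordered >=): lanes that are >= 0.0f become an
+ // all-ones mask; ANDing with 1.0f turns that mask into 1.0f or 0.0f before
+ // the integer conversion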
+ res_f = _mm256_cmp_ps (a_val, zero_val, 13);
+ binary_f = _mm256_and_ps (res_f, one_val);
+ binary_i = _mm256_cvtps_epi32(binary_f);
+
+ _mm256_store_si256((__m256i *)cPtr, binary_i);
+
+ cPtr += 8;
+ aPtr += 8;
+ }
+
+ for(number = quarter_points * 8; number < num_points; number++){
+ if( *aPtr++ >= 0) {
+ *cPtr++ = 1;
+ }
+ else {
+ *cPtr++ = 0;
+ }
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void
+volk_32f_binary_slicer_32i_u_sse2(int* cVector, const float* aVector, unsigned int num_points)
+{
+ int* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ unsigned int quarter_points = num_points / 4;
+ __m128 a_val, res_f;
+ __m128i res_i, binary_i;
+ __m128 zero_val;
+ zero_val = _mm_set1_ps (0.0f);
+
+ for(number = 0; number < quarter_points; number++){
+ a_val = _mm_loadu_ps(aPtr);
+
+ res_f = _mm_cmpge_ps (a_val, zero_val);
+ res_i = _mm_cvtps_epi32 (res_f);
+ binary_i = _mm_srli_epi32 (res_i, 31);
+
+ _mm_storeu_si128((__m128i*)cPtr, binary_i);
+
+ cPtr += 4;
+ aPtr += 4;
+ }
+
+ for(number = quarter_points * 4; number < num_points; number++){
+ if( *aPtr++ >= 0) {
+ *cPtr++ = 1;
+ }
+ else {
+ *cPtr++ = 0;
+ }
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_binary_slicer_32i_u_avx(int* cVector, const float* aVector, unsigned int num_points)
+{
+ int* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ unsigned int quarter_points = num_points / 8;
+ __m256 a_val, res_f, binary_f;
+ __m256i binary_i;
+ __m256 zero_val, one_val;
+ zero_val = _mm256_set1_ps (0.0f);
+ one_val = _mm256_set1_ps (1.0f);
+
+ for(number = 0; number < quarter_points; number++){
+ a_val = _mm256_loadu_ps(aPtr);
+
+ res_f = _mm256_cmp_ps (a_val, zero_val, 13);
+ binary_f = _mm256_and_ps (res_f, one_val);
+ binary_i = _mm256_cvtps_epi32(binary_f);
+
+ _mm256_storeu_si256((__m256i*)cPtr, binary_i);
+
+ cPtr += 8;
+ aPtr += 8;
+ }
+
+ for(number = quarter_points * 8; number < num_points; number++){
+ if( *aPtr++ >= 0) {
+ *cPtr++ = 1;
+ }
+ else {
+ *cPtr++ = 0;
+ }
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#endif /* INCLUDED_volk_32f_binary_slicer_32i_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_binary_slicer_8i
+ *
+ * \b Overview
+ *
+ * Slices input floats and returns 1 when the input >= 0 and 0
+ * when it is < 0. Results are converted to 8-bit chars.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_binary_slicer_8i(int8_t* cVector, const float* aVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li cVector: The output vector of 8-bit chars.
+ *
+ * \b Example
+ * Generate bytes of a 7-bit Barker code from floats.
+ * \code
+ * int N = 7;
+ * unsigned int alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * int8_t* out = (int8_t*)volk_malloc(sizeof(int8_t)*N, alignment);
+ *
+ * in[0] = 0.9f;
+ * in[1] = 1.1f;
+ * in[2] = 0.4f;
+ * in[3] = -0.7f;
+ * in[4] = -1.2f;
+ * in[5] = 0.2f;
+ * in[6] = -0.8f;
+ *
+ * volk_32f_binary_slicer_8i(out, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ *   printf("out(%i) = %i\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_binary_slicer_8i_H
+#define INCLUDED_volk_32f_binary_slicer_8i_H
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_binary_slicer_8i_generic(int8_t* cVector, const float* aVector,
+ unsigned int num_points)
+{
+ int8_t* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++) {
+ if(*aPtr++ >= 0) {
+ *cPtr++ = 1;
+ }
+ else {
+ *cPtr++ = 0;
+ }
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector, const float* aVector,
+ unsigned int num_points)
+{
+ int8_t* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++ >= 0);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void
+volk_32f_binary_slicer_8i_a_avx2(int8_t* cVector, const float* aVector,
+ unsigned int num_points)
+{
+ int8_t* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+ unsigned int n32points = num_points / 32;
+
+ const __m256 zero_val = _mm256_set1_ps(0.0f);
+ __m256 a0_val, a1_val, a2_val, a3_val;
+ __m256 res0_f, res1_f, res2_f, res3_f;
+ __m256i res0_i, res1_i, res2_i, res3_i;
+ __m256i byte_shuffle = _mm256_set_epi8( 15, 14, 13, 12, 7, 6, 5, 4,
+ 11, 10, 9, 8, 3, 2, 1, 0,
+ 15, 14, 13, 12, 7, 6, 5, 4,
+ 11, 10, 9, 8, 3, 2, 1, 0);
+
+ for(number = 0; number < n32points; number++) {
+ a0_val = _mm256_load_ps(aPtr);
+ a1_val = _mm256_load_ps(aPtr+8);
+ a2_val = _mm256_load_ps(aPtr+16);
+ a3_val = _mm256_load_ps(aPtr+24);
+
+ // compare >= 0; return float
+ res0_f = _mm256_cmp_ps(a0_val, zero_val, 13);
+ res1_f = _mm256_cmp_ps(a1_val, zero_val, 13);
+ res2_f = _mm256_cmp_ps(a2_val, zero_val, 13);
+ res3_f = _mm256_cmp_ps(a3_val, zero_val, 13);
+
+ // convert to 32i and >> 31
+ res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
+ res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
+ res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
+ res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);
+
+ // pack in to 16-bit results
+ res0_i = _mm256_packs_epi32(res0_i, res1_i);
+ res2_i = _mm256_packs_epi32(res2_i, res3_i);
+ // pack in to 8-bit results
+ // res0: (after packs_epi32)
+ // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
+ // res2:
+ // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
+ res0_i = _mm256_packs_epi16(res0_i, res2_i);
+ // shuffle the lanes
+ // res0: (after packs_epi16)
+ // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3
+ // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7
+ // 0, 2, 1, 3 -> 11 01 10 00 (0xd8)
+ res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);
+
+ // shuffle bytes within lanes
+ // res0: (after shuffle_epi8)
+ // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
+ // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
+ res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);
+
+ _mm256_store_si256((__m256i*)cPtr, res0_i);
+ aPtr += 32;
+ cPtr += 32;
+ }
+
+ for(number = n32points * 32; number < num_points; number++) {
+ if( *aPtr++ >= 0) {
+ *cPtr++ = 1;
+ }
+ else {
+ *cPtr++ = 0;
+ }
+ }
+}
+#endif
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void
+volk_32f_binary_slicer_8i_u_avx2(int8_t* cVector, const float* aVector,
+ unsigned int num_points)
+{
+ int8_t* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+ unsigned int n32points = num_points / 32;
+
+ const __m256 zero_val = _mm256_set1_ps(0.0f);
+ __m256 a0_val, a1_val, a2_val, a3_val;
+ __m256 res0_f, res1_f, res2_f, res3_f;
+ __m256i res0_i, res1_i, res2_i, res3_i;
+ __m256i byte_shuffle = _mm256_set_epi8( 15, 14, 13, 12, 7, 6, 5, 4,
+ 11, 10, 9, 8, 3, 2, 1, 0,
+ 15, 14, 13, 12, 7, 6, 5, 4,
+ 11, 10, 9, 8, 3, 2, 1, 0);
+
+ for(number = 0; number < n32points; number++) {
+ a0_val = _mm256_loadu_ps(aPtr);
+ a1_val = _mm256_loadu_ps(aPtr+8);
+ a2_val = _mm256_loadu_ps(aPtr+16);
+ a3_val = _mm256_loadu_ps(aPtr+24);
+
+ // compare >= 0; return float
+ res0_f = _mm256_cmp_ps(a0_val, zero_val, 13);
+ res1_f = _mm256_cmp_ps(a1_val, zero_val, 13);
+ res2_f = _mm256_cmp_ps(a2_val, zero_val, 13);
+ res3_f = _mm256_cmp_ps(a3_val, zero_val, 13);
+
+ // convert to 32i and >> 31
+ res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
+ res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
+ res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
+ res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);
+
+ // pack in to 16-bit results
+ res0_i = _mm256_packs_epi32(res0_i, res1_i);
+ res2_i = _mm256_packs_epi32(res2_i, res3_i);
+ // pack in to 8-bit results
+ // res0: (after packs_epi32)
+ // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
+ // res2:
+ // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
+ res0_i = _mm256_packs_epi16(res0_i, res2_i);
+ // shuffle the lanes
+ // res0: (after packs_epi16)
+ // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3
+ // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7
+ // 0, 2, 1, 3 -> 11 01 10 00 (0xd8)
+ res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);
+
+ // shuffle bytes within lanes
+ // res0: (after shuffle_epi8)
+ // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
+ // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
+ res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);
+
+ _mm256_storeu_si256((__m256i*)cPtr, res0_i);
+ aPtr += 32;
+ cPtr += 32;
+ }
+
+ for(number = n32points * 32; number < num_points; number++) {
+ if( *aPtr++ >= 0) {
+ *cPtr++ = 1;
+ }
+ else {
+ *cPtr++ = 0;
+ }
+ }
+}
+#endif
+
+
+
+#ifdef LV_HAVE_SSE2
+
+#include <emmintrin.h>
+
+static inline void
+volk_32f_binary_slicer_8i_a_sse2(int8_t* cVector, const float* aVector,
+ unsigned int num_points)
+{
+ int8_t* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ unsigned int n16points = num_points / 16;
+ __m128 a0_val, a1_val, a2_val, a3_val;
+ __m128 res0_f, res1_f, res2_f, res3_f;
+ __m128i res0_i, res1_i, res2_i, res3_i;
+ __m128 zero_val;
+ zero_val = _mm_set1_ps(0.0f);
+
+ for(number = 0; number < n16points; number++) {
+ a0_val = _mm_load_ps(aPtr);
+ a1_val = _mm_load_ps(aPtr+4);
+ a2_val = _mm_load_ps(aPtr+8);
+ a3_val = _mm_load_ps(aPtr+12);
+
+ // compare >= 0; return float
+ res0_f = _mm_cmpge_ps(a0_val, zero_val);
+ res1_f = _mm_cmpge_ps(a1_val, zero_val);
+ res2_f = _mm_cmpge_ps(a2_val, zero_val);
+ res3_f = _mm_cmpge_ps(a3_val, zero_val);
+
+ // convert to 32i and >> 31
+ res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
+ res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
+ res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
+ res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);
+
+ // pack into 16-bit results
+ res0_i = _mm_packs_epi32(res0_i, res1_i);
+ res2_i = _mm_packs_epi32(res2_i, res3_i);
+
+ // pack into 8-bit results
+ res0_i = _mm_packs_epi16(res0_i, res2_i);
+
+ _mm_store_si128((__m128i*)cPtr, res0_i);
+
+ cPtr += 16;
+ aPtr += 16;
+ }
+
+ for(number = n16points * 16; number < num_points; number++) {
+ if( *aPtr++ >= 0) {
+ *cPtr++ = 1;
+ }
+ else {
+ *cPtr++ = 0;
+ }
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void
+volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector, const float* aVector,
+ unsigned int num_points)
+{
+ int8_t* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ unsigned int n16points = num_points / 16;
+ __m128 a0_val, a1_val, a2_val, a3_val;
+ __m128 res0_f, res1_f, res2_f, res3_f;
+ __m128i res0_i, res1_i, res2_i, res3_i;
+ __m128 zero_val;
+ zero_val = _mm_set1_ps (0.0f);
+
+ for(number = 0; number < n16points; number++) {
+ a0_val = _mm_loadu_ps(aPtr);
+ a1_val = _mm_loadu_ps(aPtr+4);
+ a2_val = _mm_loadu_ps(aPtr+8);
+ a3_val = _mm_loadu_ps(aPtr+12);
+
+ // compare >= 0; return float
+ res0_f = _mm_cmpge_ps(a0_val, zero_val);
+ res1_f = _mm_cmpge_ps(a1_val, zero_val);
+ res2_f = _mm_cmpge_ps(a2_val, zero_val);
+ res3_f = _mm_cmpge_ps(a3_val, zero_val);
+
+ // convert to 32i and >> 31
+ res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
+ res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
+ res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
+ res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);
+
+ // pack into 16-bit results
+ res0_i = _mm_packs_epi32(res0_i, res1_i);
+ res2_i = _mm_packs_epi32(res2_i, res3_i);
+
+ // pack into 8-bit results
+ res0_i = _mm_packs_epi16(res0_i, res2_i);
+
+ _mm_storeu_si128((__m128i*)cPtr, res0_i);
+
+ cPtr += 16;
+ aPtr += 16;
+ }
+
+ for(number = n16points * 16; number < num_points; number++) {
+ if( *aPtr++ >= 0) {
+ *cPtr++ = 1;
+ }
+ else {
+ *cPtr++ = 0;
+ }
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_32f_binary_slicer_8i_neon(int8_t* cVector, const float* aVector,
+ unsigned int num_points)
+{
+ int8_t* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+ unsigned int n16points = num_points / 16;
+
+ float32x4x2_t input_val0, input_val1;
+ float32x4_t zero_val;
+ uint32x4x2_t res0_u32, res1_u32;
+ uint16x4x2_t res0_u16x4, res1_u16x4;
+ uint16x8x2_t res_u16x8;
+ uint8x8x2_t res_u8;
+ uint8x8_t one;
+
+ zero_val = vdupq_n_f32(0.0);
+ one = vdup_n_u8(0x01);
+
+ // TODO: this is a good candidate for asm because the vcombines
+ // can be eliminated simply by picking dst registers that are
+ // adjacent.
+ for(number = 0; number < n16points; number++) {
+ input_val0 = vld2q_f32(aPtr);
+ input_val1 = vld2q_f32(aPtr+8);
+
+ // test against 0; return uint32
+ res0_u32.val[0] = vcgeq_f32(input_val0.val[0], zero_val);
+ res0_u32.val[1] = vcgeq_f32(input_val0.val[1], zero_val);
+ res1_u32.val[0] = vcgeq_f32(input_val1.val[0], zero_val);
+ res1_u32.val[1] = vcgeq_f32(input_val1.val[1], zero_val);
+
+ // narrow uint32 -> uint16 followed by combine to 8-element vectors
+ res0_u16x4.val[0] = vmovn_u32(res0_u32.val[0]);
+ res0_u16x4.val[1] = vmovn_u32(res0_u32.val[1]);
+ res1_u16x4.val[0] = vmovn_u32(res1_u32.val[0]);
+ res1_u16x4.val[1] = vmovn_u32(res1_u32.val[1]);
+
+ res_u16x8.val[0] = vcombine_u16(res0_u16x4.val[0], res1_u16x4.val[0]);
+ res_u16x8.val[1] = vcombine_u16(res0_u16x4.val[1], res1_u16x4.val[1]);
+
+ // narrow uint16x8 -> uint8x8
+ res_u8.val[0] = vmovn_u16(res_u16x8.val[0]);
+ res_u8.val[1] = vmovn_u16(res_u16x8.val[1]);
+ // We *could* load twice as much data and do another vcombine here to get a
+ // uint8x16x2 vector, and then issue only 2 vandqs and a single store, but
+ // that turns out to be ~16% slower than this version on a zc702; possibly
+ // register contention in the GCC scheduler slows it down, and a hand-written
+ // asm version with quad-word u8 registers is much faster.
+
+ res_u8.val[0] = vand_u8(one, res_u8.val[0]);
+ res_u8.val[1] = vand_u8(one, res_u8.val[1]);
+
+ vst2_u8((unsigned char*)cPtr, res_u8);
+ cPtr += 16;
+ aPtr += 16;
+
+ }
+
+ for(number = n16points * 16; number < num_points; number++) {
+ if(*aPtr++ >= 0) {
+ *cPtr++ = 1;
+ }
+ else {
+ *cPtr++ = 0;
+ }
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#endif /* INCLUDED_volk_32f_binary_slicer_8i_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_convert_64f
+ *
+ * \b Overview
+ *
+ * Converts float values into doubles.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_convert_64f(double* outputVector, const float* inputVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The vector of floats to convert to doubles.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li outputVector: The output vector of converted doubles.
+ *
+ * \b Example
+ * Generate floats and convert them to doubles.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * double* out = (double*)volk_malloc(sizeof(double)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * in[ii] = (float)ii;
+ * }
+ *
+ * volk_32f_convert_64f(out, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out(%i) = %g\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+
+#ifndef INCLUDED_volk_32f_convert_64f_u_H
+#define INCLUDED_volk_32f_convert_64f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_convert_64f_u_avx(double* outputVector, const float* inputVector, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ double* outputVectorPtr = outputVector;
+ __m256d ret;
+ __m128 inputVal;
+
+ for(;number < quarterPoints; number++){
+ inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+ ret = _mm256_cvtps_pd(inputVal);
+ _mm256_storeu_pd(outputVectorPtr, ret);
+
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ outputVector[number] = (double)(inputVector[number]);
+ }
+}
+
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_32f_convert_64f_u_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ double* outputVectorPtr = outputVector;
+ __m128d ret;
+ __m128 inputVal;
+
+ for(;number < quarterPoints; number++){
+ inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+ ret = _mm_cvtps_pd(inputVal);
+
+ _mm_storeu_pd(outputVectorPtr, ret);
+ outputVectorPtr += 2;
+
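+ // move the upper two floats into the low half so they can also be widened to doubles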
+ inputVal = _mm_movehl_ps(inputVal, inputVal);
+
+ ret = _mm_cvtps_pd(inputVal);
+
+ _mm_storeu_pd(outputVectorPtr, ret);
+ outputVectorPtr += 2;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ outputVector[number] = (double)(inputVector[number]);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_convert_64f_generic(double* outputVector, const float* inputVector, unsigned int num_points){
+ double* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((double)(*inputVectorPtr++));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_convert_64f_u_H */
+
+
+#ifndef INCLUDED_volk_32f_convert_64f_a_H
+#define INCLUDED_volk_32f_convert_64f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_convert_64f_a_avx(double* outputVector, const float* inputVector, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ double* outputVectorPtr = outputVector;
+ __m256d ret;
+ __m128 inputVal;
+
+ for(;number < quarterPoints; number++){
+ inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+ ret = _mm256_cvtps_pd(inputVal);
+ _mm256_store_pd(outputVectorPtr, ret);
+
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ outputVector[number] = (double)(inputVector[number]);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_32f_convert_64f_a_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ double* outputVectorPtr = outputVector;
+ __m128d ret;
+ __m128 inputVal;
+
+ for(;number < quarterPoints; number++){
+ inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+ ret = _mm_cvtps_pd(inputVal);
+
+ _mm_store_pd(outputVectorPtr, ret);
+ outputVectorPtr += 2;
+
+ inputVal = _mm_movehl_ps(inputVal, inputVal);
+
+ ret = _mm_cvtps_pd(inputVal);
+
+ _mm_store_pd(outputVectorPtr, ret);
+ outputVectorPtr += 2;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ outputVector[number] = (double)(inputVector[number]);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_convert_64f_a_generic(double* outputVector, const float* inputVector, unsigned int num_points){
+ double* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((double)(*inputVectorPtr++));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_convert_64f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_cos_32f
+ *
+ * \b Overview
+ *
+ * Computes cosine of the input vector and stores results in the output vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_cos_32f(float* bVector, const float* aVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li bVector: The vector where results will be stored.
+ *
+ * \b Example
+ * Calculate cos(theta) for common angles.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * in[0] = 0.000;
+ * in[1] = 0.524;
+ * in[2] = 0.786;
+ * in[3] = 1.047;
+ * in[4] = 1.571;
+ * in[5] = 1.571;
+ * in[6] = 2.094;
+ * in[7] = 2.356;
+ * in[8] = 2.618;
+ * in[9] = 3.142;
+ *
+ * volk_32f_cos_32f(out, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("cos(%1.3f) = %1.3f\n", in[ii], out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#include <stdio.h>
+#include <math.h>
+#include <inttypes.h>
+#include <volk/volk_common.h> /* for union bit128 used by the SSE4.1 kernel below */
+
+#ifndef INCLUDED_volk_32f_cos_32f_a_H
+#define INCLUDED_volk_32f_cos_32f_a_H
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_cos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ unsigned int i = 0;
+
+ __m128 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
+ __m128 sine, cosine;
+ __m128i q, ones, twos, fours;
+
+ m4pi = _mm_set1_ps(1.273239544735162542821171882678754627704620361328125);
+ pio4A = _mm_set1_ps(0.7853981554508209228515625);
+ pio4B = _mm_set1_ps(0.794662735614792836713604629039764404296875e-8);
+ pio4C = _mm_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
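+ // pio4A + pio4B + pio4C together represent pi/4 to extended precision
+ // (a Cody-Waite style split), so the three subtractions below lose little accuracy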
+ ffours = _mm_set1_ps(4.0);
+ ftwos = _mm_set1_ps(2.0);
+ fones = _mm_set1_ps(1.0);
+ fzeroes = _mm_setzero_ps();
+ __m128i zeroes = _mm_set1_epi32(0);
+ ones = _mm_set1_epi32(1);
+ __m128i allones = _mm_set1_epi32(0xffffffff);
+ twos = _mm_set1_epi32(2);
+ fours = _mm_set1_epi32(4);
+
+ cp1 = _mm_set1_ps(1.0);
+ cp2 = _mm_set1_ps(0.08333333333333333);
+ cp3 = _mm_set1_ps(0.002777777777777778);
+ cp4 = _mm_set1_ps(4.96031746031746e-05);
+ cp5 = _mm_set1_ps(5.511463844797178e-07);
+ union bit128 condition1;
+ union bit128 condition3;
+
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_load_ps(aPtr);
+ // s = fabs(aVal)
+ s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
+ // q = (int)(s * (4/pi)) = floor(s / (pi/4)), the index of the pi/4 octant
+ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
+ // r = q + (q & 1): round q up to even so that r*(pi/4) can be subtracted from s for argument reduction
+ r = _mm_cvtepi32_ps(_mm_add_epi32(q, _mm_and_si128(q, ones)));
+
+ s = _mm_sub_ps(s, _mm_mul_ps(r, pio4A));
+ s = _mm_sub_ps(s, _mm_mul_ps(r, pio4B));
+ s = _mm_sub_ps(s, _mm_mul_ps(r, pio4C));
+
+ s = _mm_div_ps(s, _mm_set1_ps(8.0)); // divide by 2^N (N = 3); the three angle-doubling passes below undo this
+ s = _mm_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
+
+ for(i = 0; i < 3; i++)
+ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+ s = _mm_div_ps(s, ftwos);
+
+ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+ cosine = _mm_sub_ps(fones, s);
+
+ // if(((q+1)&2) != 0) { cosine=sine;}
+ condition1.int_vec = _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, ones), twos), zeroes);
+ condition1.int_vec = _mm_xor_si128(allones, condition1.int_vec);
+
+ // if(((q+2)&4) != 0) { cosine = -cosine;}
+ condition3.int_vec = _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, twos), fours), zeroes);
+ condition3.int_vec = _mm_xor_si128(allones, condition3.int_vec);
+
+ cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1.float_vec));
+ cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3.float_vec));
+ _mm_store_ps(bPtr, cosine);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *bPtr++ = cos(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
+
+#endif /* INCLUDED_volk_32f_cos_32f_a_H */
+
+
+
+#ifndef INCLUDED_volk_32f_cos_32f_u_H
+#define INCLUDED_volk_32f_cos_32f_u_H
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_cos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ unsigned int i = 0;
+
+ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
+ __m128 sine, cosine, condition1, condition3;
+ __m128i q, r, ones, twos, fours;
+
+ m4pi = _mm_set1_ps(1.273239545);
+ pio4A = _mm_set1_ps(0.78515625);
+ pio4B = _mm_set1_ps(0.241876e-3);
+ ffours = _mm_set1_ps(4.0);
+ ftwos = _mm_set1_ps(2.0);
+ fones = _mm_set1_ps(1.0);
+ fzeroes = _mm_setzero_ps();
+ ones = _mm_set1_epi32(1);
+ twos = _mm_set1_epi32(2);
+ fours = _mm_set1_epi32(4);
+
+ cp1 = _mm_set1_ps(1.0);
+ cp2 = _mm_set1_ps(0.83333333e-1);
+ cp3 = _mm_set1_ps(0.2777778e-2);
+ cp4 = _mm_set1_ps(0.49603e-4);
+ cp5 = _mm_set1_ps(0.551e-6);
+
+ for(;number < quarterPoints; number++){
+ aVal = _mm_loadu_ps(aPtr);
+ s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
+ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
+ r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+ s = _mm_div_ps(s, _mm_set1_ps(8.0)); // divide by 2^N (N = 3); the three angle-doubling passes below undo this
+ s = _mm_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
+
+ for(i = 0; i < 3; i++){
+ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+ }
+ s = _mm_div_ps(s, ftwos);
+
+ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+ cosine = _mm_sub_ps(fones, s);
+
+ condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
+
+ condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
+
+ cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
+ cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
+ _mm_storeu_ps(bPtr, cosine);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *bPtr++ = cos(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for unaligned */
+
+
+#ifdef LV_HAVE_GENERIC
+
+/*
+ * For derivation see
+ * Shibata, Naoki, "Efficient evaluation methods of elementary functions
+ * suitable for SIMD computation," Springer, 2010.
+ */
+static inline void
+volk_32f_cos_32f_generic_fast(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ float m4pi = 1.273239544735162542821171882678754627704620361328125;
+ float pio4A = 0.7853981554508209228515625;
+ float pio4B = 0.794662735614792836713604629039764404296875e-8;
+ float pio4C = 0.306161699786838294306516483068750264552437361480769e-16;
+ int N = 3; // order of argument reduction
+
+ unsigned int number;
+ for(number = 0; number < num_points; number++){
+ float s = fabs(*aPtr);
+ int q = (int)(s * m4pi);
+ int r = q + (q&1);
+ s -= r * pio4A;
+ s -= r * pio4B;
+ s -= r * pio4C;
+
+ s = s * 0.125; // multiply by 2^-N, with N = 3 argument-reduction steps
+ s = s*s;
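+ // Taylor series in s = x^2: the polynomial below evaluates 2*(1 - cos(x)) of the reduced argument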
+ s = ((((s/1814400. - 1.0/20160.0)*s + 1.0/360.0)*s - 1.0/12.0)*s + 1.0)*s;
+
+ int i;
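+ // each pass doubles the angle: if s = 2*(1 - cos(t)) then (4 - s)*s = 2*(1 - cos(2t))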
+ for(i=0; i < N; ++i) {
+ s = (4.0-s)*s;
+ }
+ s = s/2.0;
+
+ float sine = sqrt((2.0-s)*s);
+ float cosine = 1-s;
+
+ if (((q+1) & 2) != 0) {
+ s = cosine;
+ cosine = sine;
+ sine = s;
+ }
+ if (((q+2) & 4) != 0) {
+ cosine = -cosine;
+ }
+ *bPtr = cosine;
+ bPtr++;
+ aPtr++;
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_cos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for(; number < num_points; number++){
+ *bPtr++ = cos(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_cos_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_expfast_32f
+ *
+ * \b Overview
+ *
+ * Computes the exponential of the input vector and stores the results in the
+ * output vector. This uses a fast exp approximation with a maximum relative
+ * error of about 7%.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_expfast_32f(float* bVector, const float* aVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: Input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li bVector: The output vector.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * in[ii] = std::log((float)ii);
+ * }
+ *
+ * volk_32f_expfast_32f(out, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out(%i) = %f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#include <stdio.h>
+#include <math.h>
+#include <inttypes.h>
+
+#define Mln2 0.6931471805f
+#define A 8388608.0f
+#define B 1065353216.0f
+#define C 60801.0f
+
+
+#ifndef INCLUDED_volk_32f_expfast_32f_a_H
+#define INCLUDED_volk_32f_expfast_32f_a_H
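+
+/*
+ * The kernels below use a Schraudolph-style exponential approximation:
+ * writing the integer (A/Mln2)*x + (B - C) into the bit pattern of an
+ * IEEE-754 single yields approximately exp(x), since A = 2^23 scales
+ * x/ln(2) into the exponent field, B = 127*2^23 supplies the exponent bias,
+ * and C is an empirical correction that keeps the relative error near 7%.
+ * A scalar sketch of the same computation, for illustration only (not one of
+ * the exported VOLK protokernels):
+ */
+static inline float
+expfast_32f_scalar_sketch(float x)
+{
+  union { int32_t i; float f; } u;
+  /* round as _mm_cvtps_epi32 does, then reinterpret the bits as a float */
+  u.i = (int32_t)lrintf((A / Mln2) * x + (B - C));
+  return u.f;
+}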
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+
+static inline void
+volk_32f_expfast_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ __m256 aVal, bVal, a, b;
+ __m256i exp;
+ a = _mm256_set1_ps(A/Mln2);
+ b = _mm256_set1_ps(B-C);
+
+ for(;number < eighthPoints; number++){
+ aVal = _mm256_load_ps(aPtr);
+ exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a,aVal), b));
+ bVal = _mm256_castsi256_ps(exp);
+
+ _mm256_store_ps(bPtr, bVal);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(;number < num_points; number++){
+ *bPtr++ = expf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX for aligned */
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_expfast_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 aVal, bVal, a, b;
+ __m128i exp;
+ a = _mm_set1_ps(A/Mln2);
+ b = _mm_set1_ps(B-C);
+
+ for(;number < quarterPoints; number++){
+ aVal = _mm_load_ps(aPtr);
+ exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a,aVal), b));
+ bVal = _mm_castsi128_ps(exp);
+
+ _mm_store_ps(bPtr, bVal);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *bPtr++ = expf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
+
+#endif /* INCLUDED_volk_32f_expfast_32f_a_H */
+
+
+
+#ifndef INCLUDED_volk_32f_expfast_32f_u_H
+#define INCLUDED_volk_32f_expfast_32f_u_H
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_expfast_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ __m256 aVal, bVal, a, b;
+ __m256i exp;
+ a = _mm256_set1_ps(A/Mln2);
+ b = _mm256_set1_ps(B-C);
+
+ for(;number < eighthPoints; number++){
+ aVal = _mm256_loadu_ps(aPtr);
+ exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a,aVal), b));
+ bVal = _mm256_castsi256_ps(exp);
+
+ _mm256_storeu_ps(bPtr, bVal);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(;number < num_points; number++){
+ *bPtr++ = expf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX for unaligned */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_expfast_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 aVal, bVal, a, b;
+ __m128i exp;
+ a = _mm_set1_ps(A/Mln2);
+ b = _mm_set1_ps(B-C);
+
+ for(;number < quarterPoints; number++){
+ aVal = _mm_loadu_ps(aPtr);
+ exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a,aVal), b));
+ bVal = _mm_castsi128_ps(exp);
+
+ _mm_storeu_ps(bPtr, bVal);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *bPtr++ = expf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for unaligned */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_expfast_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *bPtr++ = expf(*aPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_expfast_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_index_max_16u
+ *
+ * \b Overview
+ *
+ * Returns Argmax_i x[i]. Finds and returns the index which contains
+ * the maximum value in the given vector.
+ *
+ * Note that num_points is a uint32_t, but the returned index is a
+ * uint16_t. For vectors longer than the maximum value of a uint16_t
+ * (65535), anything at or beyond that index cannot be reported. The
+ * kernel therefore caps num_points at this maximum value.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_index_max_16u(uint16_t* target, const float* src0, uint32_t num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li src0: The input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li target: The index of the maximum value in the input buffer.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * uint32_t alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * uint16_t* out = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment);
+ *
+ * for(uint32_t ii = 0; ii < N; ++ii){
+ * float x = (float)ii;
+ * // a parabola with a maximum at x=4
+ * in[ii] = -(x-4) * (x-4) + 5;
+ * }
+ *
+ * volk_32f_index_max_16u(out, in, N);
+ *
+ * printf("maximum is %1.2f at index %u\n", in[*out], *out);
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_index_max_16u_a_H
+#define INCLUDED_volk_32f_index_max_16u_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_index_max_16u_a_sse4_1(uint16_t* target, const float* src0,
+ uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)src0;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
+
+ float max = src0[0];
+ float index = 0;
+ __m128 maxValues = _mm_set1_ps(max);
+ __m128 maxValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+ for(;number < quarterPoints; number++){
+
+ currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+ compareResults = _mm_cmpgt_ps(maxValues, currentValues);
+
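+ // _mm_blendv_ps(a, b, mask) selects b where the mask is set: keep the old
+ // index and value in lanes where the previous maximum is still greater.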
+ maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
+ maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults);
+ }
+
+ // Calculate the largest value from the remaining 4 points
+ _mm_store_ps(maxValuesBuffer, maxValues);
+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for(number = 0; number < 4; number++){
+ if(maxValuesBuffer[number] > max){
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ if(src0[number] > max){
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (uint16_t)index;
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+
+#ifdef LV_HAVE_SSE
+
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_index_max_16u_a_sse(uint16_t* target, const float* src0,
+ uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)src0;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
+
+ float max = src0[0];
+ float index = 0;
+ __m128 maxValues = _mm_set1_ps(max);
+ __m128 maxValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+ for(;number < quarterPoints; number++){
+
+ currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+ compareResults = _mm_cmpgt_ps(maxValues, currentValues);
+
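+ // Emulate blendv with and/andnot/or: keep the previous max value and index
+ // in lanes where the old maximum is still greater, take the current lane otherwise.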
+ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes));
+
+ maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues));
+ }
+
+ // Calculate the largest value from the remaining 4 points
+ _mm_store_ps(maxValuesBuffer, maxValues);
+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for(number = 0; number < 4; number++){
+ if(maxValuesBuffer[number] > max){
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ if(src0[number] > max){
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (uint16_t)index;
+}
+
+#endif /*LV_HAVE_SSE*/
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_index_max_16u_generic(uint16_t* target, const float* src0,
+ uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+ float max = src0[0];
+ uint16_t index = 0;
+
+ uint32_t i = 1;
+
+ for(; i < num_points; ++i) {
+ if(src0[i] > max) {
+ index = i;
+ max = src0[i];
+ }
+ }
+ target[0] = index;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32f_index_max_16u_a_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2016 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_index_max_32u
+ *
+ * \b Overview
+ *
+ * Returns Argmax_i x[i]. Finds and returns the index which contains the maximum value in the given vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_index_max_32u(uint32_t* target, const float* src0, uint32_t num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li src0: The input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li target: The index of the maximum value in the input buffer.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * uint32_t alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * uint32_t* out = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment);
+ *
+ * for(uint32_t ii = 0; ii < N; ++ii){
+ * float x = (float)ii;
+ * // a parabola with a maximum at x=4
+ * in[ii] = -(x-4) * (x-4) + 5;
+ * }
+ *
+ * volk_32f_index_max_32u(out, in, N);
+ *
+ * printf("maximum is %1.2f at index %u\n", in[*out], *out);
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_index_max_32u_a_H
+#define INCLUDED_volk_32f_index_max_32u_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
+{
+ if(num_points > 0){
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)src0;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
+
+ float max = src0[0];
+ float index = 0;
+ __m128 maxValues = _mm_set1_ps(max);
+ __m128 maxValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+ for(;number < quarterPoints; number++){
+
+ currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+ compareResults = _mm_cmpgt_ps(maxValues, currentValues);
+
+ maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
+ maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults);
+ }
+
+ // Calculate the largest value from the remaining 4 points
+ _mm_store_ps(maxValuesBuffer, maxValues);
+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for(number = 0; number < 4; number++){
+ if(maxValuesBuffer[number] > max){
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ if(src0[number] > max){
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (uint32_t)index;
+ }
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void volk_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
+{
+ if(num_points > 0)
+ {
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)src0;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
+
+ float max = src0[0];
+ float index = 0;
+ __m128 maxValues = _mm_set1_ps(max);
+ __m128 maxValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+ for(;number < quarterPoints; number++)
+ {
+ currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+ compareResults = _mm_cmpgt_ps(maxValues, currentValues);
+ maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
+ maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults);
+ }
+
+ // Calculate the largest value from the remaining 4 points
+ _mm_store_ps(maxValuesBuffer, maxValues);
+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for(number = 0; number < 4; number++)
+ {
+ if(maxValuesBuffer[number] > max)
+ {
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++)
+ {
+ if(src0[number] > max)
+ {
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (uint32_t)index;
+ }
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+
+#ifdef LV_HAVE_SSE
+
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points)
+{
+ if(num_points > 0){
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)src0;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
+
+ float max = src0[0];
+ float index = 0;
+ __m128 maxValues = _mm_set1_ps(max);
+ __m128 maxValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+ for(;number < quarterPoints; number++){
+
+ currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+ compareResults = _mm_cmpgt_ps(maxValues, currentValues);
+
+ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes));
+
+ maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues));
+ }
+
+ // Calculate the largest value from the remaining 4 points
+ _mm_store_ps(maxValuesBuffer, maxValues);
+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for(number = 0; number < 4; number++){
+ if(maxValuesBuffer[number] > max){
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ if(src0[number] > max){
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (uint32_t)index;
+ }
+}
+
+#endif /*LV_HAVE_SSE*/
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points)
+{
+ if(num_points > 0)
+ {
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)src0;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
+
+ float max = src0[0];
+ float index = 0;
+ __m128 maxValues = _mm_set1_ps(max);
+ __m128 maxValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+ for(;number < quarterPoints; number++)
+ {
+ currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+ compareResults = _mm_cmpgt_ps(maxValues, currentValues);
+ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes));
+ maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues));
+ }
+
+ // Calculate the largest value from the remaining 4 points
+ _mm_store_ps(maxValuesBuffer, maxValues);
+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for(number = 0; number < 4; number++)
+ {
+ if(maxValuesBuffer[number] > max)
+ {
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++)
+ {
+ if(src0[number] > max)
+ {
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (uint32_t)index;
+ }
+}
+
+#endif /*LV_HAVE_SSE*/
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points)
+{
+ if(num_points > 0)
+ {
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 8;
+
+ float* inputPtr = (float*)src0;
+
+ __m256 indexIncrementValues = _mm256_set1_ps(8);
+ __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
+
+ float max = src0[0];
+ float index = 0;
+ __m256 maxValues = _mm256_set1_ps(max);
+ __m256 maxValuesIndex = _mm256_setzero_ps();
+ __m256 compareResults;
+ __m256 currentValues;
+
+ __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
+ __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
+
+ for(;number < quarterPoints; number++)
+ {
+ currentValues = _mm256_load_ps(inputPtr); inputPtr += 8;
+ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
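+ // 0x1e is _CMP_GT_OQ: ordered, non-signaling greater-than comparison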
+ compareResults = _mm256_cmp_ps(maxValues, currentValues, 0x1e);
+ maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
+ maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults);
+ }
+
+ // Calculate the largest value from the remaining 8 points
+ _mm256_store_ps(maxValuesBuffer, maxValues);
+ _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for(number = 0; number < 8; number++)
+ {
+ if(maxValuesBuffer[number] > max)
+ {
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 8;
+ for(;number < num_points; number++)
+ {
+ if(src0[number] > max)
+ {
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (uint32_t)index;
+ }
+}
+
+#endif /*LV_HAVE_AVX*/
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points)
+{
+ if(num_points > 0)
+ {
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 8;
+
+ float* inputPtr = (float*)src0;
+
+ __m256 indexIncrementValues = _mm256_set1_ps(8);
+ __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
+
+ float max = src0[0];
+ float index = 0;
+ __m256 maxValues = _mm256_set1_ps(max);
+ __m256 maxValuesIndex = _mm256_setzero_ps();
+ __m256 compareResults;
+ __m256 currentValues;
+
+ __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
+ __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
+
+ for(;number < quarterPoints; number++)
+ {
+ currentValues = _mm256_loadu_ps(inputPtr); inputPtr += 8;
+ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
+ compareResults = _mm256_cmp_ps(maxValues, currentValues, 0x1e);
+ maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
+ maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults);
+ }
+
+ // Calculate the largest value from the remaining 8 points
+ _mm256_store_ps(maxValuesBuffer, maxValues);
+ _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for(number = 0; number < 8; number++)
+ {
+ if(maxValuesBuffer[number] > max)
+ {
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 8;
+ for(;number < num_points; number++)
+ {
+ if(src0[number] > max)
+ {
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (uint32_t)index;
+ }
+}
+
+#endif /*LV_HAVE_AVX*/
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points)
+{
+ if(num_points > 0)
+ {
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)src0;
+ float32x4_t indexIncrementValues = vdupq_n_f32(4);
+ __VOLK_ATTR_ALIGNED(16) float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f };
+ float32x4_t currentIndexes = vld1q_f32(currentIndexes_float);
+
+ float max = src0[0];
+ float index = 0;
+ float32x4_t maxValues = vdupq_n_f32(max);
+ uint32x4_t maxValuesIndex = vmovq_n_u32(0);
+ uint32x4_t compareResults;
+ uint32x4_t currentIndexes_u;
+ float32x4_t currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+ for(;number < quarterPoints; number++)
+ {
+ currentValues = vld1q_f32(inputPtr); inputPtr += 4;
+ currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues);
+ currentIndexes_u = vcvtq_u32_f32(currentIndexes);
+ compareResults = vcgtq_f32( maxValues, currentValues);
+ maxValuesIndex = vorrq_u32( vandq_u32( compareResults, maxValuesIndex ), vbicq_u32(currentIndexes_u, compareResults) );
+ maxValues = vmaxq_f32(currentValues, maxValues);
+ }
+
+ // Calculate the largest value from the remaining 4 points
+ vst1q_f32(maxValuesBuffer, maxValues);
+ vst1q_f32(maxIndexesBuffer, vcvtq_f32_u32(maxValuesIndex));
+ for(number = 0; number < 4; number++)
+ {
+ if(maxValuesBuffer[number] > max)
+ {
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++)
+ {
+ if(src0[number] > max)
+ {
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (uint32_t)index;
+ }
+}
+
+#endif /*LV_HAVE_NEON*/
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_index_max_32u_generic(uint32_t* target, const float* src0, uint32_t num_points)
+{
+ if(num_points > 0){
+ float max = src0[0];
+ uint32_t index = 0;
+
+ uint32_t i = 1;
+
+ for(; i < num_points; ++i) {
+ if(src0[i] > max){
+ index = i;
+ max = src0[i];
+ }
+ }
+ target[0] = index;
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32f_index_max_32u_a_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2013, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_invsqrt_32f
+ *
+ * \b Overview
+ *
+ * Computes the inverse square root of the input vector and stores
+ * result in the output vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_invsqrt_32f(float* cVector, const float* aVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: the input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * in[ii] = 1.f / (float)((ii + 1)*(ii + 1)); // avoid division by zero at ii = 0
+ * }
+ *
+ * volk_32f_invsqrt_32f(out, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out(%i) = %f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_invsqrt_32f_a_H
+#define INCLUDED_volk_32f_invsqrt_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+#include <string.h>
+
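+// Scalar fallback used for the loop tails below: the classic bit-twiddling
+// "fast inverse square root" estimate refined by one Newton-Raphson iteration.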
+static inline float
+Q_rsqrt(float number)
+{
+ float x2;
+ const float threehalfs = 1.5F;
+ union f32_to_i32 {
+ int32_t i;
+ float f;
+ } u;
+
+ x2 = number * 0.5F;
+ u.f = number;
+ u.i = 0x5f3759df - ( u.i >> 1 ); // initial guess via the magic-constant bit manipulation
+ u.f = u.f * ( threehalfs - ( x2 * u.f * u.f ) ); // 1st iteration
+ //u.f = u.f * ( threehalfs - ( x2 * u.f * u.f ) ); // 2nd iteration, this can be removed
+
+ return u.f;
+}
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_invsqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ __m256 aVal, cVal;
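+ // _mm256_rsqrt_ps is an approximation (roughly 12 bits of precision);
+ // no Newton-Raphson refinement step is applied here.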
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ cVal = _mm256_rsqrt_ps(aVal);
+ _mm256_store_ps(cPtr, cVal);
+ aPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(;number < num_points; number++)
+ *cPtr++ = Q_rsqrt(*aPtr++);
+
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_invsqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m128 aVal, cVal;
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_load_ps(aPtr);
+
+ cVal = _mm_rsqrt_ps(aVal);
+
+ _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++) {
+ *cPtr++ = Q_rsqrt(*aPtr++);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_32f_invsqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points)
+{
+ unsigned int number;
+ const unsigned int quarter_points = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ float32x4_t a_val, c_val;
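+ // vrsqrteq_f32 returns only an initial estimate; no refinement step is applied here.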
+ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld1q_f32(aPtr);
+ c_val = vrsqrteq_f32(a_val);
+ vst1q_f32(cPtr, c_val);
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ for(number=quarter_points * 4;number < num_points; number++)
+ *cPtr++ = Q_rsqrt(*aPtr++);
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_invsqrt_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+ for(number = 0; number < num_points; number++) {
+ *cPtr++ = Q_rsqrt(*aPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_invsqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ __m256 aVal, cVal;
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ cVal = _mm256_rsqrt_ps(aVal);
+ _mm256_storeu_ps(cPtr, cVal);
+ aPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(;number < num_points; number++)
+ *cPtr++ = Q_rsqrt(*aPtr++);
+
+}
+#endif /* LV_HAVE_AVX */
+
+#endif /* INCLUDED_volk_32f_invsqrt_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_log2_32f
+ *
+ * \b Overview
+ *
+ * Computes base 2 log of input vector and stores results in output vector.
+ *
+ * This kernel was adapted from Jose Fonseca's Fast SSE2 log implementation
+ * http://jrfonseca.blogspot.in/2008/09/fast-sse2-pow-tables-or-polynomials.htm
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * This is the MIT License (MIT)
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_log2_32f(float* bVector, const float* aVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: the input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li bVector: The output vector.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * in[ii] = std::pow(2.f,((float)ii));
+ * }
+ *
+ * volk_32f_log2_32f(out, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out(%i) = %f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_log2_32f_a_H
+#define INCLUDED_volk_32f_log2_32f_a_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <math.h>
+
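+// POLYn(x, c0, ..., cn) expands to a Horner-scheme SSE evaluation of the
+// polynomial c0 + c1*x + ... + cn*x^n; LOG_POLY_DEGREE (below) selects the
+// degree of the log2 approximation over the mantissa range [1, 2) used by
+// the SSE4.1 kernels.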
+#define POLY0(x, c0) _mm_set1_ps(c0)
+#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
+#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
+#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
+#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
+
+#define LOG_POLY_DEGREE 6
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++) {
+ *bPtr++ = log2f(*aPtr++);
+ }
+
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 aVal, bVal, mantissa, frac, leadingOne;
+ __m128i bias, exp;
+
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_load_ps(aPtr);
+ bias = _mm_set1_epi32(127);
+ leadingOne = _mm_set1_ps(1.0f);
+ exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
+ bVal = _mm_cvtepi32_ps(exp);
+
+ // Now to extract mantissa
+ frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
+
+#if LOG_POLY_DEGREE == 6
+ mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
+#elif LOG_POLY_DEGREE == 5
+ mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+#elif LOG_POLY_DEGREE == 4
+ mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+#elif LOG_POLY_DEGREE == 3
+ mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+#else
+#error
+#endif
+
+ bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
+ _mm_store_ps(bPtr, bVal);
+
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *bPtr++ = log2f(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+/* these macros allow us to embed logs in other kernels */
+#define VLOG2Q_NEON_PREAMBLE() \
+ int32x4_t one = vdupq_n_s32(0x00800000); \
+ /* minimax polynomial */ \
+ float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \
+ float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \
+ float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \
+ float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \
+ float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \
+ float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \
+ float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \
+ int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \
+ int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \
+ int32x4_t exp_bias = vdupq_n_s32(127);
+
+
+#define VLOG2Q_NEON_F32(log2_approx, aval) \
+ int32x4_t exponent_i = vandq_s32(aval, exp_mask); \
+ int32x4_t significand_i = vandq_s32(aval, sig_mask); \
+ exponent_i = vshrq_n_s32(exponent_i, 23); \
+ \
+ /* extract the exponent and significand \
+ we can treat this as fixed point to save ~9% on the \
+ conversion + float add */ \
+ significand_i = vorrq_s32(one, significand_i); \
+ float32x4_t significand_f = vcvtq_n_f32_s32(significand_i,23); \
+ /* debias the exponent and convert to float */ \
+ exponent_i = vsubq_s32(exponent_i, exp_bias); \
+ float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \
+ \
+ /* put the significand through a polynomial fit of log2(x) [1,2] \
+ add the result to the exponent */ \
+ log2_approx = vaddq_f32(exponent_f, p0); /* p0 */ \
+ float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1); \
+ float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */ \
+ tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1); \
+ \
+ float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */ \
+ tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1); \
+ float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */ \
+ tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1); \
+ float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */ \
+ tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1); \
+ float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */ \
+ tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1);
+
+static inline void
+volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number;
+ const unsigned int quarterPoints = num_points / 4;
+
+ int32x4_t aval;
+ float32x4_t log2_approx;
+
+ VLOG2Q_NEON_PREAMBLE()
+ // lms
+ //p0 = vdupq_n_f32(-1.649132280361871);
+ //p1 = vdupq_n_f32(1.995047138579499);
+ //p2 = vdupq_n_f32(-0.336914839219728);
+
+ // keep in mind a single precision float is represented as
+ // (-1)^sign * 2^exp * 1.significand, so the log2 is
+ // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23))
+ for(number = 0; number < quarterPoints; ++number){
+ // load float in to an int register without conversion
+ aval = vld1q_s32((int*)aPtr);
+
+ VLOG2Q_NEON_F32(log2_approx, aval)
+
+ vst1q_f32(bPtr, log2_approx);
+
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ for(number = quarterPoints * 4; number < num_points; number++){
+ *bPtr++ = log2f(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_NEON */
+
+
+#endif /* INCLUDED_volk_32f_log2_32f_a_H */
+
+#ifndef INCLUDED_volk_32f_log2_32f_u_H
+#define INCLUDED_volk_32f_log2_32f_u_H
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_log2_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *bPtr++ = log2f(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 aVal, bVal, mantissa, frac, leadingOne;
+ __m128i bias, exp;
+
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_loadu_ps(aPtr);
+ bias = _mm_set1_epi32(127);
+ leadingOne = _mm_set1_ps(1.0f);
+ exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
+ bVal = _mm_cvtepi32_ps(exp);
+
+ // Now to extract mantissa
+ frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
+
+#if LOG_POLY_DEGREE == 6
+ mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
+#elif LOG_POLY_DEGREE == 5
+ mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+#elif LOG_POLY_DEGREE == 4
+ mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+#elif LOG_POLY_DEGREE == 3
+ mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+#else
+#error
+#endif
+
+ bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
+ _mm_storeu_ps(bPtr, bVal);
+
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *bPtr++ = log2f(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for unaligned */
+
+#endif /* INCLUDED_volk_32f_log2_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <math.h>
+#include <inttypes.h>
+
+#ifndef INCLUDED_volk_32f_null_32f_a_H
+#define INCLUDED_volk_32f_null_32f_a_H
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_null_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number;
+
+ for(number = 0; number < num_points; number++){
+ *bPtr++ = *aPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_null_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_s32f_32f_fm_detect_32f
+ *
+ * \b Overview
+ *
+ * Performs FM-detect differentiation on the input vector and stores
+ * the results in the output vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_s32f_32f_fm_detect_32f(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The input vector containing phase data (must be on the interval (-bound, bound]).
+ * \li bound: The interval that the input phase data is in, which is used to modulo the differentiation.
+ * \li saveValue: A pointer to a float which contains the phase value of the sample before the first input sample.
+ * \li num_points The number of data points.
+ *
+ * \b Outputs
+ * \li outputVector: The vector where the results will be stored.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * float* phase = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float saveValue = 0.f;
+ * // fill phase with wrapped phase data on (-PI, PI], then:
+ * volk_32f_s32f_32f_fm_detect_32f(out, phase, M_PI, &saveValue, N);
+ * volk_free(phase);
+ * volk_free(out);
+ *
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H
+#define INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
+ if (num_points < 1) {
+ return;
+ }
+ unsigned int number = 1;
+ unsigned int j = 0;
+ // Using num_points-1 here works around a compiler crash seen in Fedora 7's gcc;
+ // using num_points directly triggers the bug.
+ const unsigned int quarterPoints = (num_points-1) / 4;
+
+ float* outPtr = outputVector;
+ const float* inPtr = inputVector;
+ __m128 upperBound = _mm_set_ps1(bound);
+ __m128 lowerBound = _mm_set_ps1(-bound);
+ __m128 next3old1;
+ __m128 next4;
+ __m128 boundAdjust;
+ __m128 posBoundAdjust = _mm_set_ps1(-2*bound); // Subtract when we're above.
+ __m128 negBoundAdjust = _mm_set_ps1(2*bound); // Add when we're below.
+ // Do the first 4 by hand since we're going in from the saveValue:
+ *outPtr = *inPtr - *saveValue;
+ if (*outPtr > bound) *outPtr -= 2*bound;
+ if (*outPtr < -bound) *outPtr += 2*bound;
+ inPtr++;
+ outPtr++;
+ for (j = 1; j < ( (4 < num_points) ? 4 : num_points); j++) {
+ *outPtr = *(inPtr) - *(inPtr-1);
+ if (*outPtr > bound) *outPtr -= 2*bound;
+ if (*outPtr < -bound) *outPtr += 2*bound;
+ inPtr++;
+ outPtr++;
+ }
+
+ for (; number < quarterPoints; number++) {
+ // Load data
+ next3old1 = _mm_loadu_ps((float*) (inPtr-1));
+ next4 = _mm_load_ps(inPtr);
+ inPtr += 4;
+ // Subtract and store:
+ next3old1 = _mm_sub_ps(next4, next3old1);
+ // Bound:
+ boundAdjust = _mm_cmpgt_ps(next3old1, upperBound);
+ boundAdjust = _mm_and_ps(boundAdjust, posBoundAdjust);
+ next4 = _mm_cmplt_ps(next3old1, lowerBound);
+ next4 = _mm_and_ps(next4, negBoundAdjust);
+ boundAdjust = _mm_or_ps(next4, boundAdjust);
+ // Make sure we're in the bounding interval:
+ next3old1 = _mm_add_ps(next3old1, boundAdjust);
+ _mm_store_ps(outPtr,next3old1); // Store the results back into the output
+ outPtr += 4;
+ }
+
+ for (number = (4 > (quarterPoints*4) ? 4 : (4 * quarterPoints)); number < num_points; number++) {
+ *outPtr = *(inPtr) - *(inPtr-1);
+ if (*outPtr > bound) *outPtr -= 2*bound;
+ if (*outPtr < -bound) *outPtr += 2*bound;
+ inPtr++;
+ outPtr++;
+ }
+
+ *saveValue = inputVector[num_points-1];
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
+ if (num_points < 1) {
+ return;
+ }
+ unsigned int number = 0;
+ float* outPtr = outputVector;
+ const float* inPtr = inputVector;
+
+ // Do the first 1 by hand since we're going in from the saveValue:
+ *outPtr = *inPtr - *saveValue;
+ if (*outPtr > bound) *outPtr -= 2*bound;
+ if (*outPtr < -bound) *outPtr += 2*bound;
+ inPtr++;
+ outPtr++;
+
+ for (number = 1; number < num_points; number++) {
+ *outPtr = *(inPtr) - *(inPtr-1);
+ if (*outPtr > bound) *outPtr -= 2*bound;
+ if (*outPtr < -bound) *outPtr += 2*bound;
+ inPtr++;
+ outPtr++;
+ }
+
+ *saveValue = inputVector[num_points-1];
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_s32f_calc_spectral_noise_floor_32f
+ *
+ * \b Overview
+ *
+ * Computes the spectral noise floor of an input power spectrum.
+ *
+ * Calculates the spectral noise floor of an input power spectrum by
+ * determining the mean of the input power spectrum, then
+ * recalculating the mean excluding any power spectrum values that
+ * exceed the mean by the spectralExclusionValue (in dB). Provides a
+ * rough estimation of the signal noise floor.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_s32f_calc_spectral_noise_floor_32f(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li realDataPoints: The input power spectrum.
+ * \li spectralExclusionValue: The number of dB above the first-pass mean that a data point must be to be excluded from the noise floor calculation - default value is 20.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li noiseFloorAmplitude: The noise floor of the input spectrum, in dB.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * float* spectrum = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float noise_floor;
+ * // fill spectrum with power-spectrum values (in dB), then:
+ * volk_32f_s32f_calc_spectral_noise_floor_32f(&noise_floor, spectrum, 20.f, N);
+ * volk_free(spectrum);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a_H
+#define INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_s32f_calc_spectral_noise_floor_32f_a_sse(float* noiseFloorAmplitude,
+ const float* realDataPoints,
+ const float spectralExclusionValue,
+ const unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* dataPointsPtr = realDataPoints;
+ __VOLK_ATTR_ALIGNED(16) float avgPointsVector[4];
+
+ __m128 dataPointsVal;
+ __m128 avgPointsVal = _mm_setzero_ps();
+ // Calculate the sum (for mean) for all points
+ for(; number < quarterPoints; number++){
+
+ dataPointsVal = _mm_load_ps(dataPointsPtr);
+
+ dataPointsPtr += 4;
+
+ avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal);
+ }
+
+ _mm_store_ps(avgPointsVector, avgPointsVal);
+
+ float sumMean = 0.0;
+ sumMean += avgPointsVector[0];
+ sumMean += avgPointsVector[1];
+ sumMean += avgPointsVector[2];
+ sumMean += avgPointsVector[3];
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ sumMean += realDataPoints[number];
+ }
+
+ // calculate the spectral mean
+ // +20 because for the comparison below we only want to throw out bins
+ // that are significantly higher (and would, thus, affect the mean more)
+ const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
+
+ dataPointsPtr = realDataPoints; // Reset the dataPointsPtr
+ __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude);
+ __m128 vOnesVector = _mm_set_ps1(1.0);
+ __m128 vValidBinCount = _mm_setzero_ps();
+ avgPointsVal = _mm_setzero_ps();
+ __m128 compareMask;
+ number = 0;
+ // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude
+ for(; number < quarterPoints; number++){
+
+ dataPointsVal = _mm_load_ps(dataPointsPtr);
+
+ dataPointsPtr += 4;
+
+ // Identify which items do not exceed the mean amplitude
+ compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector);
+
+ // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude
+ avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal));
+
+ // Count the number of bins which do not exceed the mean amplitude
+ vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector));
+ }
+
+ // Calculate the mean from the remaining data points
+ _mm_store_ps(avgPointsVector, avgPointsVal);
+
+ sumMean = 0.0;
+ sumMean += avgPointsVector[0];
+ sumMean += avgPointsVector[1];
+ sumMean += avgPointsVector[2];
+ sumMean += avgPointsVector[3];
+
+ // Calculate the number of valid bins from the remaining count
+ __VOLK_ATTR_ALIGNED(16) float validBinCountVector[4];
+ _mm_store_ps(validBinCountVector, vValidBinCount);
+
+ float validBinCount = 0;
+ validBinCount += validBinCountVector[0];
+ validBinCount += validBinCountVector[1];
+ validBinCount += validBinCountVector[2];
+ validBinCount += validBinCountVector[3];
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ if(realDataPoints[number] <= meanAmplitude){
+ sumMean += realDataPoints[number];
+ validBinCount += 1.0;
+ }
+ }
+
+ float localNoiseFloorAmplitude = 0;
+ if(validBinCount > 0.0){
+ localNoiseFloorAmplitude = sumMean / validBinCount;
+ }
+ else{
+ localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal...
+ }
+
+ *noiseFloorAmplitude = localNoiseFloorAmplitude;
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_s32f_calc_spectral_noise_floor_32f_generic(float* noiseFloorAmplitude,
+ const float* realDataPoints,
+ const float spectralExclusionValue,
+ const unsigned int num_points)
+{
+ float sumMean = 0.0;
+ unsigned int number;
+ // find the sum (for mean), etc
+ for(number = 0; number < num_points; number++){
+ // sum (for mean)
+ sumMean += realDataPoints[number];
+ }
+
+ // calculate the spectral mean
+ // +20 because for the comparison below we only want to throw out bins
+ // that are significantly higher (and would, thus, affect the mean more)
+ const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue;
+
+ // now throw out any bins higher than the mean
+ sumMean = 0.0;
+ unsigned int newNumDataPoints = num_points;
+ for(number = 0; number < num_points; number++){
+ if (realDataPoints[number] <= meanAmplitude)
+ sumMean += realDataPoints[number];
+ else
+ newNumDataPoints--;
+ }
+
+ float localNoiseFloorAmplitude = 0.0;
+ if (newNumDataPoints == 0) // in the odd case that all
+ localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal!
+ else
+ localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints);
+
+ *noiseFloorAmplitude = localNoiseFloorAmplitude;
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_s32f_convert_16i
+ *
+ * \b Overview
+ *
+ * Converts a floating point number to a 16-bit short after applying a
+ * scaling factor.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_s32f_convert_16i(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li inputVector: the input vector of floats.
+ * \li scalar: The value multiplied against each point in the input buffer.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li outputVector: The output vector.
+ *
+ * \b Example
+ * Convert floats in [-1, 1] to 16-bit integers, scaled by 5 so that the smallest delta (0.2) maps to an integer step:
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * int16_t* out = (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = 2.f * ((float)ii / (float)N) - 1.f;
+ * }
+ *
+ * // Normalize by the smallest delta (0.2 in this example)
+ * float scale = 5.f;
+ *
+ * volk_32f_s32f_convert_16i(out, increasing, scale, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %i\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_s32f_convert_16i_u_H
+#define INCLUDED_volk_32f_s32f_convert_16i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector, const float* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int16_t* outputVectorPtr = outputVector;
+
+ float min_val = -32768;
+ float max_val = 32767;
+ float r;
+
+ __m256 vScalar = _mm256_set1_ps(scalar);
+ __m256 inputVal, ret;
+ __m256i intInputVal;
+ __m128i intInputVal1, intInputVal2;
+ __m256 vmin_val = _mm256_set1_ps(min_val);
+ __m256 vmax_val = _mm256_set1_ps(max_val);
+
+ for(;number < eighthPoints; number++){
+ inputVal = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
+
+ // Scale and clip
+ ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), vmin_val);
+
+ intInputVal = _mm256_cvtps_epi32(ret);
+
+ intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
+ intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
+
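+ // Pack the two 4x32-bit integer vectors into one 8x16-bit vector with signed saturation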
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ r = inputVector[number] * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ outputVector[number] = (int16_t)rintf(r);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void
+volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int16_t* outputVectorPtr = outputVector;
+
+ float min_val = -32768;
+ float max_val = 32767;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 inputVal1, inputVal2;
+ __m128i intInputVal1, intInputVal2;
+ __m128 ret1, ret2;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ for(;number < eighthPoints; number++){
+ inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+ inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+ // Scale and clip
+ ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+ ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+
+ intInputVal1 = _mm_cvtps_epi32(ret1);
+ intInputVal2 = _mm_cvtps_epi32(ret2);
+
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ r = inputVector[number] * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ outputVector[number] = (int16_t)rintf(r);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const float* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int16_t* outputVectorPtr = outputVector;
+
+ float min_val = -32768;
+ float max_val = 32767;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 ret;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+ for(;number < quarterPoints; number++){
+ ret = _mm_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ // Scale and clip
+ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+
+ _mm_store_ps(outputFloatBuffer, ret);
+ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
+ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
+ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
+ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ r = inputVector[number] * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ outputVector[number] = (int16_t)rintf(r);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ int16_t* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ float min_val = -32768;
+ float max_val = 32767;
+ float r;
+
+ for(number = 0; number < num_points; number++){
+ r = *inputVectorPtr++ * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ *outputVectorPtr++ = (int16_t)rintf(r);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_16i_u_H */
+#ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H
+#define INCLUDED_volk_32f_s32f_convert_16i_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_s32f_convert_16i_a_avx(int16_t* outputVector, const float* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int16_t* outputVectorPtr = outputVector;
+
+ float min_val = -32768;
+ float max_val = 32767;
+ float r;
+
+ __m256 vScalar = _mm256_set1_ps(scalar);
+ __m256 inputVal, ret;
+ __m256i intInputVal;
+ __m128i intInputVal1, intInputVal2;
+ __m256 vmin_val = _mm256_set1_ps(min_val);
+ __m256 vmax_val = _mm256_set1_ps(max_val);
+
+ for(;number < eighthPoints; number++){
+ inputVal = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
+
+ // Scale and clip
+ ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), vmin_val);
+
+ intInputVal = _mm256_cvtps_epi32(ret);
+
+ intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
+ intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
+
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ r = inputVector[number] * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ outputVector[number] = (int16_t)rintf(r);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void
+volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int16_t* outputVectorPtr = outputVector;
+
+ float min_val = -32768;
+ float max_val = 32767;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 inputVal1, inputVal2;
+ __m128i intInputVal1, intInputVal2;
+ __m128 ret1, ret2;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ for(;number < eighthPoints; number++){
+ inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+ inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+ // Scale and clip
+ ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+ ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+
+ intInputVal1 = _mm_cvtps_epi32(ret1);
+ intInputVal2 = _mm_cvtps_epi32(ret2);
+
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ r = inputVector[number] * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ outputVector[number] = (int16_t)rintf(r);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int16_t* outputVectorPtr = outputVector;
+
+ float min_val = -32768;
+ float max_val = 32767;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 ret;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+ for(;number < quarterPoints; number++){
+ ret = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ // Scale and clip
+ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+
+ _mm_store_ps(outputFloatBuffer, ret);
+ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
+ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
+ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
+ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ r = inputVector[number] * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ outputVector[number] = (int16_t)rintf(r);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ int16_t* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ float min_val = -32768;
+ float max_val = 32767;
+ float r;
+
+ for(number = 0; number < num_points; number++){
+ r = *inputVectorPtr++ * scalar;
+ if(r < min_val)
+ r = min_val;
+ else if(r > max_val)
+ r = max_val;
+ *outputVectorPtr++ = (int16_t)rintf(r);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_s32f_convert_16i_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_s32f_convert_32i
+ *
+ * \b Overview
+ *
+ * Converts a floating point number to a 32-bit integer after applying a
+ * scaling factor.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_s32f_convert_32i(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The input vector of floats.
+ * \li scalar: The value multiplied against each point in the input buffer.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li outputVector: The output vector.
+ *
+ * \b Example
+ * Convert floats from [-1,1] to integers with a scale of 5 to maintain smallest delta
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * int32_t* out = (int32_t*)volk_malloc(sizeof(int32_t)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = 2.f * ((float)ii / (float)N) - 1.f;
+ * }
+ *
+ * // Normalize by the smallest delta (0.2 in this example)
+ * float scale = 5.f;
+ *
+ * volk_32f_s32f_convert_32i(out, increasing, scale, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %i\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_s32f_convert_32i_u_H
+#define INCLUDED_volk_32f_s32f_convert_32i_u_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void
+volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const float* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int32_t* outputVectorPtr = outputVector;
+
+ float min_val = -2147483647;
+ float max_val = 2147483647;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 inputVal1;
+ __m128i intInputVal1;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ for(;number < quarterPoints; number++){
+ inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+ inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+ intInputVal1 = _mm_cvtps_epi32(inputVal1);
+
+ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ r = inputVector[number] * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ outputVector[number] = (int32_t)(r);
+ }
+}
+
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const float* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int32_t* outputVectorPtr = outputVector;
+
+ float min_val = -2147483647;
+ float max_val = 2147483647;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 ret;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+ for(;number < quarterPoints; number++){
+ ret = _mm_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+
+ _mm_store_ps(outputFloatBuffer, ret);
+ *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]);
+ *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]);
+ *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]);
+ *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ r = inputVector[number] * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ outputVector[number] = (int32_t)(r);
+ }
+}
+
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_s32f_convert_32i_generic(int32_t* outputVector, const float* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ int32_t* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ float min_val = -2147483647;
+ float max_val = 2147483647;
+ float r;
+
+ for(number = 0; number < num_points; number++){
+ r = *inputVectorPtr++ * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ *outputVectorPtr++ = (int32_t)(r);
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_32i_u_H */
+#ifndef INCLUDED_volk_32f_s32f_convert_32i_a_H
+#define INCLUDED_volk_32f_s32f_convert_32i_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const float* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int32_t* outputVectorPtr = outputVector;
+
+ float min_val = -2147483647;
+ float max_val = 2147483647;
+ float r;
+
+ __m256 vScalar = _mm256_set1_ps(scalar);
+ __m256 inputVal1;
+ __m256i intInputVal1;
+ __m256 vmin_val = _mm256_set1_ps(min_val);
+ __m256 vmax_val = _mm256_set1_ps(max_val);
+
+ for(;number < eighthPoints; number++){
+ inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
+
+ inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+ intInputVal1 = _mm256_cvtps_epi32(inputVal1);
+
+ _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ r = inputVector[number] * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ outputVector[number] = (int32_t)(r);
+ }
+}
+
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void
+volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const float* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int32_t* outputVectorPtr = outputVector;
+
+ float min_val = -2147483647;
+ float max_val = 2147483647;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 inputVal1;
+ __m128i intInputVal1;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ for(;number < quarterPoints; number++){
+ inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+ inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+ intInputVal1 = _mm_cvtps_epi32(inputVal1);
+
+ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ r = inputVector[number] * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ outputVector[number] = (int32_t)(r);
+ }
+}
+
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const float* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int32_t* outputVectorPtr = outputVector;
+
+ float min_val = -2147483647;
+ float max_val = 2147483647;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 ret;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+ for(;number < quarterPoints; number++){
+ ret = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+
+ _mm_store_ps(outputFloatBuffer, ret);
+ *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]);
+ *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]);
+ *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]);
+ *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ r = inputVector[number] * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ outputVector[number] = (int32_t)(r);
+ }
+}
+
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_s32f_convert_32i_a_generic(int32_t* outputVector, const float* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ int32_t* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ float min_val = -2147483647;
+ float max_val = 2147483647;
+ float r;
+
+ for(number = 0; number < num_points; number++){
+ r = *inputVectorPtr++ * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ *outputVectorPtr++ = (int32_t)(r);
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_s32f_convert_32i_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_s32f_convert_8i
+ *
+ * \b Overview
+ *
+ * Converts a floating point number to an 8-bit char after applying a
+ * scaling factor.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_s32f_convert_8i(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The input vector of floats.
+ * \li scalar: The value multiplied against each point in the input buffer.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li outputVector: The output vector.
+ *
+ * \b Example
+ * Convert floats from [-1,1] to 8-bit integers with a scale of 5.1 to maintain the smallest delta
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * int8_t* out = (int8_t*)volk_malloc(sizeof(int8_t)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = 2.f * ((float)ii / (float)N) - 1.f;
+ * }
+ *
+ * // Normalize by the smallest delta (0.2 in this example)
+ * // With float -> 8-bit ints, be careful of scaling
+ * float scale = 5.1f;
+ *
+ * volk_32f_s32f_convert_8i(out, increasing, scale, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %i\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_s32f_convert_8i_u_H
+#define INCLUDED_volk_32f_s32f_convert_8i_u_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void
+volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const float* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int8_t* outputVectorPtr = outputVector;
+
+ float min_val = -128;
+ float max_val = 127;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+ __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ for(;number < sixteenthPoints; number++){
+ inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+ inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+ inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+ inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+ inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+ inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+ inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
+ inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
+
+ intInputVal1 = _mm_cvtps_epi32(inputVal1);
+ intInputVal2 = _mm_cvtps_epi32(inputVal2);
+ intInputVal3 = _mm_cvtps_epi32(inputVal3);
+ intInputVal4 = _mm_cvtps_epi32(inputVal4);
+
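+ // Pack 32-bit -> 16-bit -> 8-bit, saturating at each step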
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+ intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
+
+ intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
+
+ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for(; number < num_points; number++){
+ r = inputVector[number] * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ outputVector[number] = (int8_t)(r);
+ }
+}
+
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const float* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int8_t* outputVectorPtr = outputVector;
+
+ float min_val = -128;
+ float max_val = 127;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 ret;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+ for(;number < quarterPoints; number++){
+ ret = _mm_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+
+ _mm_store_ps(outputFloatBuffer, ret);
+ *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]);
+ *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]);
+ *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]);
+ *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ r = inputVector[number] * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ outputVector[number] = (int8_t)(r);
+ }
+}
+
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_s32f_convert_8i_generic(int8_t* outputVector, const float* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ int8_t* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ float min_val = -128;
+ float max_val = 127;
+ float r;
+
+ for(number = 0; number < num_points; number++){
+ r = *inputVectorPtr++ * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ *outputVectorPtr++ = (int8_t)(r);
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_8i_u_H */
+#ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H
+#define INCLUDED_volk_32f_s32f_convert_8i_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void
+volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const float* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int8_t* outputVectorPtr = outputVector;
+
+ float min_val = -128;
+ float max_val = 127;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+ __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ for(;number < sixteenthPoints; number++){
+ inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+ inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+ inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+ inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+ inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+ inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+ inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
+ inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
+
+ intInputVal1 = _mm_cvtps_epi32(inputVal1);
+ intInputVal2 = _mm_cvtps_epi32(inputVal2);
+ intInputVal3 = _mm_cvtps_epi32(inputVal3);
+ intInputVal4 = _mm_cvtps_epi32(inputVal4);
+
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+ intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
+
+ intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
+
+ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for(; number < num_points; number++){
+ r = inputVector[number] * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ outputVector[number] = (int8_t)(r);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const float* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+
+ float min_val = -128;
+ float max_val = 127;
+ float r;
+
+ int8_t* outputVectorPtr = outputVector;
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 ret;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+ for(;number < quarterPoints; number++){
+ ret = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+
+ _mm_store_ps(outputFloatBuffer, ret);
+ *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]);
+ *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]);
+ *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]);
+ *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ r = inputVector[number] * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ outputVector[number] = (int8_t)(r);
+ }
+}
+
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector, const float* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ int8_t* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ float min_val = -128;
+ float max_val = 127;
+ float r;
+
+ for(number = 0; number < num_points; number++){
+ r = *inputVectorPtr++ * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ *outputVectorPtr++ = (int8_t)(r);
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_s32f_convert_8i_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_s32f_multiply_32f
+ *
+ * \b Overview
+ *
+ * Multiplies a floating point vector by a floating point scalar.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_s32f_multiply_32f(float* cVector, const float* aVector, const float scalar, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of floats.
+ * \li scalar: The scalar value to multiply against \p aVector.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li cVector: The output vector of floats.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = 2.f * ((float)ii / (float)N) - 1.f;
+ * }
+ *
+ * // Normalize by the smallest delta (0.2 in this example)
+ * float scale = 5.0f;
+ *
+ * volk_32f_s32f_multiply_32f(out, increasing, scale, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_s32f_multiply_32f_u_H
+#define INCLUDED_volk_32f_s32f_multiply_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_s32f_multiply_32f_u_sse(float* cVector, const float* aVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m128 aVal, bVal, cVal;
+ bVal = _mm_set_ps1(scalar);
+ for(;number < quarterPoints; number++){
+ aVal = _mm_loadu_ps(aPtr);
+
+ cVal = _mm_mul_ps(aVal, bVal);
+
+ _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *cPtr++ = (*aPtr++) * scalar;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_s32f_multiply_32f_u_avx(float* cVector, const float* aVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m256 aVal, bVal, cVal;
+ bVal = _mm256_set1_ps(scalar);
+ for(;number < eighthPoints; number++){
+
+ aVal = _mm256_loadu_ps(aPtr);
+
+ cVal = _mm256_mul_ps(aVal, bVal);
+
+ _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(;number < num_points; number++){
+ *cPtr++ = (*aPtr++) * scalar;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_s32f_multiply_32f_generic(float* cVector, const float* aVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const float* inputPtr = aVector;
+ float* outputPtr = cVector;
+ for(number = 0; number < num_points; number++){
+ *outputPtr = (*inputPtr) * scalar;
+ inputPtr++;
+ outputPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_s32f_multiply_32f_u_H */
+
+
+#ifndef INCLUDED_volk_32f_s32f_multiply_32f_a_H
+#define INCLUDED_volk_32f_s32f_multiply_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_s32f_multiply_32f_a_sse(float* cVector, const float* aVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m128 aVal, bVal, cVal;
+ bVal = _mm_set_ps1(scalar);
+ for(;number < quarterPoints; number++){
+ aVal = _mm_load_ps(aPtr);
+
+ cVal = _mm_mul_ps(aVal, bVal);
+
+ _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *cPtr++ = (*aPtr++) * scalar;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_s32f_multiply_32f_a_avx(float* cVector, const float* aVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m256 aVal, bVal, cVal;
+ bVal = _mm256_set1_ps(scalar);
+ for(;number < eighthPoints; number++){
+ aVal = _mm256_load_ps(aPtr);
+
+ cVal = _mm256_mul_ps(aVal, bVal);
+
+ _mm256_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(;number < num_points; number++){
+ *cPtr++ = (*aPtr++) * scalar;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_32f_s32f_multiply_32f_u_neon(float* cVector, const float* aVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const float* inputPtr = aVector;
+ float* outputPtr = cVector;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float32x4_t aVal, cVal;
+
+ for(number = 0; number < quarterPoints; number++){
+ aVal = vld1q_f32(inputPtr); // Load into NEON regs
+ cVal = vmulq_n_f32 (aVal, scalar); // Do the multiply
+ vst1q_f32(outputPtr, cVal); // Store results back to output
+ inputPtr += 4;
+ outputPtr += 4;
+ }
+ for(number = quarterPoints * 4; number < num_points; number++){
+ *outputPtr++ = (*inputPtr++) * scalar;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_s32f_multiply_32f_a_generic(float* cVector, const float* aVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const float* inputPtr = aVector;
+ float* outputPtr = cVector;
+ for(number = 0; number < num_points; number++){
+ *outputPtr = (*inputPtr) * scalar;
+ inputPtr++;
+ outputPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_ORC
+
+extern void
+volk_32f_s32f_multiply_32f_a_orc_impl(float* dst, const float* src,
+ const float scalar, unsigned int num_points);
+
+static inline void
+volk_32f_s32f_multiply_32f_u_orc(float* cVector, const float* aVector,
+ const float scalar, unsigned int num_points)
+{
+ volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points);
+}
+
+#endif /* LV_HAVE_ORC */
+
+#endif /* INCLUDED_volk_32f_s32f_multiply_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_s32f_normalize
+ *
+ * \b Overview
+ *
+ * Normalizes all points in the buffer by the scalar value (divides
+ * each data point by the scalar value).
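+ * The implementations below multiply each element by the reciprocal (1.0 / scalar)
+ * rather than performing a per-element division.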
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_s32f_normalize(float* vecBuffer, const float scalar, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li vecBuffer: The buffer of values to be normalized.
+ * \li scalar: The scale value to be applied to each buffer value.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li vecBuffer: The results are written back in place.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = 2.f * ((float)ii / (float)N) - 1.f;
+ * }
+ *
+ * // Divide each entry by the scalar (5.0 here)
+ * float scale = 5.0f;
+ *
+ * volk_32f_s32f_normalize(increasing, scale, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("increasing[%u] = %f\n", ii, increasing[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_s32f_normalize_a_H
+#define INCLUDED_volk_32f_s32f_normalize_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_s32f_normalize_a_sse(float* vecBuffer, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ float* inputPtr = vecBuffer;
+
+ const float invScalar = 1.0 / scalar;
+ __m128 vecScalar = _mm_set_ps1(invScalar);
+
+ __m128 input1;
+
+ const uint64_t quarterPoints = num_points / 4;
+ for(;number < quarterPoints; number++){
+
+ input1 = _mm_load_ps(inputPtr);
+
+ input1 = _mm_mul_ps(input1, vecScalar);
+
+ _mm_store_ps(inputPtr, input1);
+
+ inputPtr += 4;
+ }
+
+ number = quarterPoints*4;
+ for(; number < num_points; number++){
+ *inputPtr *= invScalar;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_s32f_normalize_a_avx(float* vecBuffer, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ float* inputPtr = vecBuffer;
+
+ const float invScalar = 1.0 / scalar;
+ __m256 vecScalar = _mm256_set1_ps(invScalar);
+ __m256 input1;
+
+ const uint64_t eighthPoints = num_points / 8;
+ for(;number < eighthPoints; number++){
+
+ input1 = _mm256_load_ps(inputPtr);
+
+ input1 = _mm256_mul_ps(input1, vecScalar);
+
+ _mm256_store_ps(inputPtr, input1);
+
+ inputPtr += 8;
+ }
+
+ number = eighthPoints*8;
+ for(; number < num_points; number++){
+ *inputPtr *= invScalar;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_s32f_normalize_generic(float* vecBuffer, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ float* inputPtr = vecBuffer;
+ const float invScalar = 1.0 / scalar;
+ for(number = 0; number < num_points; number++){
+ *inputPtr *= invScalar;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+
+extern void volk_32f_s32f_normalize_a_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points);
+static inline void volk_32f_s32f_normalize_u_orc(float* vecBuffer, const float scalar, unsigned int num_points){
+ float invscalar = 1.0 / scalar;
+ volk_32f_s32f_normalize_a_orc_impl(vecBuffer, vecBuffer, invscalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_s32f_normalize_a_H */
+
+
+#ifndef INCLUDED_volk_32f_s32f_normalize_u_H
+#define INCLUDED_volk_32f_s32f_normalize_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_s32f_normalize_u_avx(float* vecBuffer, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ float* inputPtr = vecBuffer;
+
+ const float invScalar = 1.0 / scalar;
+ __m256 vecScalar = _mm256_set1_ps(invScalar);
+ __m256 input1;
+
+ const uint64_t eighthPoints = num_points / 8;
+ for(;number < eighthPoints; number++){
+
+ input1 = _mm256_loadu_ps(inputPtr);
+
+ input1 = _mm256_mul_ps(input1, vecScalar);
+
+ _mm256_storeu_ps(inputPtr, input1);
+
+ inputPtr += 8;
+ }
+
+ number = eighthPoints*8;
+ for(; number < num_points; number++){
+ *inputPtr *= invScalar;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#endif /* INCLUDED_volk_32f_s32f_normalize_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_s32f_power_32f
+ *
+ * \b Overview
+ *
+ * Takes each input vector value to the specified power and stores the
+ * results in the return vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_s32f_power_32f(float* cVector, const float* aVector, const float power, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of floats.
+ * \li power: The power to raise the input value to.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ * Square the numbers 0 through 9
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = (float)ii;
+ * }
+ *
+ * // Raise each entry to the power 2 (i.e. square it)
+ * float power = 2.0f;
+ *
+ * volk_32f_s32f_power_32f(out, increasing, power, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_s32f_power_32f_a_H
+#define INCLUDED_volk_32f_s32f_power_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+static inline void
+volk_32f_s32f_power_32f_a_sse4_1(float* cVector, const float* aVector,
+ const float power, unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 vPower = _mm_set_ps1(power);
+ __m128 zeroValue = _mm_setzero_ps();
+ __m128 signMask;
+ __m128 negatedValues;
+ __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
+ __m128 onesMask = _mm_set_ps1(1);
+
+ __m128 aVal, cVal;
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_load_ps(aPtr);
+ signMask = _mm_cmplt_ps(aVal, zeroValue);
+ negatedValues = _mm_sub_ps(zeroValue, aVal);
+ aVal = _mm_blendv_ps(aVal, negatedValues, signMask);
+
+ // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after
+ cVal = powf4(aVal, vPower); // Takes each input value to the specified power
+
+ cVal = _mm_mul_ps( _mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal);
+
+ _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+ for(;number < num_points; number++){
+ *cPtr++ = powf((*aPtr++), power);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+static inline void
+volk_32f_s32f_power_32f_a_sse(float* cVector, const float* aVector,
+ const float power, unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 vPower = _mm_set_ps1(power);
+ __m128 zeroValue = _mm_setzero_ps();
+ __m128 signMask;
+ __m128 negatedValues;
+ __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
+ __m128 onesMask = _mm_set_ps1(1);
+
+ __m128 aVal, cVal;
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_load_ps(aPtr);
+ signMask = _mm_cmplt_ps(aVal, zeroValue);
+ negatedValues = _mm_sub_ps(zeroValue, aVal);
+ aVal = _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues) );
+
+ // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after
+ cVal = powf4(aVal, vPower); // Takes each input value to the specified power
+
+ cVal = _mm_mul_ps( _mm_or_ps( _mm_andnot_ps(signMask, onesMask), _mm_and_ps(signMask, negativeOneToPower) ), cVal);
+
+ _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+ for(;number < num_points; number++){
+ *cPtr++ = powf((*aPtr++), power);
+ }
+}
+
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_s32f_power_32f_generic(float* cVector, const float* aVector,
+ const float power, unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = powf((*aPtr++), power);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_s32f_power_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_s32f_stddev_32f
+ *
+ * \b Overview
+ *
+ * Computes the standard deviation of the input buffer using the supplied mean.
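+ * The implementations below use the one-pass identity
+ * stddev = sqrt( sum(x[i]^2) / num_points - mean^2 ),
+ * accumulating the sum of squares and folding in the supplied mean at the end.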
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_s32f_stddev_32f(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li inputBuffer: The input vector of floats.
+ * \li mean: The mean of the input buffer.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li stddev: The output vector.
+ *
+ * \b Example
+ * Calculate the standard deviation from numbers generated with C++11's normal generator
+ * \code
+ * int N = 1000;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float mean = 0.0f;
+ * float* stddev = (float*)volk_malloc(sizeof(float), alignment);
+ *
+ * // Use a normal generator with 0 mean, stddev = 1
+ * std::default_random_engine generator;
+ * std::normal_distribution<float> distribution(mean,1);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = distribution(generator);
+ * }
+ *
+ * volk_32f_s32f_stddev_32f(stddev, increasing, mean, N);
+ *
+ * printf("std. dev. = %f\n", *stddev);
+ *
+ * volk_free(increasing);
+ * volk_free(stddev);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_s32f_stddev_32f_a_H
+#define INCLUDED_volk_32f_s32f_stddev_32f_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev, const float* inputBuffer,
+ const float mean, unsigned int num_points)
+{
+ float returnValue = 0;
+ if(num_points > 0){
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ const float* aPtr = inputBuffer;
+
+ __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
+
+ __m128 squareAccumulator = _mm_setzero_ps();
+ __m128 aVal1, aVal2, aVal3, aVal4;
+ __m128 cVal1, cVal2, cVal3, cVal4;
+ for(;number < sixteenthPoints; number++) {
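+ // Each dot product sums x^2 over 4 floats into a different output lane (masks 0xF1/0xF2/0xF4/0xF8),
+ // so the four partial results can simply be OR-combined before accumulating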
+ aVal1 = _mm_load_ps(aPtr); aPtr += 4;
+ cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
+
+ aVal2 = _mm_load_ps(aPtr); aPtr += 4;
+ cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
+
+ aVal3 = _mm_load_ps(aPtr); aPtr += 4;
+ cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
+
+ aVal4 = _mm_load_ps(aPtr); aPtr += 4;
+ cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
+
+ cVal1 = _mm_or_ps(cVal1, cVal2);
+ cVal3 = _mm_or_ps(cVal3, cVal4);
+ cVal1 = _mm_or_ps(cVal1, cVal3);
+
+ squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+ }
+ _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
+ returnValue = squareBuffer[0];
+ returnValue += squareBuffer[1];
+ returnValue += squareBuffer[2];
+ returnValue += squareBuffer[3];
+
+ number = sixteenthPoints * 16;
+ for(;number < num_points; number++){
+ returnValue += (*aPtr) * (*aPtr);
+ aPtr++;
+ }
+ returnValue /= num_points;
+ returnValue -= (mean * mean);
+ returnValue = sqrtf(returnValue);
+ }
+ *stddev = returnValue;
+}
+
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_s32f_stddev_32f_a_avx(float* stddev, const float* inputBuffer,
+ const float mean, unsigned int num_points)
+{
+ float returnValue = 0;
+ if(num_points > 0){
+ unsigned int number = 0;
+ const unsigned int thirtySecondPoints = num_points / 32;
+
+ const float* aPtr = inputBuffer;
+
+ __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
+
+ __m256 squareAccumulator = _mm256_setzero_ps();
+ __m256 aVal1, aVal2, aVal3, aVal4;
+ __m256 cVal1, cVal2, cVal3, cVal4;
+ for(;number < thirtySecondPoints; number++) {
+ aVal1 = _mm256_load_ps(aPtr); aPtr += 8;
+ cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
+
+ aVal2 = _mm256_load_ps(aPtr); aPtr += 8;
+ cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
+
+ aVal3 = _mm256_load_ps(aPtr); aPtr += 8;
+ cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
+
+ aVal4 = _mm256_load_ps(aPtr); aPtr += 8;
+ cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
+
+ cVal1 = _mm256_or_ps(cVal1, cVal2);
+ cVal3 = _mm256_or_ps(cVal3, cVal4);
+ cVal1 = _mm256_or_ps(cVal1, cVal3);
+
+ squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+ }
+ _mm256_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
+ returnValue = squareBuffer[0]; returnValue += squareBuffer[1];
+ returnValue += squareBuffer[2]; returnValue += squareBuffer[3];
+ returnValue += squareBuffer[4]; returnValue += squareBuffer[5];
+ returnValue += squareBuffer[6]; returnValue += squareBuffer[7];
+
+ number = thirtySecondPoints * 32;
+ for(;number < num_points; number++){
+ returnValue += (*aPtr) * (*aPtr);
+ aPtr++;
+ }
+ returnValue /= num_points;
+ returnValue -= (mean * mean);
+ returnValue = sqrtf(returnValue);
+ }
+ *stddev = returnValue;
+}
+
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_s32f_stddev_32f_a_sse(float* stddev, const float* inputBuffer,
+ const float mean, unsigned int num_points)
+{
+ float returnValue = 0;
+ if(num_points > 0){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* aPtr = inputBuffer;
+
+ __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
+
+ __m128 squareAccumulator = _mm_setzero_ps();
+ __m128 aVal = _mm_setzero_ps();
+ for(;number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr); // aVal = x
+ aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2
+ squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
+ aPtr += 4;
+ }
+ _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
+ returnValue = squareBuffer[0];
+ returnValue += squareBuffer[1];
+ returnValue += squareBuffer[2];
+ returnValue += squareBuffer[3];
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ returnValue += (*aPtr) * (*aPtr);
+ aPtr++;
+ }
+ returnValue /= num_points;
+ returnValue -= (mean * mean);
+ returnValue = sqrtf(returnValue);
+ }
+ *stddev = returnValue;
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_s32f_stddev_32f_generic(float* stddev, const float* inputBuffer,
+ const float mean, unsigned int num_points)
+{
+ float returnValue = 0;
+ if(num_points > 0){
+ const float* aPtr = inputBuffer;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ returnValue += (*aPtr) * (*aPtr);
+ aPtr++;
+ }
+
+ returnValue /= num_points;
+ returnValue -= (mean * mean);
+ returnValue = sqrtf(returnValue);
+ }
+ *stddev = returnValue;
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_s32f_stddev_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_sin_32f
+ *
+ * \b Overview
+ *
+ * Computes the sine of the input vector and stores the results in the output vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_sin_32f(float* bVector, const float* aVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li bVector: The output vector.
+ *
+ * \b Example
+ * Calculate sin(theta) for several common angles (in radians).
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * in[0] = 0.000;
+ * in[1] = 0.524;
+ * in[2] = 0.785;
+ * in[3] = 1.047;
+ * in[4] = 1.571;
+ * in[5] = 1.571;
+ * in[6] = 2.094;
+ * in[7] = 2.356;
+ * in[8] = 2.618;
+ * in[9] = 3.142;
+ *
+ * volk_32f_sin_32f(out, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("sin(%1.3f) = %1.3f\n", in[ii], out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
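+/*
+ * Implementation note (editorial summary of the SSE4.1 kernels below):
+ *  1. Take s = |x| and reduce it by an even multiple of pi/4, found via
+ *     q = floor(s * 4/pi) and r = q + (q & 1); pi/4 is applied in two parts
+ *     (pio4A + pio4B) for extra precision.
+ *  2. Scale the reduced angle by 1/8 (y = s/8) and evaluate the series
+ *     2*(1 - cos(y)) = y^2*(1 - y^2/12 + y^4/360 - y^6/20160 + y^8/1814400),
+ *     which is what the cp1..cp5 coefficients encode.
+ *  3. Undo the scaling with three doubling steps s <- s*(4 - s), using
+ *     2*(1 - cos(2y)) = (2*(1 - cos(y)))*(4 - 2*(1 - cos(y))), then halve so
+ *     that s = 1 - cos(reduced angle).
+ *  4. Recover sine = sqrt((2 - s)*s) and cosine = 1 - s, then fix up the
+ *     quadrant and sign from q and the sign of the input (condition1/2).
+ */
+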
+#include <stdio.h>
+#include <math.h>
+#include <inttypes.h>
+
+#ifndef INCLUDED_volk_32f_sin_32f_a_H
+#define INCLUDED_volk_32f_sin_32f_a_H
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_sin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ unsigned int i = 0;
+
+ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
+ __m128 sine, cosine, condition1, condition2;
+ __m128i q, r, ones, twos, fours;
+
+ m4pi = _mm_set1_ps(1.273239545);
+ pio4A = _mm_set1_ps(0.78515625);
+ pio4B = _mm_set1_ps(0.241876e-3);
+ ffours = _mm_set1_ps(4.0);
+ ftwos = _mm_set1_ps(2.0);
+ fones = _mm_set1_ps(1.0);
+ fzeroes = _mm_setzero_ps();
+ ones = _mm_set1_epi32(1);
+ twos = _mm_set1_epi32(2);
+ fours = _mm_set1_epi32(4);
+
+ cp1 = _mm_set1_ps(1.0);
+ cp2 = _mm_set1_ps(0.83333333e-1);
+ cp3 = _mm_set1_ps(0.2777778e-2);
+ cp4 = _mm_set1_ps(0.49603e-4);
+ cp5 = _mm_set1_ps(0.551e-6);
+
+ for(;number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
+ s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
+ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
+ r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+ s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
+
+ for(i = 0; i < 3; i++) {
+ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+ }
+ s = _mm_div_ps(s, ftwos);
+
+ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+ cosine = _mm_sub_ps(fones, s);
+
+ condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
+ condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
+ // Need this condition only for cos
+ //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
+
+ sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
+ sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
+ _mm_store_ps(bPtr, sine);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++) {
+ *bPtr++ = sin(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
+
+
+#endif /* INCLUDED_volk_32f_sin_32f_a_H */
+
+#ifndef INCLUDED_volk_32f_sin_32f_u_H
+#define INCLUDED_volk_32f_sin_32f_u_H
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_sin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ unsigned int i = 0;
+
+ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
+ __m128 sine, cosine, condition1, condition2;
+ __m128i q, r, ones, twos, fours;
+
+ m4pi = _mm_set1_ps(1.273239545);
+ pio4A = _mm_set1_ps(0.78515625);
+ pio4B = _mm_set1_ps(0.241876e-3);
+ ffours = _mm_set1_ps(4.0);
+ ftwos = _mm_set1_ps(2.0);
+ fones = _mm_set1_ps(1.0);
+ fzeroes = _mm_setzero_ps();
+ ones = _mm_set1_epi32(1);
+ twos = _mm_set1_epi32(2);
+ fours = _mm_set1_epi32(4);
+
+ cp1 = _mm_set1_ps(1.0);
+ cp2 = _mm_set1_ps(0.83333333e-1);
+ cp3 = _mm_set1_ps(0.2777778e-2);
+ cp4 = _mm_set1_ps(0.49603e-4);
+ cp5 = _mm_set1_ps(0.551e-6);
+
+ for(;number < quarterPoints; number++) {
+ aVal = _mm_loadu_ps(aPtr);
+ s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
+ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
+ r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+ s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
+
+ for(i = 0; i < 3; i++) {
+ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+ }
+ s = _mm_div_ps(s, ftwos);
+
+ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+ cosine = _mm_sub_ps(fones, s);
+
+ condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
+ condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
+
+ sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
+ sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
+ _mm_storeu_ps(bPtr, sine);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *bPtr++ = sin(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for unaligned */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_sin_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++) {
+ *bPtr++ = sin(*aPtr++);
+ }
+
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_sin_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_sqrt_32f
+ *
+ * \b Overview
+ *
+ * Computes the square root of the input vector and stores the results
+ * in the output vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_sqrt_32f(float* cVector, const float* aVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ * \code
+ int N = 10;
+ unsigned int alignment = volk_get_alignment();
+ float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+
+ for(unsigned int ii = 0; ii < N; ++ii){
+ in[ii] = (float)(ii*ii);
+ }
+
+ volk_32f_sqrt_32f(out, in, N);
+
+ for(unsigned int ii = 0; ii < N; ++ii){
+ printf("out(%i) = %f\n", ii, out[ii]);
+ }
+
+ volk_free(in);
+ volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_sqrt_32f_a_H
+#define INCLUDED_volk_32f_sqrt_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m128 aVal, cVal;
+ for(;number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
+
+ cVal = _mm_sqrt_ps(aVal);
+
+ _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++) {
+ *cPtr++ = sqrtf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_sqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m256 aVal, cVal;
+ for(;number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+
+ cVal = _mm256_sqrt_ps(aVal);
+
+ _mm256_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(;number < num_points; number++) {
+ *cPtr++ = sqrtf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_32f_sqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+ float32x4_t in_vec, out_vec;
+
+ for(number = 0; number < quarter_points; number++) {
+ in_vec = vld1q_f32(aPtr);
+ // Approximate sqrt(x) as the reciprocal estimate of the reciprocal-square-root
+ // estimate; the result is only as accurate as those NEON estimate instructions.
+ // On ARMv8/AArch64 the vsqrtq_f32 intrinsic computes an exact square root.
+ out_vec = vrecpeq_f32(vrsqrteq_f32(in_vec));
+ vst1q_f32(cPtr, out_vec);
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ for(number = quarter_points * 4; number < num_points; number++) {
+ *cPtr++ = sqrtf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_sqrt_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++) {
+ *cPtr++ = sqrtf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_ORC
+
+extern void
+volk_32f_sqrt_32f_a_orc_impl(float *, const float*, unsigned int);
+
+static inline void
+volk_32f_sqrt_32f_u_orc(float* cVector, const float* aVector, unsigned int num_points)
+{
+ volk_32f_sqrt_32f_a_orc_impl(cVector, aVector, num_points);
+}
+
+#endif /* LV_HAVE_ORC */
+
+#endif /* INCLUDED_volk_32f_sqrt_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_stddev_and_mean_32f_x2
+ *
+ * \b Overview
+ *
+ * Computes the standard deviation and mean of the input buffer.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_stddev_and_mean_32f_x2(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li inputBuffer: The buffer of points.
+ * \li num_points: The number of values in the input buffer.
+ *
+ * \b Outputs
+ * \li stddev: The calculated standard deviation.
+ * \li mean: The mean of the input buffer.
+ *
+ * \b Example
+ * Generate random numbers with C++11's normal distribution and estimate the mean and standard deviation.
+ * \code
+ * int N = 1000;
+ * unsigned int alignment = volk_get_alignment();
+ * float* rand_numbers = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* mean = (float*)volk_malloc(sizeof(float), alignment);
+ * float* stddev = (float*)volk_malloc(sizeof(float), alignment);
+ *
+ * // Use a normal generator with 0 mean, stddev 1
+ * std::default_random_engine generator;
+ * std::normal_distribution<float> distribution(0,1);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * rand_numbers[ii] = distribution(generator);
+ * }
+ *
+ * volk_32f_stddev_and_mean_32f_x2(stddev, mean, rand_numbers, N);
+ *
+ * printf("std. dev. = %f\n", *stddev);
+ * printf("mean = %f\n", *mean);
+ *
+ * volk_free(rand_numbers);
+ * volk_free(mean);
+ * volk_free(stddev);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
+#define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
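+// Implementation note (editorial): every variant below uses the one-pass formulas
+//   mean   = sum(x[i]) / num_points
+//   stddev = sqrt(sum(x[i]^2)/num_points - mean^2)
+// which require only a single sweep over the buffer, but can lose precision when
+// mean^2 is close to the mean of the squares.
+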
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev, float* mean,
+ const float* inputBuffer,
+ unsigned int num_points)
+{
+ float returnValue = 0;
+ float newMean = 0;
+ if(num_points > 0){
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ const float* aPtr = inputBuffer;
+ __VOLK_ATTR_ALIGNED(16) float meanBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
+
+ __m128 accumulator = _mm_setzero_ps();
+ __m128 squareAccumulator = _mm_setzero_ps();
+ __m128 aVal1, aVal2, aVal3, aVal4;
+ __m128 cVal1, cVal2, cVal3, cVal4;
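+ // Note: the _mm_dp_ps masks 0xF1, 0xF2, 0xF4 and 0xF8 all multiply-and-sum
+ // every element of the register (high nibble 0xF) but deposit the scalar
+ // result in a different output lane (low nibble), so the four partial sums
+ // of squares can be OR-ed together and accumulated with a single add.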
+ for(;number < sixteenthPoints; number++) {
+ aVal1 = _mm_load_ps(aPtr); aPtr += 4;
+ cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
+ accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x
+
+ aVal2 = _mm_load_ps(aPtr); aPtr += 4;
+ cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
+ accumulator = _mm_add_ps(accumulator, aVal2); // accumulator += x
+
+ aVal3 = _mm_load_ps(aPtr); aPtr += 4;
+ cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
+ accumulator = _mm_add_ps(accumulator, aVal3); // accumulator += x
+
+ aVal4 = _mm_load_ps(aPtr); aPtr += 4;
+ cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
+ accumulator = _mm_add_ps(accumulator, aVal4); // accumulator += x
+
+ cVal1 = _mm_or_ps(cVal1, cVal2);
+ cVal3 = _mm_or_ps(cVal3, cVal4);
+ cVal1 = _mm_or_ps(cVal1, cVal3);
+
+ squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+ }
+ _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
+ _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
+ newMean = meanBuffer[0];
+ newMean += meanBuffer[1];
+ newMean += meanBuffer[2];
+ newMean += meanBuffer[3];
+ returnValue = squareBuffer[0];
+ returnValue += squareBuffer[1];
+ returnValue += squareBuffer[2];
+ returnValue += squareBuffer[3];
+
+ number = sixteenthPoints * 16;
+ for(;number < num_points; number++){
+ returnValue += (*aPtr) * (*aPtr);
+ newMean += *aPtr++;
+ }
+ newMean /= num_points;
+ returnValue /= num_points;
+ returnValue -= (newMean * newMean);
+ returnValue = sqrtf(returnValue);
+ }
+ *stddev = returnValue;
+ *mean = newMean;
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev, float* mean,
+ const float* inputBuffer,
+ unsigned int num_points)
+{
+ float returnValue = 0;
+ float newMean = 0;
+ if(num_points > 0){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* aPtr = inputBuffer;
+ __VOLK_ATTR_ALIGNED(16) float meanBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
+
+ __m128 accumulator = _mm_setzero_ps();
+ __m128 squareAccumulator = _mm_setzero_ps();
+ __m128 aVal = _mm_setzero_ps();
+ for(;number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr); // aVal = x
+ accumulator = _mm_add_ps(accumulator, aVal); // accumulator += x
+ aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2
+ squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
+ aPtr += 4;
+ }
+ _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
+ _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
+ newMean = meanBuffer[0];
+ newMean += meanBuffer[1];
+ newMean += meanBuffer[2];
+ newMean += meanBuffer[3];
+ returnValue = squareBuffer[0];
+ returnValue += squareBuffer[1];
+ returnValue += squareBuffer[2];
+ returnValue += squareBuffer[3];
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ returnValue += (*aPtr) * (*aPtr);
+ newMean += *aPtr++;
+ }
+ newMean /= num_points;
+ returnValue /= num_points;
+ returnValue -= (newMean * newMean);
+ returnValue = sqrtf(returnValue);
+ }
+ *stddev = returnValue;
+ *mean = newMean;
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_stddev_and_mean_32f_x2_generic(float* stddev, float* mean,
+ const float* inputBuffer,
+ unsigned int num_points)
+{
+ float returnValue = 0;
+ float newMean = 0;
+ if(num_points > 0){
+ const float* aPtr = inputBuffer;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ returnValue += (*aPtr) * (*aPtr);
+ newMean += *aPtr++;
+ }
+ newMean /= num_points;
+ returnValue /= num_points;
+ returnValue -= (newMean * newMean);
+ returnValue = sqrtf(returnValue);
+ }
+ *stddev = returnValue;
+ *mean = newMean;
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_tan_32f
+ *
+ * \b Overview
+ *
+ * Computes the tangent of each element of the aVector.
+ *
+ * b[i] = tan(a[i])
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_tan_32f(float* bVector, const float* aVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: The buffer of points.
+ * \li num_points: The number of values in input buffer.
+ *
+ * \b Outputs
+ * \li bVector: The output buffer.
+ *
+ * \b Example
+ * Calculate tan(theta) for common angles.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * in[0] = 0.000;
+ * in[1] = 0.524;
+ * in[2] = 0.785;
+ * in[3] = 1.047;
+ * in[4] = 1.571;
+ * in[5] = 1.571;
+ * in[6] = -1.047;
+ * in[7] = -0.785;
+ * in[8] = -0.524;
+ * in[9] = -0.000;
+ *
+ * volk_32f_tan_32f(out, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("tan(%1.3f) = %1.3f\n", in[ii], out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
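+/*
+ * Implementation note (editorial): the SSE4.1 kernels below reuse the same
+ * pi/4 argument reduction and half-angle scheme as volk_32f_sin_32f to build
+ * both the sine and the cosine of the reduced angle, swap or negate them
+ * according to the quadrant bits in q (condition1..condition3), and return
+ * their ratio sine/cosine. The scalar tail simply calls tan().
+ */
+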
+#include <stdio.h>
+#include <math.h>
+#include <inttypes.h>
+
+#ifndef INCLUDED_volk_32f_tan_32f_a_H
+#define INCLUDED_volk_32f_tan_32f_a_H
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector,
+ unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ unsigned int i = 0;
+
+ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
+ __m128 sine, cosine, tangent, condition1, condition2, condition3;
+ __m128i q, r, ones, twos, fours;
+
+ m4pi = _mm_set1_ps(1.273239545);
+ pio4A = _mm_set1_ps(0.78515625);
+ pio4B = _mm_set1_ps(0.241876e-3);
+ ffours = _mm_set1_ps(4.0);
+ ftwos = _mm_set1_ps(2.0);
+ fones = _mm_set1_ps(1.0);
+ fzeroes = _mm_setzero_ps();
+ ones = _mm_set1_epi32(1);
+ twos = _mm_set1_epi32(2);
+ fours = _mm_set1_epi32(4);
+
+ cp1 = _mm_set1_ps(1.0);
+ cp2 = _mm_set1_ps(0.83333333e-1);
+ cp3 = _mm_set1_ps(0.2777778e-2);
+ cp4 = _mm_set1_ps(0.49603e-4);
+ cp5 = _mm_set1_ps(0.551e-6);
+
+ for(;number < quarterPoints; number++){
+ aVal = _mm_load_ps(aPtr);
+ s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
+ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
+ r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+ s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
+
+ for(i = 0; i < 3; i++){
+ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+ }
+ s = _mm_div_ps(s, ftwos);
+
+ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+ cosine = _mm_sub_ps(fones, s);
+
+ condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
+ condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
+ condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
+
+ __m128 temp = cosine;
+ cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
+ sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
+ sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
+ cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
+ tangent = _mm_div_ps(sine, cosine);
+ _mm_store_ps(bPtr, tangent);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *bPtr++ = tan(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
+
+
+#endif /* INCLUDED_volk_32f_tan_32f_a_H */
+
+#ifndef INCLUDED_volk_32f_tan_32f_u_H
+#define INCLUDED_volk_32f_tan_32f_u_H
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_tan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ unsigned int i = 0;
+
+ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
+ __m128 sine, cosine, tangent, condition1, condition2, condition3;
+ __m128i q, r, ones, twos, fours;
+
+ m4pi = _mm_set1_ps(1.273239545);
+ pio4A = _mm_set1_ps(0.78515625);
+ pio4B = _mm_set1_ps(0.241876e-3);
+ ffours = _mm_set1_ps(4.0);
+ ftwos = _mm_set1_ps(2.0);
+ fones = _mm_set1_ps(1.0);
+ fzeroes = _mm_setzero_ps();
+ ones = _mm_set1_epi32(1);
+ twos = _mm_set1_epi32(2);
+ fours = _mm_set1_epi32(4);
+
+ cp1 = _mm_set1_ps(1.0);
+ cp2 = _mm_set1_ps(0.83333333e-1);
+ cp3 = _mm_set1_ps(0.2777778e-2);
+ cp4 = _mm_set1_ps(0.49603e-4);
+ cp5 = _mm_set1_ps(0.551e-6);
+
+ for(;number < quarterPoints; number++){
+ aVal = _mm_loadu_ps(aPtr);
+ s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
+ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
+ r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+ s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
+
+ for(i = 0; i < 3; i++){
+ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+ }
+ s = _mm_div_ps(s, ftwos);
+
+ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+ cosine = _mm_sub_ps(fones, s);
+
+ condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
+ condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
+ condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
+
+ __m128 temp = cosine;
+ cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
+ sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
+ sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
+ cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
+ tangent = _mm_div_ps(sine, cosine);
+ _mm_storeu_ps(bPtr, tangent);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *bPtr++ = tan(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for unaligned */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_tan_32f_generic(float* bVector, const float* aVector,
+ unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for(; number < num_points; number++){
+ *bPtr++ = tan(*aPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_tan_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_tanh_32f
+ *
+ * \b Overview
+ *
+ * Computes the hyperbolic tangent of each element of the aVector:
+ *
+ * c[i] = tanh(a[i])
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_tanh_32f(float* cVector, const float* aVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: The buffer of points.
+ * \li num_points: The number of values in input buffer.
+ *
+ * \b Outputs
+ * \li cVector: The output buffer.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * // atanh(x) for 0 <= x < 1, so that tanh(in[ii]) recovers x
+ * float x = (float)ii / (float)N;
+ * in[ii] = 0.5 * std::log((1.f+x)/(1.f-x));
+ * }
+ *
+ * volk_32f_tanh_32f(out, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out(%i) = %f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_tanh_32f_a_H
+#define INCLUDED_volk_32f_tanh_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+#include <string.h>
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_tanh_32f_generic(float* cVector, const float* aVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ for(; number < num_points; number++) {
+ *cPtr++ = tanh(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_GENERIC
+
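+// Note (editorial): the constants below implement the rational approximation
+//   tanh(x) ~= x*(135135 + 17325*x^2 + 378*x^4 + x^6)
+//            / (135135 + 62370*x^2 + 3150*x^4 + 28*x^6),
+// a truncation of the continued-fraction expansion of tanh (135135 = 13!!).
+// For |x| > 4.97 the ratio starts to drift past +/-1, so the result is
+// clamped there instead.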
+static inline void
+volk_32f_tanh_32f_series(float* cVector, const float* aVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ for(; number < num_points; number++) {
+ if(*aPtr > 4.97)
+ *cPtr++ = 1;
+ else if(*aPtr <= -4.97)
+ *cPtr++ = -1;
+ else {
+ float x2 = (*aPtr) * (*aPtr);
+ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+ *cPtr++ = a / b;
+ }
+ aPtr++; // advance the input on every iteration, not only in the polynomial branch
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m128 aVal, cVal, x2, a, b;
+ __m128 const1, const2, const3, const4, const5, const6;
+ const1 = _mm_set_ps1(135135.0f);
+ const2 = _mm_set_ps1(17325.0f);
+ const3 = _mm_set_ps1(378.0f);
+ const4 = _mm_set_ps1(62370.0f);
+ const5 = _mm_set_ps1(3150.0f);
+ const6 = _mm_set_ps1(28.0f);
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_load_ps(aPtr);
+ x2 = _mm_mul_ps(aVal, aVal);
+ a = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
+ b = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
+
+ cVal = _mm_div_ps(a, b);
+
+ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++) {
+ if(*aPtr > 4.97)
+ *cPtr++ = 1;
+ else if(*aPtr <= -4.97)
+ *cPtr++ = -1;
+ else {
+ float x2 = (*aPtr) * (*aPtr);
+ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+ *cPtr++ = a / b;
+ }
+ aPtr++; // advance the input on every iteration, not only in the polynomial branch
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m256 aVal, cVal, x2, a, b;
+ __m256 const1, const2, const3, const4, const5, const6;
+ const1 = _mm256_set1_ps(135135.0f);
+ const2 = _mm256_set1_ps(17325.0f);
+ const3 = _mm256_set1_ps(378.0f);
+ const4 = _mm256_set1_ps(62370.0f);
+ const5 = _mm256_set1_ps(3150.0f);
+ const6 = _mm256_set1_ps(28.0f);
+ for(;number < eighthPoints; number++){
+
+ aVal = _mm256_load_ps(aPtr);
+ x2 = _mm256_mul_ps(aVal, aVal);
+ a = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
+ b = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
+
+ cVal = _mm256_div_ps(a, b);
+
+ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(;number < num_points; number++) {
+ if(*aPtr > 4.97)
+ *cPtr++ = 1;
+ else if(*aPtr <= -4.97)
+ *cPtr++ = -1;
+ else {
+ float x2 = (*aPtr) * (*aPtr);
+ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+ *cPtr++ = a / b;
+ }
+ aPtr++; // advance the input on every iteration, not only in the polynomial branch
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m128 aVal, cVal, x2, a, b;
+ __m128 const1, const2, const3, const4, const5, const6;
+ const1 = _mm_set_ps1(135135.0f);
+ const2 = _mm_set_ps1(17325.0f);
+ const3 = _mm_set_ps1(378.0f);
+ const4 = _mm_set_ps1(62370.0f);
+ const5 = _mm_set_ps1(3150.0f);
+ const6 = _mm_set_ps1(28.0f);
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_loadu_ps(aPtr);
+ x2 = _mm_mul_ps(aVal, aVal);
+ a = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
+ b = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
+
+ cVal = _mm_div_ps(a, b);
+
+ _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++) {
+ if(*aPtr > 4.97)
+ *cPtr++ = 1;
+ else if(*aPtr <= -4.97)
+ *cPtr++ = -1;
+ else {
+ float x2 = (*aPtr) * (*aPtr);
+ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+ *cPtr++ = a / b;
+ }
+ aPtr++; // advance the input on every iteration, not only in the polynomial branch
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m256 aVal, cVal, x2, a, b;
+ __m256 const1, const2, const3, const4, const5, const6;
+ const1 = _mm256_set1_ps(135135.0f);
+ const2 = _mm256_set1_ps(17325.0f);
+ const3 = _mm256_set1_ps(378.0f);
+ const4 = _mm256_set1_ps(62370.0f);
+ const5 = _mm256_set1_ps(3150.0f);
+ const6 = _mm256_set1_ps(28.0f);
+ for(;number < eighthPoints; number++){
+
+ aVal = _mm256_loadu_ps(aPtr);
+ x2 = _mm256_mul_ps(aVal, aVal);
+ a = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
+ b = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
+
+ cVal = _mm256_div_ps(a, b);
+
+ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(;number < num_points; number++) {
+ if(*aPtr > 4.97)
+ *cPtr++ = 1;
+ else if(*aPtr <= -4.97)
+ *cPtr++ = -1;
+ else {
+ float x2 = (*aPtr) * (*aPtr);
+ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+ *cPtr++ = a / b;
+ }
+ aPtr++; // advance the input on every iteration, not only in the polynomial branch
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#endif /* INCLUDED_volk_32f_tanh_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_x2_add_32f
+ *
+ * \b Overview
+ *
+ * Adds two vectors together element by element:
+ *
+ * c[i] = a[i] + b[i]
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_x2_add_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: First vector of input points.
+ * \li bVector: Second vector of input points.
+ * \li num_points: The number of values in both input vectors.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ *
+ * The following example adds an increasing vector and a decreasing vector so that every element of the result is 10.
+ *
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* decreasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = (float)ii;
+ * decreasing[ii] = 10.f - (float)ii;
+ * }
+ *
+ * volk_32f_x2_add_32f(out, increasing, decreasing, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %1.2f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(decreasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_x2_add_32f_u_H
+#define INCLUDED_volk_32f_x2_add_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_x2_add_32f_u_sse(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m128 aVal, bVal, cVal;
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_loadu_ps(aPtr);
+ bVal = _mm_loadu_ps(bPtr);
+
+ cVal = _mm_add_ps(aVal, bVal);
+
+ _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_x2_add_32f_generic(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_x2_add_32f_u_H */
+#ifndef INCLUDED_volk_32f_x2_add_32f_a_H
+#define INCLUDED_volk_32f_x2_add_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_x2_add_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m128 aVal, bVal, cVal;
+ for(;number < quarterPoints; number++){
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_add_ps(aVal, bVal);
+
+ _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_32f_x2_add_32f_u_neon(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+ float32x4_t aVal, bVal, cVal;
+ for(number=0; number < quarterPoints; number++){
+ // Load into NEON registers
+ aVal = vld1q_f32(aPtr);
+ bVal = vld1q_f32(bPtr);
+ __VOLK_PREFETCH(aPtr+4);
+ __VOLK_PREFETCH(bPtr+4);
+
+ // vector add
+ cVal = vaddq_f32(aVal, bVal);
+ // Store the results back into the C container
+ vst1q_f32(cPtr,cVal);
+
+ aPtr += 4; // q uses quadwords, 4 floats per vadd
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4; // finish any points left over from the vectorized loop
+ for(;number < num_points; number++){
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_NEON
+extern void volk_32f_x2_add_32f_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_NEON
+extern void volk_32f_x2_add_32f_neonpipeline(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_x2_add_32f_a_generic(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_ORC
+
+extern void
+volk_32f_x2_add_32f_a_orc_impl(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points);
+
+static inline void
+volk_32f_x2_add_32f_u_orc(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points){
+ volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_x2_add_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_x2_divide_32f
+ *
+ * \b Overview
+ *
+ * Divides aVector by bVector to produce cVector:
+ *
+ * c[i] = a[i] / b[i]
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_x2_divide_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: First vector of input points.
+ * \li bVector: Second vector of input points.
+ * \li num_points: The number of values in both input vectors.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ * Divide an increasing vector by a decreasing vector
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* decreasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = (float)ii;
+ * decreasing[ii] = 10.f - (float)ii;
+ * }
+ *
+ * volk_32f_x2_divide_32f(out, increasing, decreasing, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %1.2f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(decreasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_x2_divide_32f_a_H
+#define INCLUDED_volk_32f_x2_divide_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_x2_divide_32f_a_sse(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m128 aVal, bVal, cVal;
+ for(;number < quarterPoints; number++){
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_div_ps(aVal, bVal);
+
+ _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *cPtr++ = (*aPtr++) / (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_x2_divide_32f_a_avx(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m256 aVal, bVal, cVal;
+ for(;number < eighthPoints; number++){
+ aVal = _mm256_load_ps(aPtr);
+ bVal = _mm256_load_ps(bPtr);
+
+ cVal = _mm256_div_ps(aVal, bVal);
+
+ _mm256_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(;number < num_points; number++){
+ *cPtr++ = (*aPtr++) / (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_x2_divide_32f_generic(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++) / (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_ORC
+
+extern void
+volk_32f_x2_divide_32f_a_orc_impl(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points);
+
+static inline void
+volk_32f_x2_divide_32f_u_orc(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ volk_32f_x2_divide_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+#endif /* INCLUDED_volk_32f_x2_divide_32f_a_H */
+
+
+#ifndef INCLUDED_volk_32f_x2_divide_32f_u_H
+#define INCLUDED_volk_32f_x2_divide_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_x2_divide_32f_u_avx(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m256 aVal, bVal, cVal;
+ for(;number < eighthPoints; number++){
+ aVal = _mm256_loadu_ps(aPtr);
+ bVal = _mm256_loadu_ps(bPtr);
+
+ cVal = _mm256_div_ps(aVal, bVal);
+
+ _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(;number < num_points; number++){
+ *cPtr++ = (*aPtr++) / (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#endif /* INCLUDED_volk_32f_x2_divide_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_x2_dot_prod_16i
+ *
+ * \b Overview
+ *
+ * This block computes the dot product (or inner product) between two
+ * vectors, the \p input and \p taps vectors. Given a set of \p
+ * num_points taps, the result is the sum of products between the two
+ * vectors. The result is a single value stored in the \p result
+ * address after being converted to a fixed-point short.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_x2_dot_prod_16i(int16_t* result, const float* input, const float* taps, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li input: vector of floats.
+ * \li taps: float taps.
+ * \li num_points: number of samples in both \p input and \p taps.
+ *
+ * \b Outputs
+ * \li result: pointer to a short value to hold the dot product result.
+ *
+ * \b Example
+ * Take the dot product of an increasing vector and a vector of ones (the sum of 0..9 = 45).
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* ones = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * int16_t* out = (int16_t*)volk_malloc(sizeof(int16_t), alignment);
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = (float)ii;
+ * ones[ii] = 1.f;
+ * }
+ * volk_32f_x2_dot_prod_16i(out, increasing, ones, N);
+ * printf("out = %d\n", (int)*out);
+ * volk_free(increasing);
+ * volk_free(ones);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_x2_dot_prod_16i_H
+#define INCLUDED_volk_32f_x2_dot_prod_16i_H
+
+#include <volk/volk_common.h>
+#include <stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr= taps;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
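+ // Note: the accumulated float is simply truncated toward zero by the cast;
+ // values outside the int16_t range are not saturated by this kernel.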
+ *result = (int16_t)dotProduct;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_AVX
+
+static inline void volk_32f_x2_dot_prod_16i_a_avx(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
+
+ unsigned int number = 0;
+ const unsigned int thirtySecondPoints = num_points / 32;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val, a2Val, a3Val;
+ __m256 b0Val, b1Val, b2Val, b3Val;
+ __m256 c0Val, c1Val, c2Val, c3Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 dotProdVal2 = _mm256_setzero_ps();
+ __m256 dotProdVal3 = _mm256_setzero_ps();
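+ // Four independent accumulators (combined after the loop) keep the adds of
+ // consecutive iterations from forming one long dependency chain.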
+
+ for(;number < thirtySecondPoints; number++){
+
+ a0Val = _mm256_load_ps(aPtr);
+ a1Val = _mm256_load_ps(aPtr+8);
+ a2Val = _mm256_load_ps(aPtr+16);
+ a3Val = _mm256_load_ps(aPtr+24);
+
+ b0Val = _mm256_load_ps(bPtr);
+ b1Val = _mm256_load_ps(bPtr+8);
+ b2Val = _mm256_load_ps(bPtr+16);
+ b3Val = _mm256_load_ps(bPtr+24);
+
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
+ c2Val = _mm256_mul_ps(a2Val, b2Val);
+ c3Val = _mm256_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 32;
+ bPtr += 32;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ dotProduct += dotProductVector[4];
+ dotProduct += dotProductVector[5];
+ dotProduct += dotProductVector[6];
+ dotProduct += dotProductVector[7];
+
+ number = thirtySecondPoints*32;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = (short)dotProduct;
+}
+
+#endif /*LV_HAVE_AVX*/
+
+
+#ifdef LV_HAVE_AVX
+
+static inline void volk_32f_x2_dot_prod_16i_u_avx(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
+
+ unsigned int number = 0;
+ const unsigned int thirtySecondPoints = num_points / 32;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val, a2Val, a3Val;
+ __m256 b0Val, b1Val, b2Val, b3Val;
+ __m256 c0Val, c1Val, c2Val, c3Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 dotProdVal2 = _mm256_setzero_ps();
+ __m256 dotProdVal3 = _mm256_setzero_ps();
+
+ for(;number < thirtySecondPoints; number++){
+
+ a0Val = _mm256_loadu_ps(aPtr);
+ a1Val = _mm256_loadu_ps(aPtr+8);
+ a2Val = _mm256_loadu_ps(aPtr+16);
+ a3Val = _mm256_loadu_ps(aPtr+24);
+
+ b0Val = _mm256_loadu_ps(bPtr);
+ b1Val = _mm256_loadu_ps(bPtr+8);
+ b2Val = _mm256_loadu_ps(bPtr+16);
+ b3Val = _mm256_loadu_ps(bPtr+24);
+
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
+ c2Val = _mm256_mul_ps(a2Val, b2Val);
+ c3Val = _mm256_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 32;
+ bPtr += 32;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ dotProduct += dotProductVector[4];
+ dotProduct += dotProductVector[5];
+ dotProduct += dotProductVector[6];
+ dotProduct += dotProductVector[7];
+
+ number = thirtySecondPoints*32;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = (short)dotProduct;
+}
+
+#endif /*LV_HAVE_AVX*/
+
+
+#ifdef LV_HAVE_SSE
+
+static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+
+ a0Val = _mm_load_ps(aPtr);
+ a1Val = _mm_load_ps(aPtr+4);
+ a2Val = _mm_load_ps(aPtr+8);
+ a3Val = _mm_load_ps(aPtr+12);
+ b0Val = _mm_load_ps(bPtr);
+ b1Val = _mm_load_ps(bPtr+4);
+ b2Val = _mm_load_ps(bPtr+8);
+ b3Val = _mm_load_ps(bPtr+12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+ _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = (short)dotProduct;
+}
+
+#endif /*LV_HAVE_SSE*/
+
+
+#ifdef LV_HAVE_SSE
+
+static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+
+ a0Val = _mm_loadu_ps(aPtr);
+ a1Val = _mm_loadu_ps(aPtr+4);
+ a2Val = _mm_loadu_ps(aPtr+8);
+ a3Val = _mm_loadu_ps(aPtr+12);
+ b0Val = _mm_loadu_ps(bPtr);
+ b1Val = _mm_loadu_ps(bPtr+4);
+ b2Val = _mm_loadu_ps(bPtr+8);
+ b3Val = _mm_loadu_ps(bPtr+12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+ _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+  *result = (int16_t)dotProduct;
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#endif /*INCLUDED_volk_32f_x2_dot_prod_16i_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_x2_dot_prod_32f
+ *
+ * \b Overview
+ *
+ * This block computes the dot product (or inner product) between two
+ * vectors, the \p input and \p taps vectors. Given a set of \p
+ * num_points taps, the result is the sum of products between the two
+ * vectors. The result is a single float value written to the \p result
+ * address.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_x2_dot_prod_32f(float* result, const float* input, const float* taps, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li input: vector of floats.
+ * \li taps: float taps.
+ * \li num_points: number of samples in both \p input and \p taps.
+ *
+ * \b Outputs
+ * \li result: pointer to a float value to hold the dot product result.
+ *
+ * \b Example
+ * Take the dot product of an increasing vector and a vector of ones. The result is the sum of the integers 0 through 9 (i.e., 45).
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* ones = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*1, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = (float)ii;
+ * ones[ii] = 1.f;
+ * }
+ *
+ * volk_32f_x2_dot_prod_32f(out, increasing, ones, N);
+ *
+ * printf("out = %1.2f\n", *out);
+ *
+ * volk_free(increasing);
+ * volk_free(ones);
+ * volk_free(out);
+ *
+ * return 0;
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
+#define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
+
+#include <volk/volk_common.h>
+#include <stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32f_x2_dot_prod_32f_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr= taps;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE
+
+
+static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+
+ a0Val = _mm_loadu_ps(aPtr);
+ a1Val = _mm_loadu_ps(aPtr+4);
+ a2Val = _mm_loadu_ps(aPtr+8);
+ a3Val = _mm_loadu_ps(aPtr+12);
+ b0Val = _mm_loadu_ps(bPtr);
+ b1Val = _mm_loadu_ps(bPtr+4);
+ b2Val = _mm_loadu_ps(bPtr+8);
+ b3Val = _mm_loadu_ps(bPtr+12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+ _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#ifdef LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+
+ a0Val = _mm_loadu_ps(aPtr);
+ a1Val = _mm_loadu_ps(aPtr+4);
+ a2Val = _mm_loadu_ps(aPtr+8);
+ a3Val = _mm_loadu_ps(aPtr+12);
+ b0Val = _mm_loadu_ps(bPtr);
+ b1Val = _mm_loadu_ps(bPtr+4);
+ b2Val = _mm_loadu_ps(bPtr+8);
+ b3Val = _mm_loadu_ps(bPtr+12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
+ dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
+ dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
+ dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+ _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 aVal1, bVal1, cVal1;
+ __m128 aVal2, bVal2, cVal2;
+ __m128 aVal3, bVal3, cVal3;
+ __m128 aVal4, bVal4, cVal4;
+
+ __m128 dotProdVal = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+
+ aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
+ aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
+ aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
+ aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
+
+ bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
+ bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
+ bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
+ bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
+
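+    // _mm_dp_ps computes a full 4-element dot product (mask high nibble 0xF) and writes it
+    // to the single output lane selected by the low nibble (0x1, 0x2, 0x4, 0x8), zeroing the
+    // rest, so the ORs below merge the four partial sums into one vector without shuffles.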
+ cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
+ cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
+ cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
+ cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
+
+ cVal1 = _mm_or_ps(cVal1, cVal2);
+ cVal3 = _mm_or_ps(cVal3, cVal4);
+ cVal1 = _mm_or_ps(cVal1, cVal3);
+
+ dotProdVal = _mm_add_ps(dotProdVal, cVal1);
+ }
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+ _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+
+ number = sixteenthPoints * 16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_u_avx( float* result, const float* input, const float* taps, unsigned int num_points) {
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val;
+ __m256 b0Val, b1Val;
+ __m256 c0Val, c1Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+
+ a0Val = _mm256_loadu_ps(aPtr);
+ a1Val = _mm256_loadu_ps(aPtr+8);
+ b0Val = _mm256_loadu_ps(bPtr);
+ b1Val = _mm256_loadu_ps(bPtr+8);
+
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
+
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ dotProduct += dotProductVector[4];
+ dotProduct += dotProductVector[5];
+ dotProduct += dotProductVector[6];
+ dotProduct += dotProductVector[7];
+
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+
+}
+
+#endif /*LV_HAVE_AVX*/
+
+#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/
+#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
+#define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
+
+#include <volk/volk_common.h>
+#include <stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr= taps;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE
+
+
+static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+
+ a0Val = _mm_load_ps(aPtr);
+ a1Val = _mm_load_ps(aPtr+4);
+ a2Val = _mm_load_ps(aPtr+8);
+ a3Val = _mm_load_ps(aPtr+12);
+ b0Val = _mm_load_ps(bPtr);
+ b1Val = _mm_load_ps(bPtr+4);
+ b2Val = _mm_load_ps(bPtr+8);
+ b3Val = _mm_load_ps(bPtr+12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+ _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#ifdef LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+
+ a0Val = _mm_load_ps(aPtr);
+ a1Val = _mm_load_ps(aPtr+4);
+ a2Val = _mm_load_ps(aPtr+8);
+ a3Val = _mm_load_ps(aPtr+12);
+ b0Val = _mm_load_ps(bPtr);
+ b1Val = _mm_load_ps(bPtr+4);
+ b2Val = _mm_load_ps(bPtr+8);
+ b3Val = _mm_load_ps(bPtr+12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
+ dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
+ dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
+ dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+ _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 aVal1, bVal1, cVal1;
+ __m128 aVal2, bVal2, cVal2;
+ __m128 aVal3, bVal3, cVal3;
+ __m128 aVal4, bVal4, cVal4;
+
+ __m128 dotProdVal = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+
+ aVal1 = _mm_load_ps(aPtr); aPtr += 4;
+ aVal2 = _mm_load_ps(aPtr); aPtr += 4;
+ aVal3 = _mm_load_ps(aPtr); aPtr += 4;
+ aVal4 = _mm_load_ps(aPtr); aPtr += 4;
+
+ bVal1 = _mm_load_ps(bPtr); bPtr += 4;
+ bVal2 = _mm_load_ps(bPtr); bPtr += 4;
+ bVal3 = _mm_load_ps(bPtr); bPtr += 4;
+ bVal4 = _mm_load_ps(bPtr); bPtr += 4;
+
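+    // As in the unaligned version: each _mm_dp_ps places its 4-element dot product in a
+    // different output lane (the low nibble of the mask), so the ORs below combine the
+    // four partial sums into one vector.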
+ cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
+ cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
+ cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
+ cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
+
+ cVal1 = _mm_or_ps(cVal1, cVal2);
+ cVal3 = _mm_or_ps(cVal3, cVal4);
+ cVal1 = _mm_or_ps(cVal1, cVal3);
+
+ dotProdVal = _mm_add_ps(dotProdVal, cVal1);
+ }
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+ _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+
+ number = sixteenthPoints * 16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_a_avx( float* result, const float* input, const float* taps, unsigned int num_points) {
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val;
+ __m256 b0Val, b1Val;
+ __m256 c0Val, c1Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+
+ a0Val = _mm256_load_ps(aPtr);
+ a1Val = _mm256_load_ps(aPtr+8);
+ b0Val = _mm256_load_ps(bPtr);
+ b1Val = _mm256_load_ps(bPtr+8);
+
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
+
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ dotProduct += dotProductVector[4];
+ dotProduct += dotProductVector[5];
+ dotProduct += dotProductVector[6];
+ dotProduct += dotProductVector[7];
+
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+
+}
+#endif /*LV_HAVE_AVX*/
+
+
+#if LV_HAVE_AVX2 && LV_HAVE_FMA
+#include <immintrin.h>
+static inline void volk_32f_x2_dot_prod_32f_a_avx2_fma(float * result, const float * input, const float* taps, unsigned int num_points){
+ unsigned int number;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m256 dotProdVal = _mm256_setzero_ps();
+ __m256 aVal1, bVal1;
+
+ for (number = 0; number < eighthPoints; number++ ) {
+
+ aVal1 = _mm256_load_ps(aPtr);
+ bVal1 = _mm256_load_ps(bPtr);
+ aPtr += 8;
+ bPtr += 8;
+
+ dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
+ }
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+ _mm256_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
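+  // Clear the upper halves of the YMM registers to avoid AVX/SSE transition penalties in
+  // any legacy-SSE code executed after this kernel.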
+ _mm256_zeroupper();
+
+ float dotProduct =
+ dotProductVector[0] + dotProductVector[1] +
+ dotProductVector[2] + dotProductVector[3] +
+ dotProductVector[4] + dotProductVector[5] +
+ dotProductVector[6] + dotProductVector[7];
+
+ for(number = eighthPoints * 8; number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+
+}
+#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32f_x2_dot_prod_32f_neonopts(float * result, const float * input, const float * taps, unsigned int num_points) {
+
+  unsigned int sixteenth_points = num_points / 16;
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr= taps;
+ unsigned int number = 0;
+
+ float32x4x4_t a_val, b_val, accumulator0;
+ accumulator0.val[0] = vdupq_n_f32(0);
+ accumulator0.val[1] = vdupq_n_f32(0);
+ accumulator0.val[2] = vdupq_n_f32(0);
+ accumulator0.val[3] = vdupq_n_f32(0);
+ // factor of 4 loop unroll with independent accumulators
+ // uses 12 out of 16 neon q registers
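+  // vld4q_f32 de-interleaves the data across the four lanes, which is harmless here
+  // because every product ends up summed into the same scalar result.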
+  for( number = 0; number < sixteenth_points; ++number) {
+ a_val = vld4q_f32(aPtr);
+ b_val = vld4q_f32(bPtr);
+ accumulator0.val[0] = vmlaq_f32(accumulator0.val[0], a_val.val[0], b_val.val[0]);
+ accumulator0.val[1] = vmlaq_f32(accumulator0.val[1], a_val.val[1], b_val.val[1]);
+ accumulator0.val[2] = vmlaq_f32(accumulator0.val[2], a_val.val[2], b_val.val[2]);
+ accumulator0.val[3] = vmlaq_f32(accumulator0.val[3], a_val.val[3], b_val.val[3]);
+ aPtr += 16;
+ bPtr += 16;
+ }
+ accumulator0.val[0] = vaddq_f32(accumulator0.val[0], accumulator0.val[1]);
+ accumulator0.val[2] = vaddq_f32(accumulator0.val[2], accumulator0.val[3]);
+ accumulator0.val[0] = vaddq_f32(accumulator0.val[2], accumulator0.val[0]);
+ __VOLK_ATTR_ALIGNED(32) float accumulator[4];
+ vst1q_f32(accumulator, accumulator0.val[0]);
+ dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
+
+  for(number = sixteenth_points*16; number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif
+
+
+
+
+#ifdef LV_HAVE_NEON
+static inline void volk_32f_x2_dot_prod_32f_neon(float * result, const float * input, const float * taps, unsigned int num_points) {
+
+  unsigned int eighth_points = num_points / 8;
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr= taps;
+ unsigned int number = 0;
+
+ float32x4x2_t a_val, b_val, accumulator_val;
+ accumulator_val.val[0] = vdupq_n_f32(0);
+ accumulator_val.val[1] = vdupq_n_f32(0);
+ // factor of 2 loop unroll with independent accumulators
+  for( number = 0; number < eighth_points; ++number) {
+ a_val = vld2q_f32(aPtr);
+ b_val = vld2q_f32(bPtr);
+ accumulator_val.val[0] = vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]);
+ accumulator_val.val[1] = vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]);
+ aPtr += 8;
+ bPtr += 8;
+ }
+ accumulator_val.val[0] = vaddq_f32(accumulator_val.val[0], accumulator_val.val[1]);
+ __VOLK_ATTR_ALIGNED(32) float accumulator[4];
+ vst1q_f32(accumulator, accumulator_val.val[0]);
+ dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
+
+  for(number = eighth_points*8; number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_NEON
+extern void volk_32f_x2_dot_prod_32f_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_NEON
+extern void volk_32f_x2_dot_prod_32f_neonasm_opts(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+#endif /* LV_HAVE_NEON */
+
+#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * VOLK is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * VOLK is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with VOLK; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef INCLUDED_volk_32f_x2_fm_detectpuppet_32f_H
+#define INCLUDED_volk_32f_x2_fm_detectpuppet_32f_H
+
+#include "volk_32f_s32f_32f_fm_detect_32f.h"
+
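+/*
+ * Note: this is a "puppet" kernel. It forwards to volk_32f_s32f_32f_fm_detect_32f
+ * with a fixed scalar bound of 1.0 so that the underlying kernel, whose signature
+ * takes a scalar parameter, can be exercised by the generic test machinery.
+ */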
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_x2_fm_detectpuppet_32f_a_sse(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points)
+{
+ const float bound = 1.0f;
+
+ volk_32f_s32f_32f_fm_detect_32f_a_sse(outputVector, inputVector, bound, saveValue, num_points);
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_x2_fm_detectpuppet_32f_generic(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points)
+{
+ const float bound = 1.0f;
+
+ volk_32f_s32f_32f_fm_detect_32f_generic(outputVector, inputVector, bound, saveValue, num_points);
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_x2_fm_detectpuppet_32f_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_x2_interleave_32fc
+ *
+ * \b Overview
+ *
+ * Takes input vector iBuffer as the real (inphase) part and input
+ * vector qBuffer as the imag (quadrature) part and combines them into
+ * a complex output vector.
+ *
+ * c[i] = complex(a[i], b[i])
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_x2_interleave_32fc(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li iBuffer: Input vector of samples for the real part.
+ * \li qBuffer: Input vector of samples for the imaginary part.
+ * \li num_points: The number of values in both input vectors.
+ *
+ * \b Outputs
+ * \li complexVector: The output vector of complex numbers.
+ *
+ * \b Example
+ * Generate the top half of the unit circle with real points equally spaced.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* imag = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* real = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * real[ii] = 2.f * ((float)ii / (float)N) - 1.f;
+ * imag[ii] = std::sqrt(1.f - real[ii] * real[ii]);
+ * }
+ *
+ *   volk_32f_x2_interleave_32fc(out, real, imag, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %1.2f + %1.2fj\n", ii, std::real(out[ii]), std::imag(out[ii]));
+ * }
+ *
+ * volk_free(imag);
+ * volk_free(real);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_x2_interleave_32fc_a_H
+#define INCLUDED_volk_32f_x2_interleave_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector, const float* iBuffer,
+ const float* qBuffer, unsigned int num_points)
+{
+ unsigned int number = 0;
+ float* complexVectorPtr = (float*)complexVector;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+
+ const uint64_t quarterPoints = num_points / 4;
+
+ __m128 iValue, qValue, cplxValue;
+ for(;number < quarterPoints; number++){
+ iValue = _mm_load_ps(iBufferPtr);
+ qValue = _mm_load_ps(qBufferPtr);
+
+ // Interleaves the lower two values in the i and q variables into one buffer
+ cplxValue = _mm_unpacklo_ps(iValue, qValue);
+ _mm_store_ps(complexVectorPtr, cplxValue);
+ complexVectorPtr += 4;
+
+ // Interleaves the upper two values in the i and q variables into one buffer
+ cplxValue = _mm_unpackhi_ps(iValue, qValue);
+ _mm_store_ps(complexVectorPtr, cplxValue);
+ complexVectorPtr += 4;
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ *complexVectorPtr++ = *iBufferPtr++;
+ *complexVectorPtr++ = *qBufferPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_32f_x2_interleave_32fc_neon(lv_32fc_t* complexVector, const float* iBuffer,
+ const float* qBuffer, unsigned int num_points)
+{
+ unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+ float* complexVectorPtr = (float*) complexVector;
+
+ float32x4x2_t complex_vec;
+ for(number=0; number < quarter_points; ++number) {
+ complex_vec.val[0] = vld1q_f32(iBuffer);
+ complex_vec.val[1] = vld1q_f32(qBuffer);
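+    // vst2q_f32 writes the two lanes interleaved (I0, Q0, I1, Q1, ...), which matches the
+    // memory layout of lv_32fc_t, so no explicit shuffling is needed.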
+ vst2q_f32(complexVectorPtr, complex_vec);
+ iBuffer += 4;
+ qBuffer += 4;
+ complexVectorPtr += 8;
+ }
+
+ for(number=quarter_points * 4; number < num_points; ++number) {
+ *complexVectorPtr++ = *iBuffer++;
+ *complexVectorPtr++ = *qBuffer++;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_x2_interleave_32fc_generic(lv_32fc_t* complexVector, const float* iBuffer,
+ const float* qBuffer, unsigned int num_points)
+{
+ float* complexVectorPtr = (float*)complexVector;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+ unsigned int number;
+
+ for(number = 0; number < num_points; number++){
+ *complexVectorPtr++ = *iBufferPtr++;
+ *complexVectorPtr++ = *qBufferPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+#endif /* INCLUDED_volk_32f_x2_interleave_32fc_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_x2_max_32f
+ *
+ * \b Overview
+ *
+ * Selects the maximum value of each entry pair from aVector and bVector
+ * and stores the results in cVector.
+ *
+ * c[i] = max(a[i], b[i])
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_x2_max_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: First input vector.
+ * \li bVector: Second input vector.
+ * \li num_points: The number of values in both input vectors.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* decreasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = (float)ii;
+ * decreasing[ii] = 10.f - (float)ii;
+ * }
+ *
+ * volk_32f_x2_max_32f(out, increasing, decreasing, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %1.2f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(decreasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_x2_max_32f_a_H
+#define INCLUDED_volk_32f_x2_max_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_x2_max_32f_a_sse(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m128 aVal, bVal, cVal;
+ for(;number < quarterPoints; number++){
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_max_ps(aVal, bVal);
+
+ _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = ( a > b ? a : b);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_x2_max_32f_a_avx(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+  const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m256 aVal, bVal, cVal;
+  for(;number < eighthPoints; number++){
+ aVal = _mm256_load_ps(aPtr);
+ bVal = _mm256_load_ps(bPtr);
+
+ cVal = _mm256_max_ps(aVal, bVal);
+
+ _mm256_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+  number = eighthPoints * 8;
+ for(;number < num_points; number++){
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = ( a > b ? a : b);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_32f_x2_max_32f_neon(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ unsigned int quarter_points = num_points / 4;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+ unsigned int number = 0;
+
+ float32x4_t a_vec, b_vec, c_vec;
+ for(number = 0; number < quarter_points; number++){
+ a_vec = vld1q_f32(aPtr);
+ b_vec = vld1q_f32(bPtr);
+ c_vec = vmaxq_f32(a_vec, b_vec);
+ vst1q_f32(cPtr, c_vec);
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ for(number = quarter_points*4; number < num_points; number++){
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = ( a > b ? a : b);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_x2_max_32f_generic(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = ( a > b ? a : b);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_ORC
+extern void
+volk_32f_x2_max_32f_a_orc_impl(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points);
+
+static inline void
+volk_32f_x2_max_32f_u_orc(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ volk_32f_x2_max_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_x2_max_32f_a_H */
+
+
+#ifndef INCLUDED_volk_32f_x2_max_32f_u_H
+#define INCLUDED_volk_32f_x2_max_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_x2_max_32f_u_avx(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+  const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m256 aVal, bVal, cVal;
+  for(;number < eighthPoints; number++){
+ aVal = _mm256_loadu_ps(aPtr);
+ bVal = _mm256_loadu_ps(bPtr);
+
+ cVal = _mm256_max_ps(aVal, bVal);
+
+ _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+  number = eighthPoints * 8;
+ for(;number < num_points; number++){
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = ( a > b ? a : b);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#endif /* INCLUDED_volk_32f_x2_max_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_x2_min_32f
+ *
+ * \b Overview
+ *
+ * Selects the minimum value of each entry pair from aVector and bVector
+ * and stores the results in cVector.
+ *
+ * c[i] = min(a[i], b[i])
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_x2_min_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: First input vector.
+ * \li bVector: Second input vector.
+ * \li num_points: The number of values in both input vectors.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* decreasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = (float)ii;
+ * decreasing[ii] = 10.f - (float)ii;
+ * }
+ *
+ * volk_32f_x2_min_32f(out, increasing, decreasing, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %1.2f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(decreasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_x2_min_32f_a_H
+#define INCLUDED_volk_32f_x2_min_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_x2_min_32f_a_sse(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m128 aVal, bVal, cVal;
+ for(;number < quarterPoints; number++){
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_min_ps(aVal, bVal);
+
+ _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = ( a < b ? a : b);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_x2_min_32f_a_avx(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+  const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m256 aVal, bVal, cVal;
+  for(;number < eighthPoints; number++){
+ aVal = _mm256_load_ps(aPtr);
+ bVal = _mm256_load_ps(bPtr);
+
+ cVal = _mm256_min_ps(aVal, bVal);
+
+ _mm256_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+  number = eighthPoints * 8;
+ for(;number < num_points; number++){
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = ( a < b ? a : b);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_32f_x2_min_32f_neon(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+
+ float32x4_t a_vec, b_vec, c_vec;
+ for(number = 0; number < quarter_points; number++){
+ a_vec = vld1q_f32(aPtr);
+ b_vec = vld1q_f32(bPtr);
+
+ c_vec = vminq_f32(a_vec, b_vec);
+
+ vst1q_f32(cPtr, c_vec);
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ for(number = quarter_points*4; number < num_points; number++){
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = ( a < b ? a : b);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_x2_min_32f_generic(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = ( a < b ? a : b);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_ORC
+
+extern void
+volk_32f_x2_min_32f_a_orc_impl(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points);
+
+static inline void
+volk_32f_x2_min_32f_u_orc(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ volk_32f_x2_min_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_x2_min_32f_a_H */
+
+
+#ifndef INCLUDED_volk_32f_x2_min_32f_u_H
+#define INCLUDED_volk_32f_x2_min_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_x2_min_32f_u_avx(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+  const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m256 aVal, bVal, cVal;
+  for(;number < eighthPoints; number++){
+ aVal = _mm256_loadu_ps(aPtr);
+ bVal = _mm256_loadu_ps(bPtr);
+
+ cVal = _mm256_min_ps(aVal, bVal);
+
+ _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+  number = eighthPoints * 8;
+ for(;number < num_points; number++){
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = ( a < b ? a : b);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#endif /* INCLUDED_volk_32f_x2_min_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_x2_multiply_32f
+ *
+ * \b Overview
+ *
+ * Multiplies two input floating point vectors together.
+ *
+ * c[i] = a[i] * b[i]
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_x2_multiply_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: First input vector.
+ * \li bVector: Second input vector.
+ * \li num_points: The number of values in both input vectors.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ * Multiply elements of an increasing vector by those of a decreasing vector.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* decreasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = (float)ii;
+ * decreasing[ii] = 10.f - (float)ii;
+ * }
+ *
+ * volk_32f_x2_multiply_32f(out, increasing, decreasing, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %1.2f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(decreasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_x2_multiply_32f_u_H
+#define INCLUDED_volk_32f_x2_multiply_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_x2_multiply_32f_u_sse(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m128 aVal, bVal, cVal;
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_loadu_ps(aPtr);
+ bVal = _mm_loadu_ps(bPtr);
+
+ cVal = _mm_mul_ps(aVal, bVal);
+
+ _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_x2_multiply_32f_u_avx(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m256 aVal, bVal, cVal;
+ for(;number < eighthPoints; number++){
+
+ aVal = _mm256_loadu_ps(aPtr);
+ bVal = _mm256_loadu_ps(bPtr);
+
+ cVal = _mm256_mul_ps(aVal, bVal);
+
+ _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(;number < num_points; number++){
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_x2_multiply_32f_generic(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_x2_multiply_32f_u_H */
+
+
+#ifndef INCLUDED_volk_32f_x2_multiply_32f_a_H
+#define INCLUDED_volk_32f_x2_multiply_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_x2_multiply_32f_a_sse(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m128 aVal, bVal, cVal;
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_mul_ps(aVal, bVal);
+
+ _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_x2_multiply_32f_a_avx(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m256 aVal, bVal, cVal;
+ for(;number < eighthPoints; number++){
+
+ aVal = _mm256_load_ps(aPtr);
+ bVal = _mm256_load_ps(bPtr);
+
+ cVal = _mm256_mul_ps(aVal, bVal);
+
+ _mm256_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(;number < num_points; number++){
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_32f_x2_multiply_32f_neon(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ const unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+ float32x4_t avec, bvec, cvec;
+ for(number=0; number < quarter_points; ++number) {
+ avec = vld1q_f32(aVector);
+ bvec = vld1q_f32(bVector);
+ cvec = vmulq_f32(avec, bvec);
+ vst1q_f32(cVector, cvec);
+ aVector += 4;
+ bVector += 4;
+ cVector += 4;
+ }
+ for(number=quarter_points*4; number < num_points; ++number) {
+ *cVector++ = *aVector++ * *bVector++;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_x2_multiply_32f_a_generic(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_ORC
+extern void
+volk_32f_x2_multiply_32f_a_orc_impl(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points);
+
+static inline void
+volk_32f_x2_multiply_32f_u_orc(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_x2_multiply_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_x2_pow_32f
+ *
+ * \b Overview
+ *
+ * Raises the sample in aVector to the power of the number in bVector.
+ *
+ * c[i] = pow(a[i], b[i])
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_x2_pow_32f(float* cVector, const float* bVector, const float* aVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li bVector: The input vector of indices (power values).
+ * \li aVector: The input vector of base values.
+ * \li num_points: The number of values in both input vectors.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ * Calculate the first ten powers of two (2^x for x = 0, ..., 9).
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* twos = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = (float)ii;
+ * twos[ii] = 2.f;
+ * }
+ *
+ * volk_32f_x2_pow_32f(out, increasing, twos, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %1.2f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(twos);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_x2_pow_32f_a_H
+#define INCLUDED_volk_32f_x2_pow_32f_a_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <math.h>
+
+#define POLY0(x, c0) _mm_set1_ps(c0)
+#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
+#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
+#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
+#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
+
+#define POW_POLY_DEGREE 3
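+
+/*
+ * POLYn evaluates a degree-n polynomial in x by Horner's rule using SSE intrinsics.
+ * Multiplied by (frac - 1) below, the polynomial approximates log2 of the mantissa
+ * in [1, 2); raising POW_POLY_DEGREE trades a few extra multiply-adds for accuracy.
+ */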
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_x2_pow_32f_a_sse4_1(float* cVector, const float* bVector,
+ const float* aVector, unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+ __m128 tmp, fx, mask, pow2n, z, y;
+ __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+ __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+ __m128i bias, exp, emm0, pi32_0x7f;
+
+ one = _mm_set1_ps(1.0);
+ exp_hi = _mm_set1_ps(88.3762626647949);
+ exp_lo = _mm_set1_ps(-88.3762626647949);
+ ln2 = _mm_set1_ps(0.6931471805);
+ log2EF = _mm_set1_ps(1.44269504088896341);
+ half = _mm_set1_ps(0.5);
+ exp_C1 = _mm_set1_ps(0.693359375);
+ exp_C2 = _mm_set1_ps(-2.12194440e-4);
+ pi32_0x7f = _mm_set1_epi32(0x7f);
+
+ exp_p0 = _mm_set1_ps(1.9875691500e-4);
+ exp_p1 = _mm_set1_ps(1.3981999507e-3);
+ exp_p2 = _mm_set1_ps(8.3334519073e-3);
+ exp_p3 = _mm_set1_ps(4.1665795894e-2);
+ exp_p4 = _mm_set1_ps(1.6666665459e-1);
+ exp_p5 = _mm_set1_ps(5.0000001201e-1);
+
+ for(;number < quarterPoints; number++){
+ // First compute the logarithm
+ aVal = _mm_load_ps(aPtr);
+ bias = _mm_set1_epi32(127);
+ leadingOne = _mm_set1_ps(1.0f);
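+    // Integer part of log2(a): mask the IEEE-754 exponent bits, shift them down and
+    // subtract the bias of 127; the mantissa is forced into [1, 2) by OR-ing in the
+    // implicit leading one.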
+ exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
+ logarithm = _mm_cvtepi32_ps(exp);
+
+ frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
+
+#if POW_POLY_DEGREE == 6
+ mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
+#elif POW_POLY_DEGREE == 5
+ mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+#elif POW_POLY_DEGREE == 4
+ mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+#elif POW_POLY_DEGREE == 3
+ mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+#else
+#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
+#endif
+
+ logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
+ logarithm = _mm_mul_ps(logarithm, ln2);
+
+
+ // Now calculate b*lna
+ bVal = _mm_load_ps(bPtr);
+ bVal = _mm_mul_ps(bVal, logarithm);
+
+ // Now compute exp(b*lna)
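+    // Cephes-style range reduction: x = b*ln(a) is split as x = n*ln(2) + r with
+    // n = round(x * log2(e)); exp(r) is approximated by a short polynomial and the
+    // 2^n factor is applied by adding n to the float exponent field (shift-left by 23).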
+ tmp = _mm_setzero_ps();
+
+ bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
+
+ fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
+
+ emm0 = _mm_cvttps_epi32(fx);
+ tmp = _mm_cvtepi32_ps(emm0);
+
+ mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
+ fx = _mm_sub_ps(tmp, mask);
+
+ tmp = _mm_mul_ps(fx, exp_C1);
+ z = _mm_mul_ps(fx, exp_C2);
+ bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
+ z = _mm_mul_ps(bVal, bVal);
+
+ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
+ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
+ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
+ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
+ y = _mm_add_ps(y, one);
+
+ emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
+
+ pow2n = _mm_castsi128_ps(emm0);
+ cVal = _mm_mul_ps(y, pow2n);
+
+ _mm_store_ps(cPtr, cVal);
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *cPtr++ = powf(*aPtr++, *bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
+
+#endif /* INCLUDED_volk_32f_x2_pow_32f_a_H */
+
+
+
+#ifndef INCLUDED_volk_32f_x2_pow_32f_u_H
+#define INCLUDED_volk_32f_x2_pow_32f_u_H
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_x2_pow_32f_generic(float* cVector, const float* bVector,
+ const float* aVector, unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = powf(*aPtr++, *bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_x2_pow_32f_u_sse4_1(float* cVector, const float* bVector,
+ const float* aVector, unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+ __m128 tmp, fx, mask, pow2n, z, y;
+ __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+ __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+ __m128i bias, exp, emm0, pi32_0x7f;
+
+ one = _mm_set1_ps(1.0);
+ exp_hi = _mm_set1_ps(88.3762626647949);
+ exp_lo = _mm_set1_ps(-88.3762626647949);
+ ln2 = _mm_set1_ps(0.6931471805);
+ log2EF = _mm_set1_ps(1.44269504088896341);
+ half = _mm_set1_ps(0.5);
+ exp_C1 = _mm_set1_ps(0.693359375);
+ exp_C2 = _mm_set1_ps(-2.12194440e-4);
+ pi32_0x7f = _mm_set1_epi32(0x7f);
+
+ exp_p0 = _mm_set1_ps(1.9875691500e-4);
+ exp_p1 = _mm_set1_ps(1.3981999507e-3);
+ exp_p2 = _mm_set1_ps(8.3334519073e-3);
+ exp_p3 = _mm_set1_ps(4.1665795894e-2);
+ exp_p4 = _mm_set1_ps(1.6666665459e-1);
+ exp_p5 = _mm_set1_ps(5.0000001201e-1);
+
+ for(;number < quarterPoints; number++){
+ // First compute the logarithm
+ aVal = _mm_loadu_ps(aPtr);
+ bias = _mm_set1_epi32(127);
+ leadingOne = _mm_set1_ps(1.0f);
+ exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
+ logarithm = _mm_cvtepi32_ps(exp);
+
+ frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
+
+#if POW_POLY_DEGREE == 6
+ mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
+#elif POW_POLY_DEGREE == 5
+ mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+#elif POW_POLY_DEGREE == 4
+ mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+#elif POW_POLY_DEGREE == 3
+ mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+#else
+#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
+#endif
+
+ logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
+ logarithm = _mm_mul_ps(logarithm, ln2);
+
+
+ // Now calculate b*lna
+ bVal = _mm_loadu_ps(bPtr);
+ bVal = _mm_mul_ps(bVal, logarithm);
+
+ // Now compute exp(b*lna)
+ tmp = _mm_setzero_ps();
+
+ bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
+
+ fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
+
+ emm0 = _mm_cvttps_epi32(fx);
+ tmp = _mm_cvtepi32_ps(emm0);
+
+ mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
+ fx = _mm_sub_ps(tmp, mask);
+
+ tmp = _mm_mul_ps(fx, exp_C1);
+ z = _mm_mul_ps(fx, exp_C2);
+ bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
+ z = _mm_mul_ps(bVal, bVal);
+
+ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
+ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
+ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
+ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
+ y = _mm_add_ps(y, one);
+
+ emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
+
+ pow2n = _mm_castsi128_ps(emm0);
+ cVal = _mm_mul_ps(y, pow2n);
+
+ _mm_storeu_ps(cPtr, cVal);
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *cPtr++ = powf(*aPtr++, *bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for unaligned */
+
+#endif /* INCLUDED_volk_32f_x2_pow_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_x2_s32f_interleave_16ic
+ *
+ * \b Overview
+ *
+ * Takes the input vector iBuffer as the real (in-phase) part and the input
+ * vector qBuffer as the imaginary (quadrature) part and combines them into
+ * a complex output vector. The output is scaled by the input scalar
+ * value and converted to a 16-bit short complex number.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_x2_s32f_interleave_16ic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li iBuffer: Input vector of samples for the real part.
+ * \li qBuffer: Input vector of samples for the imaginary part.
+ * \li scalar: The scalar value used to scale the values before converting to shorts.
+ * \li num_points: The number of values in both input vectors.
+ *
+ * \b Outputs
+ * \li complexVector: The output vector of complex numbers.
+ *
+ * \b Example
+ * Generate points around the unit circle and convert to complex integers.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* imag = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* real = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * lv_16sc_t* out = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * real[ii] = 2.f * ((float)ii / (float)N) - 1.f;
+ * imag[ii] = std::sqrt(1.f - real[ii] * real[ii]);
+ * }
+ * // Normalize by smallest delta (0.02 in this example)
+ * float scale = 50.f;
+ *
+ * volk_32f_x2_s32f_interleave_16ic(out, imag, real, scale, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %i + %ij\n", ii, std::real(out[ii]), std::imag(out[ii]));
+ * }
+ *
+ * volk_free(imag);
+ * volk_free(real);
+ * volk_free(out);
+ * \endcode
+ */
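+
+/*
+ * Implementation note (a hedged scalar sketch, mirroring the generic kernel):
+ * each output sample is the scaled pair of inputs truncated to int16_t,
+ *
+ *   int16_t* out16 = (int16_t*)complexVector;
+ *   out16[2*n]     = (int16_t)(iBuffer[n] * scalar);  // real part
+ *   out16[2*n + 1] = (int16_t)(qBuffer[n] * scalar);  // imaginary part
+ *
+ * Note that the SSE2 and AVX2 protokernels convert through
+ * _mm_packs_epi32 / _mm256_packs_epi32, which saturate to [-32768, 32767],
+ * while the generic and tail paths use a plain C cast, so out-of-range
+ * products may differ between implementations.
+ */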
+
+#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H
+#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void
+volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVector, const float* iBuffer,
+ const float* qBuffer, const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 iValue, qValue, cplxValue1, cplxValue2;
+ __m128i intValue1, intValue2;
+
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+ for(;number < quarterPoints; number++){
+ iValue = _mm_load_ps(iBufferPtr);
+ qValue = _mm_load_ps(qBufferPtr);
+
+ // Interleaves the lower two values in the i and q variables into one buffer
+ cplxValue1 = _mm_unpacklo_ps(iValue, qValue);
+ cplxValue1 = _mm_mul_ps(cplxValue1, vScalar);
+
+ // Interleaves the upper two values in the i and q variables into one buffer
+ cplxValue2 = _mm_unpackhi_ps(iValue, qValue);
+ cplxValue2 = _mm_mul_ps(cplxValue2, vScalar);
+
+ intValue1 = _mm_cvtps_epi32(cplxValue1);
+ intValue2 = _mm_cvtps_epi32(cplxValue2);
+
+ intValue1 = _mm_packs_epi32(intValue1, intValue2);
+
+ _mm_store_si128((__m128i*)complexVectorPtr, intValue1);
+ complexVectorPtr += 8;
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ complexVectorPtr = (int16_t*)(&complexVector[number]);
+ for(; number < num_points; number++){
+ *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar);
+ *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVector, const float* iBuffer,
+ const float* qBuffer, const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 iValue, qValue, cplxValue;
+
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+
+ for(;number < quarterPoints; number++){
+ iValue = _mm_load_ps(iBufferPtr);
+ qValue = _mm_load_ps(qBufferPtr);
+
+ // Interleaves the lower two values in the i and q variables into one buffer
+ cplxValue = _mm_unpacklo_ps(iValue, qValue);
+ cplxValue = _mm_mul_ps(cplxValue, vScalar);
+
+ _mm_store_ps(floatBuffer, cplxValue);
+
+ *complexVectorPtr++ = (int16_t)(floatBuffer[0]);
+ *complexVectorPtr++ = (int16_t)(floatBuffer[1]);
+ *complexVectorPtr++ = (int16_t)(floatBuffer[2]);
+ *complexVectorPtr++ = (int16_t)(floatBuffer[3]);
+
+ // Interleaves the upper two values in the i and q variables into one buffer
+ cplxValue = _mm_unpackhi_ps(iValue, qValue);
+ cplxValue = _mm_mul_ps(cplxValue, vScalar);
+
+ _mm_store_ps(floatBuffer, cplxValue);
+
+ *complexVectorPtr++ = (int16_t)(floatBuffer[0]);
+ *complexVectorPtr++ = (int16_t)(floatBuffer[1]);
+ *complexVectorPtr++ = (int16_t)(floatBuffer[2]);
+ *complexVectorPtr++ = (int16_t)(floatBuffer[3]);
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ complexVectorPtr = (int16_t*)(&complexVector[number]);
+ for(; number < num_points; number++){
+ *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar);
+ *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector, const float* iBuffer,
+ const float* qBuffer, const float scalar, unsigned int num_points)
+{
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar);
+ *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H */
+
+#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H
+#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void
+volk_32f_x2_s32f_interleave_16ic_u_avx2(lv_16sc_t* complexVector, const float* iBuffer,
+ const float* qBuffer, const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+
+ __m256 vScalar = _mm256_set1_ps(scalar);
+
+ const unsigned int eighthPoints = num_points / 8;
+
+ __m256 iValue, qValue, cplxValue1, cplxValue2;
+ __m256i intValue1, intValue2;
+
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+ for(;number < eighthPoints; number++){
+ iValue = _mm256_loadu_ps(iBufferPtr);
+ qValue = _mm256_loadu_ps(qBufferPtr);
+
+ // Interleaves the lower two values in the i and q variables into one buffer
+ cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
+ cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
+
+ // Interleaves the upper two values in the i and q variables into one buffer
+ cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
+ cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
+
+ intValue1 = _mm256_cvtps_epi32(cplxValue1);
+ intValue2 = _mm256_cvtps_epi32(cplxValue2);
+
+ intValue1 = _mm256_packs_epi32(intValue1, intValue2);
+
+ _mm256_storeu_si256((__m256i*)complexVectorPtr, intValue1);
+ complexVectorPtr += 16;
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ complexVectorPtr = (int16_t*)(&complexVector[number]);
+ for(; number < num_points; number++){
+ *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar);
+ *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_x2_subtract_32f
+ *
+ * \b Overview
+ *
+ * Subtracts values in bVector from values in aVector.
+ *
+ * c[i] = a[i] - b[i]
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_x2_subtract_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: The initial vector.
+ * \li bVector: The vector to be subtracted.
+ * \li num_points: The number of values in both input vectors.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ * Subtract a decreasing vector from an increasing vector.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* decreasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = (float)ii;
+ * decreasing[ii] = 10.f - (float)ii;
+ * }
+ *
+ * volk_32f_x2_subtract_32f(out, increasing, decreasing, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %1.2f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(decreasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_x2_subtract_32f_a_H
+#define INCLUDED_volk_32f_x2_subtract_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_x2_subtract_32f_a_sse(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m128 aVal, bVal, cVal;
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_sub_ps(aVal, bVal);
+
+ _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *cPtr++ = (*aPtr++) - (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_x2_subtract_32f_generic(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++) - (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_32f_x2_subtract_32f_neon(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+
+ float32x4_t a_vec, b_vec, c_vec;
+
+ for(number = 0; number < quarter_points; number++){
+ a_vec = vld1q_f32(aPtr);
+ b_vec = vld1q_f32(bPtr);
+ c_vec = vsubq_f32(a_vec, b_vec);
+ vst1q_f32(cPtr, c_vec);
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ for(number = quarter_points * 4; number < num_points; number++){
+ *cPtr++ = (*aPtr++) - (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_ORC
+extern void
+volk_32f_x2_subtract_32f_a_orc_impl(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points);
+
+static inline void
+volk_32f_x2_subtract_32f_u_orc(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ volk_32f_x2_subtract_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_x2_subtract_32f_a_H */
+
+
+#ifndef INCLUDED_volk_32f_x2_subtract_32f_u_H
+#define INCLUDED_volk_32f_x2_subtract_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_x2_subtract_32f_u_avx(float* cVector, const float* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m256 aVal, bVal, cVal;
+ for(;number < eighthPoints; number++){
+
+ aVal = _mm256_loadu_ps(aPtr);
+ bVal = _mm256_loadu_ps(bPtr);
+
+ cVal = _mm256_sub_ps(aVal, bVal);
+
+ _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(;number < num_points; number++){
+ *cPtr++ = (*aPtr++) - (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#endif /* INCLUDED_volk_32f_x2_subtract_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_x3_sum_of_poly_32f
+ *
+ * \b Overview
+ *
+ * Calculates the unscaled area under a fourth order polynomial using the
+ * rectangular method. The result is the sum of y-values. To get the area,
+ * multiply by the rectangle/bin width.
+ *
+ * Expressed as a formula, this function calculates
+ * \f$ \sum f(x) = \sum (c_0 + c_1 \cdot x + c_2 \cdot x^2 + c_3 \cdot x^3 + c_4 \cdot x^4)\f$
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_x3_sum_of_poly_32f(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li src0: x values
+ * \li center_point_array: polynomial coefficients in order {c1, c2, c3, c4, c0}
+ * \li cutoff: the minimum x value to use (will clamp to cutoff if input < cutoff)
+ * \li num_points: The number of values in both input vectors.
+ *
+ * \b Outputs
+ * \li target: The sum of y-values generated by the polynomial.
+ *
+ * \b Example
+ * The following estimates \f$\int_0^\pi e^x dx\f$ by using the Taylor expansion
+ * centered around \f$x=1.5\f$ (the inputs below are pre-shifted by \f$-1.5\f$),
+ * \f$ e^x \approx e^{1.5} \cdot (1 + x + \frac{1}{2} x^2 + \frac{1}{6}x^3 + \frac{1}{24}x^4) \f$
+ * \code
+ * int npoints = 4096;
+ * float* coefficients = (float*)volk_malloc(sizeof(float) * 5, volk_get_alignment());
+ * float* input = (float*)volk_malloc(sizeof(float) * npoints, volk_get_alignment());
+ * float* result = (float*)volk_malloc(sizeof(float), volk_get_alignment());
+ * float* cutoff = (float*)volk_malloc(sizeof(float), volk_get_alignment());
+ * // load precomputed Taylor series coefficients
+ * coefficients[0] = 4.48168907033806f; // c1
+ * coefficients[1] = coefficients[0] * 0.5f; // c2
+ * coefficients[2] = coefficients[0] * 1.0f/6.0f; // c3
+ * coefficients[3] = coefficients[0] * 1.0f/24.0f; // c4
+ * coefficients[4] = coefficients[0]; // c0
+ * *cutoff = -2.0;
+ * *result = 0.0f;
+ * // generate uniform input data
+ * float dx = (float)M_PI/ (float)npoints;
+ * for(unsigned int ii=0; ii < npoints; ++ii){
+ * input[ii] = dx * (float)ii - 1.5f;
+ * }
+ * volk_32f_x3_sum_of_poly_32f(result, input, coefficients, cutoff, npoints);
+ * // multiply by bin width to get approximate area
+ * std::cout << "result is " << *result * (input[1]-input[0]) << std::endl;
+ * volk_free(coefficients);
+ * volk_free(input);
+ * volk_free(result);
+ * volk_free(cutoff);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
+#define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
+
+#include<inttypes.h>
+#include<stdio.h>
+#include<volk/volk_complex.h>
+
+#ifndef MAX
+#define MAX(X,Y) ((X) > (Y)?(X):(Y))
+#endif
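+
+/*
+ * Implementation note (a sketch of the per-point update, matching the generic
+ * kernel): with cpa = center_point_array ordered {c1, c2, c3, c4, c0}, each
+ * protokernel accumulates the x..x^4 terms per point and adds the constant
+ * term once per point after the loop,
+ *
+ *   float x = MAX(src0[n], *cutoff);
+ *   acc += cpa[0]*x + cpa[1]*x*x + cpa[2]*x*x*x + cpa[3]*x*x*x*x;
+ *   // ...after all points:
+ *   *target = acc + (float)num_points * cpa[4];   // constant term c0
+ */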
+
+#ifdef LV_HAVE_SSE3
+#include<xmmintrin.h>
+#include<pmmintrin.h>
+
+static inline void
+volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array,
+ float* cutoff, unsigned int num_points)
+{
+ const unsigned int num_bytes = num_points*4;
+
+ float result = 0.0;
+ float fst = 0.0;
+ float sq = 0.0;
+ float thrd = 0.0;
+ float frth = 0.0;
+ //float fith = 0.0;
+
+ __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12;
+
+ xmm9 = _mm_setzero_ps();
+ xmm1 = _mm_setzero_ps();
+
+ xmm0 = _mm_load1_ps(&center_point_array[0]);
+ xmm6 = _mm_load1_ps(&center_point_array[1]);
+ xmm7 = _mm_load1_ps(&center_point_array[2]);
+ xmm8 = _mm_load1_ps(&center_point_array[3]);
+ //xmm11 = _mm_load1_ps(&center_point_array[4]);
+ xmm10 = _mm_load1_ps(cutoff);
+
+ int bound = num_bytes >> 4;
+ int leftovers = (num_bytes >> 2) & 3;
+ int i = 0;
+
+ for(; i < bound; ++i) {
+ xmm2 = _mm_load_ps(src0);
+ xmm2 = _mm_max_ps(xmm10, xmm2);
+ xmm3 = _mm_mul_ps(xmm2, xmm2);
+ xmm4 = _mm_mul_ps(xmm2, xmm3);
+ xmm5 = _mm_mul_ps(xmm3, xmm3);
+ //xmm12 = _mm_mul_ps(xmm3, xmm4);
+
+ xmm2 = _mm_mul_ps(xmm2, xmm0);
+ xmm3 = _mm_mul_ps(xmm3, xmm6);
+ xmm4 = _mm_mul_ps(xmm4, xmm7);
+ xmm5 = _mm_mul_ps(xmm5, xmm8);
+ //xmm12 = _mm_mul_ps(xmm12, xmm11);
+
+ xmm2 = _mm_add_ps(xmm2, xmm3);
+ xmm3 = _mm_add_ps(xmm4, xmm5);
+
+ src0 += 4;
+
+ xmm9 = _mm_add_ps(xmm2, xmm9);
+
+ xmm1 = _mm_add_ps(xmm3, xmm1);
+
+ //xmm9 = _mm_add_ps(xmm12, xmm9);
+ }
+
+ xmm2 = _mm_hadd_ps(xmm9, xmm1);
+ xmm3 = _mm_hadd_ps(xmm2, xmm2);
+ xmm4 = _mm_hadd_ps(xmm3, xmm3);
+
+ _mm_store_ss(&result, xmm4);
+
+
+
+ for(i = 0; i < leftovers; ++i) {
+ fst = src0[i];
+ fst = MAX(fst, *cutoff);
+ sq = fst * fst;
+ thrd = fst * sq;
+ frth = sq * sq;
+ //fith = sq * thrd;
+
+ result += (center_point_array[0] * fst +
+ center_point_array[1] * sq +
+ center_point_array[2] * thrd +
+ center_point_array[3] * frth);// +
+ //center_point_array[4] * fith);
+ }
+
+ result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5];
+
+ target[0] = result;
+}
+
+
+#endif /*LV_HAVE_SSE3*/
+
+
+#ifdef LV_HAVE_AVX
+#include<immintrin.h>
+
+static inline void
+volk_32f_x3_sum_of_poly_32f_a_avx(float* target, float* src0, float* center_point_array,
+ float* cutoff, unsigned int num_points)
+{
+ const unsigned int eighth_points = num_points / 8;
+ float fst = 0.0;
+ float sq = 0.0;
+ float thrd = 0.0;
+ float frth = 0.0;
+
+ __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
+ __m256 target_vec;
+ __m256 x_to_1, x_to_2, x_to_3, x_to_4;
+
+ cpa0 = _mm256_set1_ps(center_point_array[0]);
+ cpa1 = _mm256_set1_ps(center_point_array[1]);
+ cpa2 = _mm256_set1_ps(center_point_array[2]);
+ cpa3 = _mm256_set1_ps(center_point_array[3]);
+ cutoff_vec = _mm256_set1_ps(*cutoff);
+ target_vec = _mm256_setzero_ps();
+
+ unsigned int i;
+
+ for(i = 0; i < eighth_points; ++i) {
+ x_to_1 = _mm256_load_ps(src0);
+ x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
+ x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
+ x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
+ // x^1 * x^3 is slightly faster than x^2 * x^2
+ x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
+
+ x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
+ x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
+ x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
+ x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
+
+ x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
+ x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
+ // this is slightly faster than result += (x_to_1 + x_to_3)
+ target_vec = _mm256_add_ps(x_to_1, target_vec);
+ target_vec = _mm256_add_ps(x_to_3, target_vec);
+
+ src0 += 8;
+ }
+
+ // the hadd for vector reduction has very very slight impact @ 50k iters
+ __VOLK_ATTR_ALIGNED(32) float temp_results[8];
+ target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
+ _mm256_store_ps(temp_results, target_vec);
+ *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
+
+
+ for(i = eighth_points*8; i < num_points; ++i) {
+ fst = *(src0++);
+ fst = MAX(fst, *cutoff);
+ sq = fst * fst;
+ thrd = fst * sq;
+ frth = sq * sq;
+
+ *target += (center_point_array[0] * fst +
+ center_point_array[1] * sq +
+ center_point_array[2] * thrd +
+ center_point_array[3] * frth);
+ }
+
+ *target += ((float)(num_points)) * center_point_array[4];
+}
+#endif // LV_HAVE_AVX
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_x3_sum_of_poly_32f_generic(float* target, float* src0, float* center_point_array,
+ float* cutoff, unsigned int num_points)
+{
+ const unsigned int num_bytes = num_points*4;
+
+ float result = 0.0;
+ float fst = 0.0;
+ float sq = 0.0;
+ float thrd = 0.0;
+ float frth = 0.0;
+ //float fith = 0.0;
+
+ unsigned int i = 0;
+
+ for(; i < num_bytes >> 2; ++i) {
+ fst = src0[i];
+ fst = MAX(fst, *cutoff);
+
+ sq = fst * fst;
+ thrd = fst * sq;
+ frth = sq * sq;
+ //fith = sq * thrd;
+
+ result += (center_point_array[0] * fst +
+ center_point_array[1] * sq +
+ center_point_array[2] * thrd +
+ center_point_array[3] * frth); //+
+ //center_point_array[4] * fith);
+ /*printf("%f12...%d\n", (center_point_array[0] * fst +
+ center_point_array[1] * sq +
+ center_point_array[2] * thrd +
+ center_point_array[3] * frth) +
+ //center_point_array[4] * fith) +
+ (center_point_array[4]), i);
+ */
+ }
+
+ result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]);
+
+ *target = result;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_AVX
+#include<immintrin.h>
+
+static inline void
+volk_32f_x3_sum_of_poly_32f_u_avx(float* target, float* src0, float* center_point_array,
+ float* cutoff, unsigned int num_points)
+{
+ const unsigned int eighth_points = num_points / 8;
+ float fst = 0.0;
+ float sq = 0.0;
+ float thrd = 0.0;
+ float frth = 0.0;
+
+ __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
+ __m256 target_vec;
+ __m256 x_to_1, x_to_2, x_to_3, x_to_4;
+
+ cpa0 = _mm256_set1_ps(center_point_array[0]);
+ cpa1 = _mm256_set1_ps(center_point_array[1]);
+ cpa2 = _mm256_set1_ps(center_point_array[2]);
+ cpa3 = _mm256_set1_ps(center_point_array[3]);
+ cutoff_vec = _mm256_set1_ps(*cutoff);
+ target_vec = _mm256_setzero_ps();
+
+ unsigned int i;
+
+ for(i = 0; i < eighth_points; ++i) {
+ x_to_1 = _mm256_loadu_ps(src0);
+ x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
+ x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
+ x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
+ // x^1 * x^3 is slightly faster than x^2 * x^2
+ x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
+
+ x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
+ x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
+ x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
+ x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
+
+ x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
+ x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
+ // this is slightly faster than result += (x_to_1 + x_to_3)
+ target_vec = _mm256_add_ps(x_to_1, target_vec);
+ target_vec = _mm256_add_ps(x_to_3, target_vec);
+
+ src0 += 8;
+ }
+
+ // the hadd for vector reduction has very very slight impact @ 50k iters
+ __VOLK_ATTR_ALIGNED(32) float temp_results[8];
+ target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
+ _mm256_store_ps(temp_results, target_vec);
+ *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
+
+
+ for(i = eighth_points*8; i < num_points; ++i) {
+ fst = *(src0++);
+ fst = MAX(fst, *cutoff);
+ sq = fst * fst;
+ thrd = fst * sq;
+ frth = sq * sq;
+
+ *target += (center_point_array[0] * fst +
+ center_point_array[1] * sq +
+ center_point_array[2] * thrd +
+ center_point_array[3] * frth);
+ }
+
+ *target += ((float)(num_points)) * center_point_array[4];
+}
+#endif // LV_HAVE_AVX
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target, float* __restrict src0,
+ float* __restrict center_point_array,
+ float* __restrict cutoff, unsigned int num_points)
+{
+ unsigned int i;
+ float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
+
+ float32x2_t x_to_1, x_to_2, x_to_3, x_to_4;
+ float32x2_t cutoff_vector;
+ float32x2x2_t x_low, x_high;
+ float32x4_t x_qvector, c_qvector, cpa_qvector;
+ float accumulator;
+ float res_accumulators[4];
+
+ c_qvector = vld1q_f32( zero );
+ // load the cutoff in to a vector
+ cutoff_vector = vdup_n_f32( *cutoff );
+ // ... center point array
+ cpa_qvector = vld1q_f32( center_point_array );
+
+ for(i=0; i < num_points; ++i) {
+ // load x (src0)
+ x_to_1 = vdup_n_f32( *src0++ );
+
+ // Get a vector of max(src0, cutoff)
+ x_to_1 = vmax_f32(x_to_1, cutoff_vector ); // x^1
+ x_to_2 = vmul_f32(x_to_1, x_to_1); // x^2
+ x_to_3 = vmul_f32(x_to_2, x_to_1); // x^3
+ x_to_4 = vmul_f32(x_to_3, x_to_1); // x^4
+ // zip up doubles to interleave
+ x_low = vzip_f32(x_to_1, x_to_2); // [x^2 | x^1 || x^2 | x^1]
+ x_high = vzip_f32(x_to_3, x_to_4); // [x^4 | x^3 || x^4 | x^3]
+ // float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
+ x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]);
+ // now we finally have [x^4 | x^3 | x^2 | x] !
+
+ c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector);
+
+ }
+ // there should be better vector reduction techniques
+ vst1q_f32(res_accumulators, c_qvector );
+ accumulator = res_accumulators[0] + res_accumulators[1] +
+ res_accumulators[2] + res_accumulators[3];
+
+ *target = accumulator + center_point_array[4] * (float)num_points;
+}
+
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_NEON
+
+static inline void
+volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target, float* __restrict src0,
+ float* __restrict center_point_array,
+ float* __restrict cutoff, unsigned int num_points)
+{
+ unsigned int i;
+ float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
+
+ float accumulator;
+
+ float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec;
+ accumulator1_vec = vld1q_f32(zero);
+ accumulator2_vec = vld1q_f32(zero);
+ accumulator3_vec = vld1q_f32(zero);
+ accumulator4_vec = vld1q_f32(zero);
+ float32x4_t x_to_1, x_to_2, x_to_3, x_to_4;
+ float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3;
+
+ // load the cutoff in to a vector
+ cutoff_vector = vdupq_n_f32( *cutoff );
+ // ... center point array
+ cpa_0 = vdupq_n_f32(center_point_array[0]);
+ cpa_1 = vdupq_n_f32(center_point_array[1]);
+ cpa_2 = vdupq_n_f32(center_point_array[2]);
+ cpa_3 = vdupq_n_f32(center_point_array[3]);
+
+ // nathan is not sure why this is slower *and* wrong compared to neonvertfma
+ for(i=0; i < num_points/4; ++i) {
+ // load x
+ x_to_1 = vld1q_f32( src0 );
+
+ // Get a vector of max(src0, cutoff)
+ x_to_1 = vmaxq_f32(x_to_1, cutoff_vector ); // x^1
+ x_to_2 = vmulq_f32(x_to_1, x_to_1); // x^2
+ x_to_3 = vmulq_f32(x_to_2, x_to_1); // x^3
+ x_to_4 = vmulq_f32(x_to_3, x_to_1); // x^4
+ x_to_1 = vmulq_f32(x_to_1, cpa_0);
+ x_to_2 = vmulq_f32(x_to_2, cpa_1);
+ x_to_3 = vmulq_f32(x_to_3, cpa_2);
+ x_to_4 = vmulq_f32(x_to_4, cpa_3);
+ accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1);
+ accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2);
+ accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3);
+ accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4);
+
+ src0 += 4;
+ }
+ accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec);
+ accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec);
+ accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec);
+
+ __VOLK_ATTR_ALIGNED(32) float res_accumulators[4];
+ vst1q_f32(res_accumulators, accumulator1_vec );
+ accumulator = res_accumulators[0] + res_accumulators[1] +
+ res_accumulators[2] + res_accumulators[3];
+
+ float fst = 0.0;
+ float sq = 0.0;
+ float thrd = 0.0;
+ float frth = 0.0;
+
+ for(i = (num_points / 4) * 4; i < num_points; ++i) {
+ fst = *src0++;
+ fst = MAX(fst, *cutoff);
+
+ sq = fst * fst;
+ thrd = fst * sq;
+ frth = sq * sq;
+ //fith = sq * thrd;
+
+ accumulator += (center_point_array[0] * fst +
+ center_point_array[1] * sq +
+ center_point_array[2] * thrd +
+ center_point_array[3] * frth); //+
+ }
+
+ *target = accumulator + center_point_array[4] * (float)num_points;
+}
+
+#endif /* LV_HAVE_NEON */
+
+#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32fc_32f_dot_prod_32fc
+ *
+ * \b Overview
+ *
+ * This block computes the dot product (or inner product) between two
+ * vectors, the \p input and \p taps vectors. Given a set of \p
+ * num_points taps, the result is the sum of products between the two
+ * vectors. The result is a single value stored in the \p result
+ * address and will be complex.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_32f_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li input: vector of complex samples
+ * \li taps: floating point taps
+ * \li num_points: number of samples in both \p input and \p taps
+ *
+ * \b Outputs
+ * \li result: pointer to a complex value to hold the dot product result.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * lv_32fc_t y;
+ * lv_32fc_t *x = (lv_32fc_t*)volk_malloc(N*sizeof(lv_32fc_t), volk_get_alignment());
+ * float *t = (float*)volk_malloc(N*sizeof(float), volk_get_alignment());
+ *
+ * <populate x and t with some values>
+ *
+ * volk_32fc_32f_dot_prod_32fc(&y, x, t, N);
+ *
+ * volk_free(x);
+ * volk_free(t);
+ * \endcode
+ */
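+
+/*
+ * Implementation note (a rough scalar sketch, mirroring the generic kernel):
+ * the real-valued taps weight the real and imaginary parts independently,
+ *
+ *   float re = 0.f, im = 0.f;
+ *   for (unsigned int n = 0; n < num_points; n++) {
+ *     re += lv_creal(input[n]) * taps[n];
+ *     im += lv_cimag(input[n]) * taps[n];
+ *   }
+ *   *result = lv_cmake(re, im);
+ *
+ * lv_creal/lv_cimag/lv_cmake are the complex accessors from
+ * <volk/volk_complex.h>.
+ */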
+
+#ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
+#define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
+
+#include <volk/volk_common.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points) {
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const float* aPtr = (float*)input;
+ const float* bPtr= taps;
+ unsigned int number = 0;
+
+ *realpt = 0;
+ *imagpt = 0;
+
+ for(number = 0; number < num_points; number++){
+ *realpt += ((*aPtr++) * (*bPtr));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+
+static inline void volk_32fc_32f_dot_prod_32fc_a_avx( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const float* aPtr = (float*)input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val, a2Val, a3Val;
+ __m256 b0Val, b1Val, b2Val, b3Val;
+ __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
+ __m256 c0Val, c1Val, c2Val, c3Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 dotProdVal2 = _mm256_setzero_ps();
+ __m256 dotProdVal3 = _mm256_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+
+ a0Val = _mm256_load_ps(aPtr);
+ a1Val = _mm256_load_ps(aPtr+8);
+ a2Val = _mm256_load_ps(aPtr+16);
+ a3Val = _mm256_load_ps(aPtr+24);
+
+ x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
+ x1Val = _mm256_load_ps(bPtr+8);
+ x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
+ x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
+ x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
+ x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
+
+ // TODO: it may be possible to rearrange swizzling to better pipeline data
+ b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
+ b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
+ b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
+ b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
+
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
+ c2Val = _mm256_mul_ps(a2Val, b2Val);
+ c3Val = _mm256_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 32;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+ *realpt += dotProductVector[4];
+ *imagpt += dotProductVector[5];
+ *realpt += dotProductVector[6];
+ *imagpt += dotProductVector[7];
+
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
+ *realpt += ((*aPtr++) * (*bPtr));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_AVX*/
+
+
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32fc_32f_dot_prod_32fc_a_sse( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 8;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const float* aPtr = (float*)input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 x0Val, x1Val, x2Val, x3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+
+ a0Val = _mm_load_ps(aPtr);
+ a1Val = _mm_load_ps(aPtr+4);
+ a2Val = _mm_load_ps(aPtr+8);
+ a3Val = _mm_load_ps(aPtr+12);
+
+ x0Val = _mm_load_ps(bPtr);
+ x1Val = _mm_load_ps(bPtr);
+ x2Val = _mm_load_ps(bPtr+4);
+ x3Val = _mm_load_ps(bPtr+4);
+ b0Val = _mm_unpacklo_ps(x0Val, x1Val);
+ b1Val = _mm_unpackhi_ps(x0Val, x1Val);
+ b2Val = _mm_unpacklo_ps(x2Val, x3Val);
+ b3Val = _mm_unpackhi_ps(x2Val, x3Val);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 8;
+ }
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+ _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+
+ number = sixteenthPoints*8;
+ for(;number < num_points; number++){
+ *realpt += ((*aPtr++) * (*bPtr));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_SSE*/
+
+
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+
+static inline void volk_32fc_32f_dot_prod_32fc_u_avx( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const float* aPtr = (float*)input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val, a2Val, a3Val;
+ __m256 b0Val, b1Val, b2Val, b3Val;
+ __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
+ __m256 c0Val, c1Val, c2Val, c3Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 dotProdVal2 = _mm256_setzero_ps();
+ __m256 dotProdVal3 = _mm256_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+
+ a0Val = _mm256_loadu_ps(aPtr);
+ a1Val = _mm256_loadu_ps(aPtr+8);
+ a2Val = _mm256_loadu_ps(aPtr+16);
+ a3Val = _mm256_loadu_ps(aPtr+24);
+
+ x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
+ x1Val = _mm256_loadu_ps(bPtr+8);
+ x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
+ x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
+ x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
+ x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
+
+ // TODO: it may be possible to rearrange swizzling to better pipeline data
+ b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
+ b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
+ b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
+ b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
+
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
+ c2Val = _mm256_mul_ps(a2Val, b2Val);
+ c3Val = _mm256_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 32;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+ *realpt += dotProductVector[4];
+ *imagpt += dotProductVector[5];
+ *realpt += dotProductVector[6];
+ *imagpt += dotProductVector[7];
+
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
+ *realpt += ((*aPtr++) * (*bPtr));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
+}
+#endif /*LV_HAVE_AVX*/
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32fc_32f_dot_prod_32fc_neon_unroll ( lv_32fc_t* __restrict result, const lv_32fc_t* __restrict input, const float* __restrict taps, unsigned int num_points) {
+
+ unsigned int number;
+ const unsigned int quarterPoints = num_points / 8;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const float* inputPtr = (float*)input;
+ const float* tapsPtr = taps;
+ float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
+ float accVector_real[4];
+ float accVector_imag[4];
+
+ float32x4x2_t inputVector0, inputVector1;
+ float32x4_t tapsVector0, tapsVector1;
+ float32x4_t tmp_real0, tmp_imag0;
+ float32x4_t tmp_real1, tmp_imag1;
+ float32x4_t real_accumulator0, imag_accumulator0;
+ float32x4_t real_accumulator1, imag_accumulator1;
+
+ // zero out accumulators
+ // take a *float, return float32x4_t
+ real_accumulator0 = vld1q_f32( zero );
+ imag_accumulator0 = vld1q_f32( zero );
+ real_accumulator1 = vld1q_f32( zero );
+ imag_accumulator1 = vld1q_f32( zero );
+
+ for(number=0 ;number < quarterPoints; number++){
+ // load eight taps, four per vector
+ tapsVector0 = vld1q_f32(tapsPtr );
+ tapsVector1 = vld1q_f32(tapsPtr+4 );
+
+ // de-interleave complex samples: val[0] holds the real parts, val[1] the imaginary parts
+ inputVector0 = vld2q_f32(inputPtr );
+ inputVector1 = vld2q_f32(inputPtr+8 );
+ // inputVector is now a struct of two vectors, 0th is real, 1st is imag
+
+ tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]);
+ tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]);
+
+ tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]);
+ tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]);
+
+ real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0);
+ imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0);
+
+ real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1);
+ imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1);
+
+ tapsPtr += 8;
+ inputPtr += 16;
+ }
+
+ real_accumulator0 = vaddq_f32( real_accumulator0, real_accumulator1);
+ imag_accumulator0 = vaddq_f32( imag_accumulator0, imag_accumulator1);
+ // void vst1q_f32( float32_t * ptr, float32x4_t val);
+ // store results back to a complex (array of 2 floats)
+ vst1q_f32(accVector_real, real_accumulator0);
+ vst1q_f32(accVector_imag, imag_accumulator0);
+ *realpt = accVector_real[0] + accVector_real[1] +
+ accVector_real[2] + accVector_real[3] ;
+
+ *imagpt = accVector_imag[0] + accVector_imag[1] +
+ accVector_imag[2] + accVector_imag[3] ;
+
+ // clean up the remainder
+ for(number=quarterPoints*8; number < num_points; number++){
+ *realpt += ((*inputPtr++) * (*tapsPtr));
+ *imagpt += ((*inputPtr++) * (*tapsPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_NEON*/
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32fc_32f_dot_prod_32fc_a_neon ( lv_32fc_t* __restrict result, const lv_32fc_t* __restrict input, const float* __restrict taps, unsigned int num_points) {
+
+ unsigned int number;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const float* inputPtr = (float*)input;
+ const float* tapsPtr = taps;
+ float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
+ float accVector_real[4];
+ float accVector_imag[4];
+
+ float32x4x2_t inputVector;
+ float32x4_t tapsVector;
+ float32x4_t tmp_real, tmp_imag;
+ float32x4_t real_accumulator, imag_accumulator;
+
+
+ // zero out accumulators
+ // take a *float, return float32x4_t
+ real_accumulator = vld1q_f32( zero );
+ imag_accumulator = vld1q_f32( zero );
+
+ for(number=0 ;number < quarterPoints; number++){
+ // load four taps ( float32x4_t = vld1q_f32( float32_t const * ptr) )
+ tapsVector = vld1q_f32(tapsPtr );
+
+ // de-interleave complex samples: val[0] holds the real parts, val[1] the imaginary parts
+ inputVector = vld2q_f32(inputPtr );
+
+ tmp_real = vmulq_f32(tapsVector, inputVector.val[0]);
+ tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]);
+
+ real_accumulator = vaddq_f32(real_accumulator, tmp_real);
+ imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag);
+
+
+ tapsPtr += 4;
+ inputPtr += 8;
+
+ }
+
+ // store results back to a complex (array of 2 floats)
+ vst1q_f32(accVector_real, real_accumulator);
+ vst1q_f32(accVector_imag, imag_accumulator);
+ *realpt = accVector_real[0] + accVector_real[1] +
+ accVector_real[2] + accVector_real[3] ;
+
+ *imagpt = accVector_imag[0] + accVector_imag[1] +
+ accVector_imag[2] + accVector_imag[3] ;
+
+ // clean up the remainder
+ for(number=quarterPoints*4; number < num_points; number++){
+ *realpt += ((*inputPtr++) * (*tapsPtr));
+ *imagpt += ((*inputPtr++) * (*tapsPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_NEON*/
+
+#ifdef LV_HAVE_NEON
+extern void volk_32fc_32f_dot_prod_32fc_a_neonasm ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points);
+#endif /*LV_HAVE_NEON*/
+
+#ifdef LV_HAVE_NEON
+extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points);
+#endif /*LV_HAVE_NEON*/
+
+#ifdef LV_HAVE_NEON
+extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points);
+#endif /*LV_HAVE_NEON*/
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32fc_32f_dot_prod_32fc_u_sse( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 8;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const float* aPtr = (float*)input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 x0Val, x1Val, x2Val, x3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+
+ a0Val = _mm_loadu_ps(aPtr);
+ a1Val = _mm_loadu_ps(aPtr+4);
+ a2Val = _mm_loadu_ps(aPtr+8);
+ a3Val = _mm_loadu_ps(aPtr+12);
+
+ x0Val = _mm_loadu_ps(bPtr);
+ x1Val = _mm_loadu_ps(bPtr);
+ x2Val = _mm_loadu_ps(bPtr+4);
+ x3Val = _mm_loadu_ps(bPtr+4);
+ b0Val = _mm_unpacklo_ps(x0Val, x1Val);
+ b1Val = _mm_unpackhi_ps(x0Val, x1Val);
+ b2Val = _mm_unpacklo_ps(x2Val, x3Val);
+ b3Val = _mm_unpackhi_ps(x2Val, x3Val);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 8;
+ }
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+ _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+
+ number = sixteenthPoints*8;
+ for(;number < num_points; number++){
+ *realpt += ((*aPtr++) * (*bPtr));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_SSE*/
+
+
+#endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32fc_32f_multiply_32fc
+ *
+ * \b Overview
+ *
+ * Multiplies a complex vector by a floating point vector and returns
+ * the complex result.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_32f_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points);
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of complex floats.
+ * \li bVector: The input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li cVector: The output vector of complex floats.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* x = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * float* t = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ *
+ * // fill x with complex samples and t with real gains, then:
+ * volk_32fc_32f_multiply_32fc(out, x, t, N);
+ *
+ * volk_free(x);
+ * volk_free(t);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_32f_multiply_32fc_a_H
+#define INCLUDED_volk_32fc_32f_multiply_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m256 aVal1, aVal2, bVal, bVal1, bVal2, cVal1, cVal2;
+
+ __m256i permute_mask = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
+
+ for(;number < eighthPoints; number++){
+
+ aVal1 = _mm256_load_ps((float *)aPtr);
+ aPtr += 4;
+
+ aVal2 = _mm256_load_ps((float *)aPtr);
+ aPtr += 4;
+
+ bVal = _mm256_load_ps(bPtr); // b0|b1|b2|b3|b4|b5|b6|b7
+ bPtr += 8;
+
+ bVal1 = _mm256_permute2f128_ps(bVal, bVal, 0x00); // b0|b1|b2|b3|b0|b1|b2|b3
+ bVal2 = _mm256_permute2f128_ps(bVal, bVal, 0x11); // b4|b5|b6|b7|b4|b5|b6|b7
+
+ bVal1 = _mm256_permutevar_ps(bVal1, permute_mask); // b0|b0|b1|b1|b2|b2|b3|b3
+ bVal2 = _mm256_permutevar_ps(bVal2, permute_mask); // b4|b4|b5|b5|b6|b6|b7|b7
+
+ cVal1 = _mm256_mul_ps(aVal1, bVal1);
+ cVal2 = _mm256_mul_ps(aVal2, bVal2);
+
+ _mm256_store_ps((float*)cPtr,cVal1); // Store the results back into the C container
+ cPtr += 4;
+
+ _mm256_store_ps((float*)cPtr,cVal2); // Store the results back into the C container
+ cPtr += 4;
+ }
+
+ number = eighthPoints * 8;
+ for(;number < num_points; ++number){
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal;
+ for(;number < quarterPoints; number++){
+
+ aVal1 = _mm_load_ps((const float*)aPtr);
+ aPtr += 2;
+
+ aVal2 = _mm_load_ps((const float*)aPtr);
+ aPtr += 2;
+
+ bVal = _mm_load_ps(bPtr);
+ bPtr += 4;
+
+ bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1,1,0,0));
+ bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3,3,2,2));
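+    // bVal1 = b0|b0|b1|b1 and bVal2 = b2|b2|b3|b3: each tap is duplicated so it
+    // scales both the real and imaginary part of its complex sample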
+
+ cVal = _mm_mul_ps(aVal1, bVal1);
+
+ _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container
+ cPtr += 2;
+
+ cVal = _mm_mul_ps(aVal2, bVal2);
+
+ _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container
+
+ cPtr += 2;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *cPtr++ = (*aPtr++) * (*bPtr);
+ bPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const float* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_32fc_32f_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const float* bPtr= bVector;
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+
+ float32x4x2_t inputVector, outputVector;
+ float32x4_t tapsVector;
+ for(number = 0; number < quarter_points; number++){
+ inputVector = vld2q_f32((float*)aPtr);
+ tapsVector = vld1q_f32(bPtr);
+
+ outputVector.val[0] = vmulq_f32(inputVector.val[0], tapsVector);
+ outputVector.val[1] = vmulq_f32(inputVector.val[1], tapsVector);
+
+ vst2q_f32((float*)cPtr, outputVector);
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ for(number = quarter_points * 4; number < num_points; number++){
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_ORC
+
+extern void
+volk_32fc_32f_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ const float* bVector, unsigned int num_points);
+
+static inline void
+volk_32fc_32f_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ const float* bVector, unsigned int num_points)
+{
+ volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32fc_32f_multiply_32fc_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32fc_conjugate_32fc
+ *
+ * \b Overview
+ *
+ * Takes the conjugate of a complex vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_conjugate_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of complex floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li cVector: The output vector of complex floats.
+ *
+ * \b Example
+ * Generate points around the top half of the unit circle and conjugate them
+ * to give the bottom half of the unit circle.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * float real = 2.f * ((float)ii / (float)N) - 1.f;
+ * float imag = std::sqrt(1.f - real * real);
+ * in[ii] = lv_cmake(real, imag);
+ * }
+ *
+ * volk_32fc_conjugate_32fc(out, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out(%i) = %.1f + %.1fi\n", ii, lv_creal(out[ii]), lv_cimag(out[ii]));
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_conjugate_32fc_u_H
+#define INCLUDED_volk_32fc_conjugate_32fc_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32fc_conjugate_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 x;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
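+  // -0.0f has only the sign bit set, so XOR-ing the imaginary slots with it
+  // flips their sign while leaving the real parts untouched.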
+
+ for(;number < quarterPoints; number++){
+
+ x = _mm256_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
+
+ x = _mm256_xor_ps(x, conjugator); // conjugate register
+
+ _mm256_storeu_ps((float*)c,x); // Store the results back into the C container
+
+ a += 4;
+ c += 4;
+ }
+
+ number = quarterPoints * 4;
+
+ for(;number < num_points; number++) {
+ *c++ = lv_conj(*a++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+static inline void
+volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ __m128 x;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
+
+ for(;number < halfPoints; number++){
+
+ x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
+
+ x = _mm_xor_ps(x, conjugator); // conjugate register
+
+ _mm_storeu_ps((float*)c,x); // Store the results back into the C container
+
+ a += 2;
+ c += 2;
+ }
+
+ if((num_points % 2) != 0) {
+ *c = lv_conj(*a);
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
+{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = lv_conj(*aPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_conjugate_32fc_u_H */
+#ifndef INCLUDED_volk_32fc_conjugate_32fc_a_H
+#define INCLUDED_volk_32fc_conjugate_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32fc_conjugate_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 x;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
+
+ for(;number < quarterPoints; number++){
+
+ x = _mm256_load_ps((float*)a); // Load the complex data as ar,ai,br,bi
+
+ x = _mm256_xor_ps(x, conjugator); // conjugate register
+
+ _mm256_store_ps((float*)c,x); // Store the results back into the C container
+
+ a += 4;
+ c += 4;
+ }
+
+ number = quarterPoints * 4;
+
+ for(;number < num_points; number++) {
+ *c++ = lv_conj(*a++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+static inline void
+volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ __m128 x;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
+
+ for(;number < halfPoints; number++){
+
+ x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi
+
+ x = _mm_xor_ps(x, conjugator); // conjugate register
+
+ _mm_store_ps((float*)c,x); // Store the results back into the C container
+
+ a += 2;
+ c += 2;
+ }
+
+ if((num_points % 2) != 0) {
+ *c = lv_conj(*a);
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_32fc_conjugate_32fc_a_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
+{
+ unsigned int number;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float32x4x2_t x;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ for(number=0; number < quarterPoints; number++){
+ __VOLK_PREFETCH(a+4);
+ x = vld2q_f32((float*)a); // Load the complex data as ar,br,cr,dr; ai,bi,ci,di
+
+    // negate the imaginary lane (flipping its sign bit conjugates each sample)
+    x.val[1] = vnegq_f32( x.val[1]);
+
+ vst2q_f32((float*)c,x); // Store the results back into the C container
+
+ a += 4;
+ c += 4;
+ }
+
+ for(number=quarterPoints*4; number < num_points; number++){
+ *c++ = lv_conj(*a++);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
+{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = lv_conj(*aPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_conjugate_32fc_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2016 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32fc_convert_16ic
+ *
+ * \b Overview
+ *
+ * Converts a complex vector with 32-bit float components into a
+ * complex vector with 16-bit integer components.
+ * Values are saturated to the limits of the output data type.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_convert_16ic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points);
+ * \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The complex 32-bit float input data buffer.
+ * \li num_points: The number of data values to be converted.
+ *
+ * \b Outputs
+ * \li outputVector: The complex 16-bit integer output data buffer.
+ *
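+ * \b Example
+ * A minimal usage sketch (illustrative only; the buffer handling follows the
+ * volk_malloc/volk_free pattern of the other kernel pages and the input
+ * values are arbitrary):
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_16sc_t* out = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ *     in[ii] = lv_cmake(1000.f * (float)ii, -1000.f * (float)ii);
+ * }
+ *
+ * volk_32fc_convert_16ic(out, in, N);
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode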
+ */
+
+#ifndef INCLUDED_volk_32fc_convert_16ic_H
+#define INCLUDED_volk_32fc_convert_16ic_H
+
+#include <limits.h>
+#include <math.h>
+#include "volk/volk_complex.h"
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 4;
+
+ float* inputVectorPtr = (float*)inputVector;
+ int16_t* outputVectorPtr = (int16_t*)outputVector;
+ float aux;
+
+ const float min_val = (float)SHRT_MIN;
+ const float max_val = (float)SHRT_MAX;
+
+ __m128 inputVal1, inputVal2;
+ __m128i intInputVal1, intInputVal2;
+ __m128 ret1, ret2;
+ const __m128 vmin_val = _mm_set_ps1(min_val);
+ const __m128 vmax_val = _mm_set_ps1(max_val);
+
+ unsigned int i;
+ for(i = 0; i < sse_iters; i++)
+ {
+ inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
+ inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
+ __VOLK_PREFETCH(inputVectorPtr + 8);
+
+ // Clip
+ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
+ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
+
+ intInputVal1 = _mm_cvtps_epi32(ret1);
+ intInputVal2 = _mm_cvtps_epi32(ret2);
+
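+            // Pack the eight 32-bit results down to 16 bits with signed saturation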
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 8;
+ }
+
+ for(i = sse_iters * 8; i < num_points * 2; i++)
+ {
+ aux = *inputVectorPtr++;
+ if(aux > max_val)
+ aux = max_val;
+ else if(aux < min_val)
+ aux = min_val;
+ *outputVectorPtr++ = (int16_t)rintf(aux);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 4;
+
+ float* inputVectorPtr = (float*)inputVector;
+ int16_t* outputVectorPtr = (int16_t*)outputVector;
+ float aux;
+
+ const float min_val = (float)SHRT_MIN;
+ const float max_val = (float)SHRT_MAX;
+
+ __m128 inputVal1, inputVal2;
+ __m128i intInputVal1, intInputVal2;
+ __m128 ret1, ret2;
+ const __m128 vmin_val = _mm_set_ps1(min_val);
+ const __m128 vmax_val = _mm_set_ps1(max_val);
+ unsigned int i;
+
+ for(i = 0; i < sse_iters; i++)
+ {
+ inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
+ inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
+ __VOLK_PREFETCH(inputVectorPtr + 8);
+
+ // Clip
+ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
+ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
+
+ intInputVal1 = _mm_cvtps_epi32(ret1);
+ intInputVal2 = _mm_cvtps_epi32(ret2);
+
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 8;
+ }
+
+ for(i = sse_iters * 8; i < num_points * 2; i++)
+ {
+ aux = *inputVectorPtr++;
+ if(aux > max_val)
+ aux = max_val;
+ else if(aux < min_val)
+ aux = min_val;
+ *outputVectorPtr++ = (int16_t)rintf(aux);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
+{
+ const unsigned int neon_iters = num_points / 4;
+
+ float32_t* inputVectorPtr = (float32_t*)inputVector;
+ int16_t* outputVectorPtr = (int16_t*)outputVector;
+
+ const float min_val_f = (float)SHRT_MIN;
+ const float max_val_f = (float)SHRT_MAX;
+ float32_t aux;
+ unsigned int i;
+
+ const float32x4_t min_val = vmovq_n_f32(min_val_f);
+ const float32x4_t max_val = vmovq_n_f32(max_val_f);
+ float32x4_t half = vdupq_n_f32(0.5f);
+ float32x4_t ret1, ret2, a, b, sign, PlusHalf, Round;
+
+ int32x4_t toint_a, toint_b;
+ int16x4_t intInputVal1, intInputVal2;
+ int16x8_t res;
+
+ for(i = 0; i < neon_iters; i++)
+ {
+ a = vld1q_f32((const float32_t*)(inputVectorPtr)); inputVectorPtr += 4;
+ b = vld1q_f32((const float32_t*)(inputVectorPtr)); inputVectorPtr += 4;
+ __VOLK_PREFETCH(inputVectorPtr + 8);
+
+ ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
+ ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
+
+ /* in __aarch64__ we can do that with vcvtaq_s32_f32(ret1); vcvtaq_s32_f32(ret2); */
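+            // vcvtq_s32_f32 truncates toward zero, so emulate rounding to nearest by
+            // adding 0.5 and subtracting 1.0 (the extracted sign) for negative inputs
+            // before the conversion.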
+ sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31)));
+ PlusHalf = vaddq_f32(ret1, half);
+ Round = vsubq_f32(PlusHalf, sign);
+ toint_a = vcvtq_s32_f32(Round);
+
+ sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret2), 31)));
+ PlusHalf = vaddq_f32(ret2, half);
+ Round = vsubq_f32(PlusHalf, sign);
+ toint_b = vcvtq_s32_f32(Round);
+
+ intInputVal1 = vqmovn_s32(toint_a);
+ intInputVal2 = vqmovn_s32(toint_b);
+
+ res = vcombine_s16(intInputVal1, intInputVal2);
+ vst1q_s16((int16_t*)outputVectorPtr, res);
+ outputVectorPtr += 8;
+ }
+
+ for(i = neon_iters * 8; i < num_points * 2; i++)
+ {
+ aux = *inputVectorPtr++;
+ if(aux > max_val_f)
+ aux = max_val_f;
+ else if(aux < min_val_f)
+ aux = min_val_f;
+ *outputVectorPtr++ = (int16_t)rintf(aux);
+ }
+}
+
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
+{
+ float* inputVectorPtr = (float*)inputVector;
+ int16_t* outputVectorPtr = (int16_t*)outputVector;
+ const float min_val = (float)SHRT_MIN;
+ const float max_val = (float)SHRT_MAX;
+ float aux;
+ unsigned int i;
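+    // Each complex sample contributes two floats (real then imaginary), hence 2*num_points iterations.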
+ for(i = 0; i < num_points * 2; i++)
+ {
+ aux = *inputVectorPtr++;
+ if(aux > max_val)
+ aux = max_val;
+ else if(aux < min_val)
+ aux = min_val;
+ *outputVectorPtr++ = (int16_t)rintf(aux);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_convert_16ic_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32fc_deinterleave_32f_x2
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex floating point vector into I & Q vector
+ * data.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li iBuffer: The I buffer output data.
+ * \li qBuffer: The Q buffer output data.
+ *
+ * \b Example
+ * Generate complex numbers around the top half of the unit circle and
+ * deinterleave into real and imaginary buffers.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * float* re = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* im = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * float real = 2.f * ((float)ii / (float)N) - 1.f;
+ * float imag = std::sqrt(1.f - real * real);
+ * in[ii] = lv_cmake(real, imag);
+ * }
+ *
+ * volk_32fc_deinterleave_32f_x2(re, im, in, N);
+ *
+ * printf(" re | im\n");
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out(%i) = %+.1f | %+.1f\n", ii, re[ii], im[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(re);
+ * volk_free(im);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_deinterleave_32f_x2_a_H
+#define INCLUDED_volk_32fc_deinterleave_32f_x2_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+static inline void
+volk_32fc_deinterleave_32f_x2_a_avx(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ const float* complexVectorPtr = (float*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+
+ unsigned int number = 0;
+ // Mask for real and imaginary parts
+ const unsigned int eighthPoints = num_points / 8;
+ __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue;
+ for(;number < eighthPoints; number++){
+ cplxValue1 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue2 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm256_shuffle_ps(complex1, complex2, 0x88);
+ // Arrange in q1q2q3q4 format
+ qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
+
+ _mm256_store_ps(iBufferPtr, iValue);
+ _mm256_store_ps(qBufferPtr, qValue);
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ const float* complexVectorPtr = (float*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 cplxValue1, cplxValue2, iValue, qValue;
+ for(;number < quarterPoints; number++){
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+ // Arrange in q1q2q3q4 format
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+ _mm_store_ps(iBufferPtr, iValue);
+ _mm_store_ps(qBufferPtr, qValue);
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_32fc_deinterleave_32f_x2_neon(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+ const float* complexVectorPtr = (float*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+ float32x4x2_t complexInput;
+
+ for(number = 0; number < quarter_points; number++){
+ complexInput = vld2q_f32(complexVectorPtr);
+ vst1q_f32( iBufferPtr, complexInput.val[0] );
+ vst1q_f32( qBufferPtr, complexInput.val[1] );
+ complexVectorPtr += 8;
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ for(number = quarter_points*4; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32fc_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ const float* complexVectorPtr = (float*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+ unsigned int number;
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32fc_deinterleave_64f_x2
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex floating point vector into I & Q vector
+ * data. The output vectors are converted to doubles.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_deinterleave_64f_x2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li iBuffer: The I buffer output data.
+ * \li qBuffer: The Q buffer output data.
+ *
+ * \b Example
+ * Generate complex numbers around the top half of the unit circle and
+ * deinterleave into real and imaginary double buffers.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * double* re = (double*)volk_malloc(sizeof(double)*N, alignment);
+ * double* im = (double*)volk_malloc(sizeof(double)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * float real = 2.f * ((float)ii / (float)N) - 1.f;
+ * float imag = std::sqrt(1.f - real * real);
+ * in[ii] = lv_cmake(real, imag);
+ * }
+ *
+ * volk_32fc_deinterleave_64f_x2(re, im, in, N);
+ *
+ * printf(" re | im\n");
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out(%i) = %+.1g | %+.1g\n", ii, re[ii], im[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(re);
+ * volk_free(im);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_u_H
+#define INCLUDED_volk_32fc_deinterleave_64f_x2_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32fc_deinterleave_64f_x2_u_avx(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+ double* qBufferPtr = qBuffer;
+
+ const unsigned int quarterPoints = num_points / 4;
+ __m256 cplxValue;
+ __m128 complexH, complexL, fVal;
+ __m256d dVal;
+
+ for(;number < quarterPoints; number++){
+
+ cplxValue = _mm256_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ complexH = _mm256_extractf128_ps(cplxValue, 1);
+ complexL = _mm256_extractf128_ps(cplxValue, 0);
+
+ // Arrange in i1i2i1i2 format
+ fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2,0,2,0));
+ dVal = _mm256_cvtps_pd(fVal);
+ _mm256_storeu_pd(iBufferPtr, dVal);
+
+ // Arrange in q1q2q1q2 format
+ fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3,1,3,1));
+ dVal = _mm256_cvtps_pd(fVal);
+ _mm256_storeu_pd(qBufferPtr, dVal);
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void
+volk_32fc_deinterleave_64f_x2_u_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+ double* qBufferPtr = qBuffer;
+
+ const unsigned int halfPoints = num_points / 2;
+ __m128 cplxValue, fVal;
+ __m128d dVal;
+
+ for(;number < halfPoints; number++){
+
+ cplxValue = _mm_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ // Arrange in i1i2i1i2 format
+ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0));
+ dVal = _mm_cvtps_pd(fVal);
+ _mm_storeu_pd(iBufferPtr, dVal);
+
+ // Arrange in q1q2q1q2 format
+ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1));
+ dVal = _mm_cvtps_pd(fVal);
+ _mm_storeu_pd(qBufferPtr, dVal);
+
+ iBufferPtr += 2;
+ qBufferPtr += 2;
+ }
+
+ number = halfPoints * 2;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32fc_deinterleave_64f_x2_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+ double* qBufferPtr = qBuffer;
+
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = (double)*complexVectorPtr++;
+ *qBufferPtr++ = (double)*complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_u_H */
+#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_a_H
+#define INCLUDED_volk_32fc_deinterleave_64f_x2_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32fc_deinterleave_64f_x2_a_avx(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+ double* qBufferPtr = qBuffer;
+
+ const unsigned int quarterPoints = num_points / 4;
+ __m256 cplxValue;
+ __m128 complexH, complexL, fVal;
+ __m256d dVal;
+
+ for(;number < quarterPoints; number++){
+
+ cplxValue = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ complexH = _mm256_extractf128_ps(cplxValue, 1);
+ complexL = _mm256_extractf128_ps(cplxValue, 0);
+
+ // Arrange in i1i2i1i2 format
+ fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2,0,2,0));
+ dVal = _mm256_cvtps_pd(fVal);
+ _mm256_store_pd(iBufferPtr, dVal);
+
+ // Arrange in q1q2q1q2 format
+ fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3,1,3,1));
+ dVal = _mm256_cvtps_pd(fVal);
+ _mm256_store_pd(qBufferPtr, dVal);
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void
+volk_32fc_deinterleave_64f_x2_a_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+ double* qBufferPtr = qBuffer;
+
+ const unsigned int halfPoints = num_points / 2;
+ __m128 cplxValue, fVal;
+ __m128d dVal;
+
+ for(;number < halfPoints; number++){
+
+ cplxValue = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ // Arrange in i1i2i1i2 format
+ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0));
+ dVal = _mm_cvtps_pd(fVal);
+ _mm_store_pd(iBufferPtr, dVal);
+
+ // Arrange in q1q2q1q2 format
+ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1));
+ dVal = _mm_cvtps_pd(fVal);
+ _mm_store_pd(qBufferPtr, dVal);
+
+ iBufferPtr += 2;
+ qBufferPtr += 2;
+ }
+
+ number = halfPoints * 2;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32fc_deinterleave_64f_x2_a_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+ double* qBufferPtr = qBuffer;
+
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = (double)*complexVectorPtr++;
+ *qBufferPtr++ = (double)*complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32fc_deinterleave_imag_32f
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex floating point vector and returns the
+ * imaginary (quadrature) part of the samples.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_deinterleave_imag_32f(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li qBuffer: The Q buffer output data.
+ *
+ * \b Example
+ * Generate complex numbers around the top half of the unit circle and
+ * extract all of the imaginary parts to a float buffer.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * float* im = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * float real = 2.f * ((float)ii / (float)N) - 1.f;
+ * float imag = std::sqrt(1.f - real * real);
+ * in[ii] = lv_cmake(real, imag);
+ * }
+ *
+ * volk_32fc_deinterleave_imag_32f(im, in, N);
+ *
+ * printf(" imaginary part\n");
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out(%i) = %+.1f\n", ii, im[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(im);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_deinterleave_imag_32f_a_H
+#define INCLUDED_volk_32fc_deinterleave_imag_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32fc_deinterleave_imag_32f_a_avx(float* qBuffer, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+ const float* complexVectorPtr = (const float*)complexVector;
+ float* qBufferPtr = qBuffer;
+
+ __m256 cplxValue1, cplxValue2, complex1, complex2, qValue;
+ for(;number < eighthPoints; number++){
+
+ cplxValue1 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue2 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+
+ // Arrange in q1q2q3q4 format
+ qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
+
+ _mm256_store_ps(qBufferPtr, qValue);
+
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (const float*)complexVector;
+ float* qBufferPtr = qBuffer;
+
+ __m128 cplxValue1, cplxValue2, iValue;
+ for(;number < quarterPoints; number++){
+
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ // Arrange in q1q2q3q4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+ _mm_store_ps(qBufferPtr, iValue);
+
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_32fc_deinterleave_imag_32f_neon(float* qBuffer, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+ const float* complexVectorPtr = (float*)complexVector;
+ float* qBufferPtr = qBuffer;
+ float32x4x2_t complexInput;
+
+ for(number = 0; number < quarter_points; number++){
+ complexInput = vld2q_f32(complexVectorPtr);
+ vst1q_f32( qBufferPtr, complexInput.val[1] );
+ complexVectorPtr += 8;
+ qBufferPtr += 4;
+ }
+
+ for(number = quarter_points*4; number < num_points; number++){
+ complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32fc_deinterleave_imag_32f_generic(float* qBuffer, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const float* complexVectorPtr = (float*)complexVector;
+ float* qBufferPtr = qBuffer;
+ for(number = 0; number < num_points; number++){
+ complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_deinterleave_imag_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32fc_deinterleave_real_32f
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex floating point vector and returns the
+ * real (in-phase) part of the samples.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_deinterleave_real_32f(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li iBuffer: The I buffer output data.
+ *
+ * \b Example
+ * Generate complex numbers around the top half of the unit circle and
+ * extract all of the real parts to a float buffer.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * float* re = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * float real = 2.f * ((float)ii / (float)N) - 1.f;
+ * float imag = std::sqrt(1.f - real * real);
+ * in[ii] = lv_cmake(real, imag);
+ * }
+ *
+ * volk_32fc_deinterleave_real_32f(re, in, N);
+ *
+ * printf(" real part\n");
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out(%i) = %+.1f\n", ii, re[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(re);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_deinterleave_real_32f_a_H
+#define INCLUDED_volk_32fc_deinterleave_real_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32fc_deinterleave_real_32f_a_sse(float* iBuffer, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (const float*)complexVector;
+ float* iBufferPtr = iBuffer;
+
+ __m128 cplxValue1, cplxValue2, iValue;
+ for(;number < quarterPoints; number++){
+
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+
+ _mm_store_ps(iBufferPtr, iValue);
+
+ iBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32fc_deinterleave_real_32f_generic(float* iBuffer, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const float* complexVectorPtr = (float*)complexVector;
+ float* iBufferPtr = iBuffer;
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_32fc_deinterleave_real_32f_neon(float* iBuffer, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+ const float* complexVectorPtr = (float*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float32x4x2_t complexInput;
+
+ for(number = 0; number < quarter_points; number++){
+ complexInput = vld2q_f32(complexVectorPtr);
+ vst1q_f32( iBufferPtr, complexInput.val[0] );
+ complexVectorPtr += 8;
+ iBufferPtr += 4;
+ }
+
+ for(number = quarter_points*4; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+#endif /* INCLUDED_volk_32fc_deinterleave_real_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32fc_deinterleave_real_64f
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex floating point vector and returns the
+ * real (in-phase) part of the samples, converted to doubles.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_deinterleave_real_64f(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li iBuffer: The I buffer output data.
+ *
+ * \b Example
+ * Generate complex numbers around the top half of the unit circle and
+ * extract all of the real parts to a double buffer.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * double* re = (double*)volk_malloc(sizeof(double)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * float real = 2.f * ((float)ii / (float)N) - 1.f;
+ * float imag = std::sqrt(1.f - real * real);
+ * in[ii] = lv_cmake(real, imag);
+ * }
+ *
+ * volk_32fc_deinterleave_real_64f(re, in, N);
+ *
+ * printf(" real part\n");
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out(%i) = %+.1g\n", ii, re[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(re);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_deinterleave_real_64f_a_H
+#define INCLUDED_volk_32fc_deinterleave_real_64f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void
+volk_32fc_deinterleave_real_64f_a_sse2(double* iBuffer, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+
+ const unsigned int halfPoints = num_points / 2;
+ __m128 cplxValue, fVal;
+ __m128d dVal;
+ for(;number < halfPoints; number++){
+
+ cplxValue = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ // Arrange in i1i2i1i2 format
+ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0));
+ dVal = _mm_cvtps_pd(fVal);
+ _mm_store_pd(iBufferPtr, dVal);
+
+ iBufferPtr += 2;
+ }
+
+ number = halfPoints * 2;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = (double)*complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32fc_deinterleave_real_64f_generic(double* iBuffer, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = (double)*complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_deinterleave_real_64f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32fc_index_max_16u
+ *
+ * \b Overview
+ *
+ * Returns Argmax_i mag(x[i]). Finds and returns the index which contains the
+ * maximum magnitude for complex points in the given vector.
+ *
+ * Note that num_points is a uint32_t, but the returned index is a
+ * uint16_t. For vectors longer than the maximum value of a uint16_t
+ * (65535), any maximum beyond that boundary cannot be reported. The
+ * kernel therefore caps num_points at this maximum value.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_index_max_16u(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li src0: The complex input vector.
+ * \li num_points: The number of samples.
+ *
+ * \b Outputs
+ * \li target: The index of the point with maximum magnitude.
+ *
+ * \b Example
+ * Calculate the index of the maximum value of \f$x^2 + x\f$ for points around
+ * the unit circle.
+ * \code
+ * int N = 10;
+ * uint32_t alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * uint16_t* max = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment);
+ *
+ * for(uint32_t ii = 0; ii < N/2; ++ii){
+ * float real = 2.f * ((float)ii / (float)N) - 1.f;
+ * float imag = std::sqrt(1.f - real * real);
+ * in[ii] = lv_cmake(real, imag);
+ * in[ii] = in[ii] * in[ii] + in[ii];
+ *     in[N-ii-1] = lv_cmake(real, imag);
+ *     in[N-ii-1] = in[N-ii-1] * in[N-ii-1] + in[N-ii-1];
+ * }
+ *
+ * volk_32fc_index_max_16u(max, in, N);
+ *
+ * printf("index of max value = %u\n", *max);
+ *
+ * volk_free(in);
+ * volk_free(max);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_index_max_16u_a_H
+#define INCLUDED_volk_32fc_index_max_16u_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <limits.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_SSE3
+#include <xmmintrin.h>
+#include <pmmintrin.h>
+
+static inline void
+volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0,
+ uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+ // Branchless version, if we think it'll make a difference
+ //num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));
+
+ const uint32_t num_bytes = num_points*8;
+
+ union bit128 holderf;
+ union bit128 holderi;
+ float sq_dist = 0.0;
+
+ union bit128 xmm5, xmm4;
+ __m128 xmm1, xmm2, xmm3;
+ __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+
+ xmm5.int_vec = xmmfive = _mm_setzero_si128();
+ xmm4.int_vec = xmmfour = _mm_setzero_si128();
+ holderf.int_vec = holder0 = _mm_setzero_si128();
+ holderi.int_vec = holder1 = _mm_setzero_si128();
+
+ int bound = num_bytes >> 5;
+ int leftovers0 = (num_bytes >> 4) & 1;
+ int leftovers1 = (num_bytes >> 3) & 1;
+ int i = 0;
+
+ xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order!
+ xmm9 = _mm_setzero_si128();
+ xmm10 = _mm_set_epi32(4, 4, 4, 4);
+ xmm3 = _mm_setzero_ps();
+ //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]);
+
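+  // xmm3 tracks the running per-lane maximum squared magnitude, xmm8 holds the
+  // indices of the four points currently examined, and xmm9 the index of each
+  // lane's maximum; every pass keeps whichever index belongs to the larger value.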
+ for(; i < bound; ++i) {
+ xmm1 = _mm_load_ps((float*)src0);
+ xmm2 = _mm_load_ps((float*)&src0[2]);
+
+ src0 += 4;
+
+ xmm1 = _mm_mul_ps(xmm1, xmm1);
+ xmm2 = _mm_mul_ps(xmm2, xmm2);
+
+ xmm1 = _mm_hadd_ps(xmm1, xmm2);
+
+ xmm3 = _mm_max_ps(xmm1, xmm3);
+
+ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+
+ xmm9 = _mm_add_epi32(xmm11, xmm12);
+
+ xmm8 = _mm_add_epi32(xmm8, xmm10);
+
+ //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
+ //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]);
+ }
+
+
+ for(i = 0; i < leftovers0; ++i) {
+ xmm2 = _mm_load_ps((float*)src0);
+
+ xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
+ xmm8 = bit128_p(&xmm1)->int_vec;
+
+ xmm2 = _mm_mul_ps(xmm2, xmm2);
+
+ src0 += 2;
+
+ xmm1 = _mm_hadd_ps(xmm2, xmm2);
+
+ xmm3 = _mm_max_ps(xmm1, xmm3);
+
+ xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]);
+
+ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+
+ xmm9 = _mm_add_epi32(xmm11, xmm12);
+
+ xmm8 = _mm_add_epi32(xmm8, xmm10);
+ //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
+ }
+
+ for(i = 0; i < leftovers1; ++i) {
+ //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
+
+ sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
+
+ xmm2 = _mm_load1_ps(&sq_dist);
+
+ xmm1 = xmm3;
+
+ xmm3 = _mm_max_ss(xmm3, xmm2);
+
+ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+ xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
+
+ xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
+ xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
+
+ xmm9 = _mm_add_epi32(xmm11, xmm12);
+ }
+
+ //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
+ //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
+
+ _mm_store_ps((float*)&(holderf.f), xmm3);
+ _mm_store_si128(&(holderi.int_vec), xmm9);
+
+ target[0] = holderi.i[0];
+ sq_dist = holderf.f[0];
+ target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
+ sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
+ target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
+ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
+ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
+ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
+
+ /*
+ float placeholder = 0.0;
+ uint32_t temp0, temp1;
+ uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
+ uint32_t l0 = g0 ^ 1;
+
+ uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
+ uint32_t l1 = g1 ^ 1;
+
+ temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
+ temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
+ sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1];
+ placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
+
+ g0 = (sq_dist > placeholder);
+ l0 = g0 ^ 1;
+ target[0] = g0 * temp0 + l0 * temp1;
+ */
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_GENERIC
+static inline void
+ volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0,
+ uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+ const uint32_t num_bytes = num_points*8;
+
+ float sq_dist = 0.0;
+ float max = 0.0;
+ uint16_t index = 0;
+
+ uint32_t i = 0;
+
+ for(; i < num_bytes >> 3; ++i) {
+ sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
+
+ index = sq_dist > max ? i : index;
+ max = sq_dist > max ? sq_dist : max;
+ }
+ target[0] = index;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2016 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32fc_index_max_32u
+ *
+ * \b Overview
+ *
+ * Returns Argmax_i mag(x[i]). Finds and returns the index which contains the
+ * maximum magnitude for complex points in the given vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_index_max_32u(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li src0: The complex input vector.
+ * \li num_points: The number of samples.
+ *
+ * \b Outputs
+ * \li target: The index of the point with maximum magnitude.
+ *
+ * \b Example
+ * Calculate the index of the maximum value of \f$x^2 + x\f$ for points around
+ * the unit circle.
+ * \code
+ * int N = 10;
+ * uint32_t alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * uint32_t* max = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment);
+ *
+ * for(uint32_t ii = 0; ii < N/2; ++ii){
+ * float real = 2.f * ((float)ii / (float)N) - 1.f;
+ * float imag = std::sqrt(1.f - real * real);
+ * in[ii] = lv_cmake(real, imag);
+ * in[ii] = in[ii] * in[ii] + in[ii];
+ * in[N-ii-1] = lv_cmake(real, imag);
+ * in[N-ii-1] = in[N-ii-1] * in[N-ii-1] + in[N-ii-1];
+ * }
+ *
+ * volk_32fc_index_max_32u(max, in, N);
+ *
+ * printf("index of max value = %u\n", *max);
+ *
+ * volk_free(in);
+ * volk_free(max);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_index_max_32u_a_H
+#define INCLUDED_volk_32fc_index_max_32u_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_SSE3
+#include <xmmintrin.h>
+#include <pmmintrin.h>
+
+static inline void
+volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0,
+ uint32_t num_points)
+{
+ const uint32_t num_bytes = num_points*8;
+
+ union bit128 holderf;
+ union bit128 holderi;
+ float sq_dist = 0.0;
+
+ union bit128 xmm5, xmm4;
+ __m128 xmm1, xmm2, xmm3;
+ __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+
+ xmm5.int_vec = xmmfive = _mm_setzero_si128();
+ xmm4.int_vec = xmmfour = _mm_setzero_si128();
+ holderf.int_vec = holder0 = _mm_setzero_si128();
+ holderi.int_vec = holder1 = _mm_setzero_si128();
+
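+ // num_bytes = num_points * 8: bound counts groups of four complex samples,
+ // leftovers0 flags one remaining pair, leftovers1 flags one final sample.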
+ int bound = num_bytes >> 5;
+ int leftovers0 = (num_bytes >> 4) & 1;
+ int leftovers1 = (num_bytes >> 3) & 1;
+ int i = 0;
+
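+ // xmm8 holds the candidate indices for the current group of four samples, xmm9 the
+ // indices of the per-lane maxima, and xmm3 the per-lane maximum squared magnitudes.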
+ xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order!
+ xmm9 = _mm_setzero_si128();
+ xmm10 = _mm_set_epi32(4, 4, 4, 4);
+ xmm3 = _mm_setzero_ps();
+
+ //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]);
+
+ for(; i < bound; ++i) {
+ xmm1 = _mm_load_ps((float*)src0);
+ xmm2 = _mm_load_ps((float*)&src0[2]);
+
+ src0 += 4;
+
+ xmm1 = _mm_mul_ps(xmm1, xmm1);
+ xmm2 = _mm_mul_ps(xmm2, xmm2);
+
+ xmm1 = _mm_hadd_ps(xmm1, xmm2);
+
+ xmm3 = _mm_max_ps(xmm1, xmm3);
+
+ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+
+ xmm9 = _mm_add_epi32(xmm11, xmm12);
+
+ xmm8 = _mm_add_epi32(xmm8, xmm10);
+
+ //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
+ //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]);
+ }
+
+
+ for(i = 0; i < leftovers0; ++i) {
+ xmm2 = _mm_load_ps((float*)src0);
+
+ xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
+ xmm8 = bit128_p(&xmm1)->int_vec;
+
+ xmm2 = _mm_mul_ps(xmm2, xmm2);
+
+ src0 += 2;
+
+ xmm1 = _mm_hadd_ps(xmm2, xmm2);
+
+ xmm3 = _mm_max_ps(xmm1, xmm3);
+
+ xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]);
+
+ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+
+ xmm9 = _mm_add_epi32(xmm11, xmm12);
+
+ xmm8 = _mm_add_epi32(xmm8, xmm10);
+ //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
+ }
+
+ for(i = 0; i < leftovers1; ++i) {
+ //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
+
+ sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
+
+ xmm2 = _mm_load1_ps(&sq_dist);
+
+ xmm1 = xmm3;
+
+ xmm3 = _mm_max_ss(xmm3, xmm2);
+
+ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+ xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
+
+ xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
+ xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
+
+ xmm9 = _mm_add_epi32(xmm11, xmm12);
+ }
+
+ //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
+ //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
+
+ _mm_store_ps((float*)&(holderf.f), xmm3);
+ _mm_store_si128(&(holderi.int_vec), xmm9);
+
+ target[0] = holderi.i[0];
+ sq_dist = holderf.f[0];
+ target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
+ sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
+ target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
+ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
+ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
+ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
+
+ /*
+ float placeholder = 0.0;
+ uint32_t temp0, temp1;
+ uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
+ uint32_t l0 = g0 ^ 1;
+
+ uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
+ uint32_t l1 = g1 ^ 1;
+
+ temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
+ temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
+ sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1];
+ placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
+
+ g0 = (sq_dist > placeholder);
+ l0 = g0 ^ 1;
+ target[0] = g0 * temp0 + l0 * temp1;
+ */
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0,
+                                uint32_t num_points)
+{
+ const uint32_t num_bytes = num_points*8;
+
+ float sq_dist = 0.0;
+ float max = 0.0;
+ uint32_t index = 0;
+
+ uint32_t i = 0;
+
+ for(; i < num_bytes >> 3; ++i) {
+ sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
+
+ index = sq_dist > max ? i : index;
+ max = sq_dist > max ? sq_dist : max;
+ }
+ target[0] = index;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32fc_index_max_32u_a_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32fc_magnitude_32f
+ *
+ * \b Overview
+ *
+ * Calculates the magnitude of the complexVector and stores the
+ * results in the magnitudeVector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_magnitude_32f(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li num_points: The number of samples.
+ *
+ * \b Outputs
+ * \li magnitudeVector: The output vector of magnitudes.
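+ *
+ * For each point, the result is sqrtf(real*real + imag*imag), as computed by the
+ * generic kernel below.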
+ *
+ * \b Example
+ * Calculate the magnitude of \f$x^2 + x\f$ for points around the unit circle.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * float* magnitude = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N/2; ++ii){
+ * float real = 2.f * ((float)ii / (float)N) - 1.f;
+ * float imag = std::sqrt(1.f - real * real);
+ * in[ii] = lv_cmake(real, imag);
+ * in[ii] = in[ii] * in[ii] + in[ii];
+ * in[N-ii-1] = lv_cmake(real, imag);
+ * in[N-ii-1] = in[N-ii-1] * in[N-ii-1] + in[N-ii-1];
+ * }
+ *
+ * volk_32fc_magnitude_32f(magnitude, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out(%i) = %+.1f\n", ii, magnitude[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(magnitude);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_magnitude_32f_u_H
+#define INCLUDED_volk_32fc_magnitude_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void
+volk_32fc_magnitude_32f_u_avx(float* magnitudeVector, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* complexVectorPtr = (float*) complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m256 cplxValue1, cplxValue2, result;
+
+ for(; number < eighthPoints; number++){
+ cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+ cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8);
+ result = _mm256_magnitude_ps(cplxValue1, cplxValue2);
+ _mm256_storeu_ps(magnitudeVectorPtr, result);
+
+ complexVectorPtr += 16;
+ magnitudeVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <volk/volk_sse3_intrinsics.h>
+
+static inline void
+volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (float*) complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 cplxValue1, cplxValue2, result;
+ for(; number < quarterPoints; number++){
+ cplxValue1 = _mm_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2);
+
+ _mm_storeu_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+#include <volk/volk_sse_intrinsics.h>
+
+static inline void
+volk_32fc_magnitude_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (float*) complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 cplxValue1, cplxValue2, result;
+
+ for(; number < quarterPoints; number++){
+ cplxValue1 = _mm_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ result = _mm_magnitude_ps(cplxValue1, cplxValue2);
+ _mm_storeu_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32fc_magnitude_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points)
+{
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+ unsigned int number = 0;
+ for(number = 0; number < num_points; number++){
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+#endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */
+#ifndef INCLUDED_volk_32fc_magnitude_32f_a_H
+#define INCLUDED_volk_32fc_magnitude_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void
+volk_32fc_magnitude_32f_a_avx(float* magnitudeVector, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* complexVectorPtr = (float*) complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m256 cplxValue1, cplxValue2, result;
+ for(; number < eighthPoints; number++){
+ cplxValue1 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue2 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ result = _mm256_magnitude_ps(cplxValue1, cplxValue2);
+ _mm256_store_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <volk/volk_sse3_intrinsics.h>
+
+static inline void
+volk_32fc_magnitude_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (float*) complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 cplxValue1, cplxValue2, result;
+ for(; number < quarterPoints; number++){
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2);
+ _mm_store_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+#include <volk/volk_sse_intrinsics.h>
+
+static inline void
+volk_32fc_magnitude_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (float*) complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 cplxValue1, cplxValue2, result;
+ for(; number < quarterPoints; number++){
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ result = _mm_magnitude_ps(cplxValue1, cplxValue2);
+ _mm_store_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32fc_magnitude_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+ unsigned int number = 0;
+ for(number = 0; number < num_points; number++){
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_32fc_magnitude_32f_neon(float* magnitudeVector, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number;
+ unsigned int quarter_points = num_points / 4;
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ float32x4x2_t complex_vec;
+ float32x4_t magnitude_vec;
+ for(number = 0; number < quarter_points; number++){
+ complex_vec = vld2q_f32(complexVectorPtr);
+ complex_vec.val[0] = vmulq_f32(complex_vec.val[0], complex_vec.val[0]);
+ magnitude_vec = vmlaq_f32(complex_vec.val[0], complex_vec.val[1], complex_vec.val[1]);
+ magnitude_vec = vrsqrteq_f32(magnitude_vec); // NEON (ARMv7) has no packed sqrt, so
+ magnitude_vec = vrecpeq_f32(magnitude_vec); // approximate sqrt(x) as 1/rsqrt(x)
+ vst1q_f32(magnitudeVectorPtr, magnitude_vec);
+
+ complexVectorPtr += 8;
+ magnitudeVectorPtr += 4;
+ }
+
+ for(number = quarter_points*4; number < num_points; number++){
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_NEON
+/*!
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+
+ This is an approximation from "Streamlining Digital Signal Processing" by
+ Richard Lyons. Apparently max error is about 1% and mean error is about 0.6%.
+ The basic idea is to take a weighted sum of the absolute values of the real and
+ imaginary parts, where weight A is applied to max(|real|, |imag|) and weight B to
+ min(|real|, |imag|). One of two coefficient pairs is chosen depending on whether
+ min <= 0.4142 * max (0.4142 is approximately sqrt(2) - 1). This is the
+ equiripple-error magnitude estimator proposed by Filip in '73.
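+
+ A scalar sketch of the estimate (illustrative only; the coefficients match the NEON code below):
+ \code
+ float a = fmaxf(fabsf(real), fabsf(imag));
+ float b = fminf(fabsf(real), fabsf(imag));
+ float est = (b > 0.4142135f * a) ? 0.84f * a + 0.561f * b : 0.99f * a + 0.197f * b;
+ \endcode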
+
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+static inline void
+volk_32fc_magnitude_32f_neon_fancy_sweet(float* magnitudeVector, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number;
+ unsigned int quarter_points = num_points / 4;
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ const float threshold = 0.4142135;
+
+ float32x4_t a_vec, b_vec, a_high, a_low, b_high, b_low;
+ a_high = vdupq_n_f32( 0.84 );
+ b_high = vdupq_n_f32( 0.561);
+ a_low = vdupq_n_f32( 0.99 );
+ b_low = vdupq_n_f32( 0.197);
+
+ uint32x4_t comp0, comp1;
+
+ float32x4x2_t complex_vec;
+ float32x4_t min_vec, max_vec, magnitude_vec;
+ float32x4_t real_abs, imag_abs;
+ for(number = 0; number < quarter_points; number++){
+ complex_vec = vld2q_f32(complexVectorPtr);
+
+ real_abs = vabsq_f32(complex_vec.val[0]);
+ imag_abs = vabsq_f32(complex_vec.val[1]);
+
+ min_vec = vminq_f32(real_abs, imag_abs);
+ max_vec = vmaxq_f32(real_abs, imag_abs);
+
+ // effective branch to choose coefficient pair.
+ comp0 = vcgtq_f32(min_vec, vmulq_n_f32(max_vec, threshold));
+ comp1 = vcleq_f32(min_vec, vmulq_n_f32(max_vec, threshold));
+
+ // and 0s or 1s with coefficients from previous effective branch
+ a_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)a_high),
+ vandq_s32((int32x4_t)comp1, (int32x4_t)a_low));
+ b_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)b_high),
+ vandq_s32((int32x4_t)comp1, (int32x4_t)b_low));
+
+ // coefficients chosen, do the weighted sum
+ min_vec = vmulq_f32(min_vec, b_vec);
+ max_vec = vmulq_f32(max_vec, a_vec);
+
+ magnitude_vec = vaddq_f32(min_vec, max_vec);
+ vst1q_f32(magnitudeVectorPtr, magnitude_vec);
+
+ complexVectorPtr += 8;
+ magnitudeVectorPtr += 4;
+ }
+
+ for(number = quarter_points*4; number < num_points; number++){
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_ORC
+
+extern void
+volk_32fc_magnitude_32f_a_orc_impl(float* magnitudeVector, const lv_32fc_t* complexVector,
+ unsigned int num_points);
+
+static inline void
+volk_32fc_magnitude_32f_u_orc(float* magnitudeVector, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ volk_32fc_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32fc_magnitude_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32fc_magnitude_squared_32f
+ *
+ * \b Overview
+ *
+ * Calculates the magnitude squared of the complexVector and stores
+ * the results in the magnitudeVector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_magnitude_squared_32f(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li num_points: The number of samples.
+ *
+ * \b Outputs
+ * \li magnitudeVector: The output vector of squared magnitudes.
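+ *
+ * For each point, the result is real*real + imag*imag, as computed by the generic
+ * kernel below.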
+ *
+ * \b Example
+ * Calculate the magnitude squared of \f$x^2 + x\f$ for points around the unit circle.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * float* magnitude = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N/2; ++ii){
+ * float real = 2.f * ((float)ii / (float)N) - 1.f;
+ * float imag = std::sqrt(1.f - real * real);
+ * in[ii] = lv_cmake(real, imag);
+ * in[ii] = in[ii] * in[ii] + in[ii];
+ * in[N-ii-1] = lv_cmake(real, imag);
+ * in[N-ii-1] = in[N-ii-1] * in[N-ii-1] + in[N-ii-1];
+ * }
+ *
+ * volk_32fc_magnitude_squared_32f(magnitude, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out(%i) = %+.1f\n", ii, magnitude[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(magnitude);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_u_H
+#define INCLUDED_volk_32fc_magnitude_squared_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void
+volk_32fc_magnitude_squared_32f_u_avx(float* magnitudeVector, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* complexVectorPtr = (float*) complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m256 cplxValue1, cplxValue2, result;
+
+ for(; number < eighthPoints; number++){
+ cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+ cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8);
+ result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2);
+ _mm256_storeu_ps(magnitudeVectorPtr, result);
+
+ complexVectorPtr += 16;
+ magnitudeVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <volk/volk_sse3_intrinsics.h>
+
+static inline void
+volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (float*) complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 cplxValue1, cplxValue2, result;
+ for(; number < quarterPoints; number++){
+ cplxValue1 = _mm_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ result = _mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2);
+ _mm_storeu_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+#include <volk/volk_sse_intrinsics.h>
+
+static inline void
+volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (float*) complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 cplxValue1, cplxValue2, result;
+
+ for(; number < quarterPoints; number++){
+ cplxValue1 = _mm_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2);
+ _mm_storeu_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32fc_magnitude_squared_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+ unsigned int number = 0;
+ for(number = 0; number < num_points; number++){
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (real*real) + (imag*imag);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+#endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */
+#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_a_H
+#define INCLUDED_volk_32fc_magnitude_squared_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void
+volk_32fc_magnitude_squared_32f_a_avx(float* magnitudeVector, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* complexVectorPtr = (float*) complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m256 cplxValue1, cplxValue2, result;
+ for(; number < eighthPoints; number++){
+ cplxValue1 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue2 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2);
+ _mm256_store_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <volk/volk_sse3_intrinsics.h>
+
+static inline void
+volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (float*) complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 cplxValue1, cplxValue2, result;
+ for(; number < quarterPoints; number++){
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ result = _mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2);
+ _mm_store_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+#include <volk/volk_sse_intrinsics.h>
+
+static inline void
+volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 cplxValue1, cplxValue2, result;
+ for(;number < quarterPoints; number++){
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2);
+ _mm_store_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_32fc_magnitude_squared_32f_neon(float* magnitudeVector, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ float32x4x2_t cmplx_val;
+ float32x4_t result;
+ for(;number < quarterPoints; number++){
+ cmplx_val = vld2q_f32(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cmplx_val.val[0] = vmulq_f32(cmplx_val.val[0], cmplx_val.val[0]); // Square the values
+ cmplx_val.val[1] = vmulq_f32(cmplx_val.val[1], cmplx_val.val[1]); // Square the values
+
+ result = vaddq_f32(cmplx_val.val[0], cmplx_val.val[1]); // Add the I2 and Q2 values
+
+ vst1q_f32(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+ unsigned int number = 0;
+ for(number = 0; number < num_points; number++){
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (real*real) + (imag*imag);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_magnitude_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32fc_s32f_atan2_32f
+ *
+ * \b Overview
+ *
+ * Computes the arctan for each value in a complex vector and applies
+ * a normalization factor.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_s32f_atan2_32f(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The byte-aligned input vector of interleaved IQ data (I = cos, Q = sin).
+ * \li normalizeFactor: The atan2 results are divided by this normalization factor.
+ * \li num_points: The number of complex values in \p complexVector.
+ *
+ * \b Outputs
+ * \li outputVector: The vector where the results will be stored.
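+ *
+ * For each point, the result is atan2f(imag, real) / normalizeFactor, as computed by
+ * the generic kernel below.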
+ *
+ * \b Example
+ * Calculate the arctangent of points around the unit circle.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float scale = 1.f; // we want unit circle
+ *
+ * for(unsigned int ii = 0; ii < N/2; ++ii){
+ * // Generate points around the unit circle
+ * float real = -4.f * ((float)ii / (float)N) + 1.f;
+ * float imag = std::sqrt(1.f - real * real);
+ * in[ii] = lv_cmake(real, imag);
+ * in[ii+N/2] = lv_cmake(-real, -imag);
+ * }
+ *
+ * volk_32fc_s32f_atan2_32f(out, in, scale, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("atan2(%1.2f, %1.2f) = %1.2f\n",
+ * lv_cimag(in[ii]), lv_creal(in[ii]), out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+
+#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_a_H
+#define INCLUDED_volk_32fc_s32f_atan2_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+static inline void volk_32fc_s32f_atan2_32f_a_sse4_1(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){
+ const float* complexVectorPtr = (float*)complexVector;
+ float* outPtr = outputVector;
+
+ unsigned int number = 0;
+ const float invNormalizeFactor = 1.0 / normalizeFactor;
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 testVector = _mm_set_ps1(2*M_PI);
+ __m128 correctVector = _mm_set_ps1(M_PI);
+ __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor);
+ __m128 phase;
+ __m128 complex1, complex2, iValue, qValue;
+ __m128 keepMask;
+
+ for (; number < quarterPoints; number++) {
+ // Load IQ data:
+ complex1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+ complex2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+ // Deinterleave IQ data:
+ iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0));
+ qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1));
+ // Arctan to get phase:
+ phase = atan2f4(qValue, iValue);
+ // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi.
+ // Compare to 2pi:
+ keepMask = _mm_cmpneq_ps(phase,testVector);
+ phase = _mm_blendv_ps(correctVector, phase, keepMask);
+ // done with above correction.
+ phase = _mm_mul_ps(phase, vNormalizeFactor);
+ _mm_store_ps((float*)outPtr, phase);
+ outPtr += 4;
+ }
+ number = quarterPoints * 4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+ for (; number < num_points; number++) {
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){
+ const float* complexVectorPtr = (float*)complexVector;
+ float* outPtr = outputVector;
+
+ unsigned int number = 0;
+ const float invNormalizeFactor = 1.0 / normalizeFactor;
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 testVector = _mm_set_ps1(2*M_PI);
+ __m128 correctVector = _mm_set_ps1(M_PI);
+ __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor);
+ __m128 phase;
+ __m128 complex1, complex2, iValue, qValue;
+ __m128 mask;
+ __m128 keepMask;
+
+ for (; number < quarterPoints; number++) {
+ // Load IQ data:
+ complex1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+ complex2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+ // Deinterleave IQ data:
+ iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0));
+ qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1));
+ // Arctan to get phase:
+ phase = atan2f4(qValue, iValue);
+ // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi.
+ // Compare to 2pi:
+ keepMask = _mm_cmpneq_ps(phase,testVector);
+ phase = _mm_and_ps(phase, keepMask);
+ mask = _mm_andnot_ps(keepMask, correctVector);
+ phase = _mm_or_ps(phase, mask);
+ // done with above correction.
+ phase = _mm_mul_ps(phase, vNormalizeFactor);
+ _mm_store_ps((float*)outPtr, phase);
+ outPtr += 4;
+ }
+ number = quarterPoints * 4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+ for (; number < num_points; number++) {
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){
+ float* outPtr = outputVector;
+ const float* inPtr = (float*)inputVector;
+ const float invNormalizeFactor = 1.0 / normalizeFactor;
+ unsigned int number;
+ for ( number = 0; number < num_points; number++) {
+ const float real = *inPtr++;
+ const float imag = *inPtr++;
+ *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_s32f_atan2_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32fc_s32f_deinterleave_real_16i
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex floating point vector and returns the real
+ * part (inphase) of the samples, scaled and converted to 16-bit shorts.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_s32f_deinterleave_real_16i(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li scalar: The value each real sample is multiplied by before conversion to int16_t.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li iBuffer: The I buffer output data.
+ *
+ * \b Example
+ * Generate points around the unit circle and map them to integers with
+ * magnitude 50 to preserve smallest deltas.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * int16_t* out = (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment);
+ * float scale = 50.f;
+ *
+ * for(unsigned int ii = 0; ii < N/2; ++ii){
+ * // Generate points around the unit circle
+ * float real = -4.f * ((float)ii / (float)N) + 1.f;
+ * float imag = std::sqrt(1.f - real * real);
+ * in[ii] = lv_cmake(real, imag);
+ * in[ii+N/2] = lv_cmake(-real, -imag);
+ * }
+ *
+ * volk_32fc_s32f_deinterleave_real_16i(out, in, scale, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %i\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H
+#define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32fc_s32f_deinterleave_real_16i_a_sse(int16_t* iBuffer, const lv_32fc_t* complexVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+
+ __m128 cplxValue1, cplxValue2, iValue;
+
+ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+
+ for(;number < quarterPoints; number++){
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+
+ iValue = _mm_mul_ps(iValue, vScalar);
+
+ _mm_store_ps(floatBuffer, iValue);
+ *iBufferPtr++ = (int16_t)(floatBuffer[0]);
+ *iBufferPtr++ = (int16_t)(floatBuffer[1]);
+ *iBufferPtr++ = (int16_t)(floatBuffer[2]);
+ *iBufferPtr++ = (int16_t)(floatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ iBufferPtr = &iBuffer[number];
+ for(; number < num_points; number++){
+ *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
+ complexVectorPtr++;
+ }
+}
+
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32fc_s32f_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_32fc_t* complexVector,
+ const float scalar, unsigned int num_points)
+{
+ const float* complexVectorPtr = (float*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ unsigned int number = 0;
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
+ complexVectorPtr++;
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32fc_s32f_magnitude_16i
+ *
+ * \b Overview
+ *
+ * Calculates the magnitude of the complexVector and stores the
+ * results in the magnitudeVector. The results are scaled and
+ * converted into 16-bit shorts.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_s32f_magnitude_16i(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li scalar: Each computed magnitude is multiplied by this value before conversion to int16_t.
+ * \li num_points: The number of samples.
+ *
+ * \b Outputs
+ * \li magnitudeVector: The output value as 16-bit shorts.
+ *
+ * \b Example
+ * Generate points around the unit circle and map them to integers with
+ * magnitude 50 to preserve smallest deltas.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * int16_t* out = (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment);
+ * float scale = 50.f;
+ *
+ * for(unsigned int ii = 0; ii < N/2; ++ii){
+ * // Generate points around the unit circle
+ * float real = -4.f * ((float)ii / (float)N) + 1.f;
+ * float imag = std::sqrt(1.f - real * real);
+ * in[ii] = lv_cmake(real, imag);
+ * in[ii+N/2] = lv_cmake(-real, -imag);
+ * }
+ *
+ * volk_32fc_s32f_magnitude_16i(out, in, scale, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %i\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a_H
+#define INCLUDED_volk_32fc_s32f_magnitude_16i_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+static inline void
+volk_32fc_s32f_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (const float*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+
+ __m128 cplxValue1, cplxValue2, result;
+
+ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+
+ for(;number < quarterPoints; number++){
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+ result = _mm_sqrt_ps(result);
+
+ result = _mm_mul_ps(result, vScalar);
+
+ _mm_store_ps(floatBuffer, result);
+ *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]);
+ *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]);
+ *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]);
+ *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ magnitudeVectorPtr = &magnitudeVector[number];
+ for(; number < num_points; number++){
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar);
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (const float*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+
+ __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+
+ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+
+ for(;number < quarterPoints; number++){
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+ // Arrange in q1q2q3q4 format
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+ iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+ qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+
+ result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+
+ result = _mm_sqrt_ps(result);
+
+ result = _mm_mul_ps(result, vScalar);
+
+ _mm_store_ps(floatBuffer, result);
+ *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]);
+ *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]);
+ *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]);
+ *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ magnitudeVectorPtr = &magnitudeVector[number];
+ for(; number < num_points; number++){
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32fc_s32f_magnitude_16i_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector,
+ const float scalar, unsigned int num_points)
+{
+ const float* complexVectorPtr = (float*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
+ unsigned int number = 0;
+ for(number = 0; number < num_points; number++){
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_ORC
+
+extern void
+volk_32fc_s32f_magnitude_16i_a_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector,
+ const float scalar, unsigned int num_points);
+
+static inline void
+volk_32fc_s32f_magnitude_16i_u_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector,
+ const float scalar, unsigned int num_points)
+{
+ volk_32fc_s32f_magnitude_16i_a_orc_impl(magnitudeVector, complexVector, scalar, num_points);
+}
+
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32fc_s32f_power_32fc
+ *
+ * \b Overview
+ *
+ * Raises each value in the input complex vector to the specified real
+ * power and stores the complex results in the output vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_s32f_power_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: The complex input vector.
+ * \li power: The power value to be applied to each data point.
+ * \li num_points: The number of samples.
+ *
+ * \b Outputs
+ * \li cVector: The complex output vector.
+ *
+ * \b Example
+ * Square points around the unit circle (power = 2).
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * float real = 2.f * ((float)ii / (float)N) - 1.f;
+ * in[ii] = lv_cmake(real, std::sqrt(1.f - real * real));
+ * }
+ *
+ * volk_32fc_s32f_power_32fc(out, in, 2.f, N);
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_s32f_power_32fc_a_H
+#define INCLUDED_volk_32fc_s32f_power_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+//! raise a complex float to a real float power
+static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a(const lv_32fc_t exp, const float power)
+{
+ // z^p in polar form: |z|^p * (cos(p*arg(z)) + j*sin(p*arg(z)))
+ const float arg = power*atan2f(lv_cimag(exp), lv_creal(exp));
+ const float mag = powf(lv_creal(exp)*lv_creal(exp) + lv_cimag(exp)*lv_cimag(exp), power/2);
+ return mag*lv_cmake(cosf(arg), sinf(arg));
+}
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+static inline void
+volk_32fc_s32f_power_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ const float power, unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 vPower = _mm_set_ps1(power);
+
+ __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue;
+ for(;number < quarterPoints; number++){
+
+ cplxValue1 = _mm_load_ps((float*)aPtr);
+ aPtr += 2;
+
+ cplxValue2 = _mm_load_ps((float*)aPtr);
+ aPtr += 2;
+
+ // Convert to polar coordinates
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+ // Arrange in q1q2q3q4 format
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+ phase = atan2f4(qValue, iValue); // Calculate the Phase
+
+ magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iValue, iValue), _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square rooting the added I2 and Q2 values
+
+ // Now calculate the power of the polar coordinate data
+ magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power
+
+ phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power
+
+ // Convert back to cartesian coordinates
+ iValue = _mm_mul_ps( cosf4(phase), magnitude); // Multiply the cos of the phase by the magnitude
+ qValue = _mm_mul_ps( sinf4(phase), magnitude); // Multiply the sin of the phase by the magnitude
+
+ cplxValue1 = _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values
+ cplxValue2 = _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values
+
+ _mm_store_ps((float*)cPtr,cplxValue1); // Store the results back into the C container
+
+ cPtr += 2;
+
+ _mm_store_ps((float*)cPtr,cplxValue2); // Store the results back into the C container
+
+ cPtr += 2;
+ }
+
+ number = quarterPoints * 4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+ for(;number < num_points; number++){
+ *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32fc_s32f_power_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ const float power, unsigned int num_points)
+{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power);
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_s32f_power_32fc_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32fc_s32f_power_spectrum_32f
+ *
+ * \b Overview
+ *
+ * Calculates the log10 power value for each input point.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_s32f_power_spectrum_32f(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li complexFFTInput: The complex output data from the FFT.
+ * \li normalizationFactor: Each input value is divided by this factor before the power is calculated.
+ * \li num_points: The number of FFT data points.
+ *
+ * \b Outputs
+ * \li logPowerOutput: The 10.0 * log10(r*r + i*i) for each data point.
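+ *
+ * For each point, the result is 10 * log10f((re*re + im*im) + 1e-20), where re and im
+ * have first been divided by normalizationFactor (as in the generic kernel below).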
+ *
+ * \b Example
+ * Compute the log power of N complex samples with unit normalization.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * in[ii] = lv_cmake((float)(ii + 1), 0.f);
+ * }
+ *
+ * volk_32fc_s32f_power_spectrum_32f(out, in, 1.f, N);
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_s32f_power_spectrum_32f_a_H
+#define INCLUDED_volk_32fc_s32f_power_spectrum_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+static inline void
+volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput,
+ const float normalizationFactor, unsigned int num_points)
+{
+ const float* inputPtr = (const float*)complexFFTInput;
+ float* destPtr = logPowerOutput;
+ uint64_t number = 0;
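+  // multiply by the reciprocal of the normalization factor instead of dividing every sample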
+ const float iNormalizationFactor = 1.0 / normalizationFactor;
+#ifdef LV_HAVE_LIB_SIMDMATH
+ __m128 magScalar = _mm_set_ps1(10.0);
+ magScalar = _mm_div_ps(magScalar, logf4(magScalar));
+
+ __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor);
+
+ __m128 power;
+ __m128 input1, input2;
+ const uint64_t quarterPoints = num_points / 4;
+ for(;number < quarterPoints; number++){
+ // Load the complex values
+ input1 =_mm_load_ps(inputPtr);
+ inputPtr += 4;
+ input2 =_mm_load_ps(inputPtr);
+ inputPtr += 4;
+
+ // Apply the normalization factor
+ input1 = _mm_mul_ps(input1, invNormalizationFactor);
+ input2 = _mm_mul_ps(input2, invNormalizationFactor);
+
+ // Multiply each value by itself
+ // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
+ input1 = _mm_mul_ps(input1, input1);
+ // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
+ input2 = _mm_mul_ps(input2, input2);
+
+ // Horizontal add, to add (r*r) + (i*i) for each complex value
+ // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
+ power = _mm_hadd_ps(input1, input2);
+
+ // Calculate the natural log power
+ power = logf4(power);
+
+ // Convert to log10 and multiply by 10.0
+ power = _mm_mul_ps(power, magScalar);
+
+ // Store the floating point results
+ _mm_store_ps(destPtr, power);
+
+ destPtr += 4;
+ }
+
+ number = quarterPoints*4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+  // Calculate the log power for any remaining points
+
+ for(; number < num_points; number++){
+ // Calculate dBm
+ // 50 ohm load assumption
+ // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+ // 75 ohm load assumption
+ // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+
+ const float real = *inputPtr++ * iNormalizationFactor;
+ const float imag = *inputPtr++ * iNormalizationFactor;
+
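+    // add a tiny offset so log10f never sees exactly zero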
+ *destPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20);
+
+ destPtr++;
+ }
+
+}
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32fc_s32f_power_spectrum_32f_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput,
+ const float normalizationFactor, unsigned int num_points)
+{
+ // Calculate the Power of the complex point
+  const float* inputPtr = (const float*)complexFFTInput;
+ float* realFFTDataPointsPtr = logPowerOutput;
+ const float iNormalizationFactor = 1.0 / normalizationFactor;
+ unsigned int point;
+ for(point = 0; point < num_points; point++){
+ // Calculate dBm
+ // 50 ohm load assumption
+ // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+ // 75 ohm load assumption
+ // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+
+ const float real = *inputPtr++ * iNormalizationFactor;
+ const float imag = *inputPtr++ * iNormalizationFactor;
+
+ *realFFTDataPointsPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20);
+ realFFTDataPointsPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_s32f_power_spectrum_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32fc_s32f_x2_power_spectral_density_32f
+ *
+ * \b Overview
+ *
+ * Calculates the log10 power value for each input point, with the power first divided by the RBW.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_s32f_x2_power_spectral_density_32f(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li complexFFTInput: The complex output of the FFT.
+ * \li normalizationFactor: Each input value is divided by this factor before the power is calculated.
+ * \li rbw: The resolution bandwidth of the FFT spectrum.
+ * \li num_points: The number of FFT data points.
+ *
+ * \b Outputs
+ * \li logPowerOutput: The 10.0 * log10((r*r + i*i)/RBW) for each data point.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* fft_out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * float* psd = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * // ... fill fft_out with FFT results (an rbw of 1.0 is used below) ...
+ *
+ * volk_32fc_s32f_x2_power_spectral_density_32f(psd, fft_out, (float)N, 1.0f, N);
+ *
+ * volk_free(fft_out);
+ * volk_free(psd);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a_H
+#define INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+static inline void
+volk_32fc_s32f_x2_power_spectral_density_32f_a_avx(float* logPowerOutput, const lv_32fc_t* complexFFTInput,
+ const float normalizationFactor, const float rbw,
+ unsigned int num_points)
+{
+ const float* inputPtr = (const float*)complexFFTInput;
+ float* destPtr = logPowerOutput;
+ uint64_t number = 0;
+ const float iRBW = 1.0 / rbw;
+ const float iNormalizationFactor = 1.0 / normalizationFactor;
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+ __m256 magScalar = _mm256_set1_ps(10.0);
+ magScalar = _mm256_div_ps(magScalar, logf4(magScalar));
+
+ __m256 invRBW = _mm256_set1_ps(iRBW);
+
+ __m256 invNormalizationFactor = _mm256_set1_ps(iNormalizationFactor);
+
+ __m256 power;
+  __m256 input1, input2, inputVal1, inputVal2;
+ const uint64_t eighthPoints = num_points / 8;
+ for(;number < eighthPoints; number++){
+ // Load the complex values
+ input1 =_mm256_load_ps(inputPtr);
+ inputPtr += 8;
+ input2 =_mm256_load_ps(inputPtr);
+ inputPtr += 8;
+
+ // Apply the normalization factor
+ input1 = _mm256_mul_ps(input1, invNormalizationFactor);
+ input2 = _mm256_mul_ps(input2, invNormalizationFactor);
+
+ // Multiply each value by itself
+ // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
+ input1 = _mm256_mul_ps(input1, input1);
+ // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
+ input2 = _mm256_mul_ps(input2, input2);
+
+ // Horizontal add, to add (r*r) + (i*i) for each complex value
+ // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
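+    // _mm256_hadd_ps adds within 128-bit lanes, so regroup the low and high halves
+    // of input1/input2 first to keep the summed results in sample order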
+ inputVal1 = _mm256_permute2f128_ps(input1, input2, 0x20);
+ inputVal2 = _mm256_permute2f128_ps(input1, input2, 0x31);
+
+ power = _mm256_hadd_ps(inputVal1, inputVal2);
+
+ // Divide by the rbw
+ power = _mm256_mul_ps(power, invRBW);
+
+ // Calculate the natural log power
+ power = logf4(power);
+
+ // Convert to log10 and multiply by 10.0
+ power = _mm256_mul_ps(power, magScalar);
+
+ // Store the floating point results
+ _mm256_store_ps(destPtr, power);
+
+ destPtr += 8;
+ }
+
+ number = eighthPoints*8;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+  // Calculate the log power for any remaining points
+ for(; number < num_points; number++){
+ // Calculate dBm
+ // 50 ohm load assumption
+ // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+ // 75 ohm load assumption
+ // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+
+ const float real = *inputPtr++ * iNormalizationFactor;
+ const float imag = *inputPtr++ * iNormalizationFactor;
+
+ *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW);
+ destPtr++;
+ }
+
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+static inline void
+volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput,
+ const float normalizationFactor, const float rbw,
+ unsigned int num_points)
+{
+ const float* inputPtr = (const float*)complexFFTInput;
+ float* destPtr = logPowerOutput;
+ uint64_t number = 0;
+ const float iRBW = 1.0 / rbw;
+ const float iNormalizationFactor = 1.0 / normalizationFactor;
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+ __m128 magScalar = _mm_set_ps1(10.0);
+ magScalar = _mm_div_ps(magScalar, logf4(magScalar));
+
+ __m128 invRBW = _mm_set_ps1(iRBW);
+
+ __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor);
+
+ __m128 power;
+ __m128 input1, input2;
+ const uint64_t quarterPoints = num_points / 4;
+ for(;number < quarterPoints; number++){
+ // Load the complex values
+ input1 =_mm_load_ps(inputPtr);
+ inputPtr += 4;
+ input2 =_mm_load_ps(inputPtr);
+ inputPtr += 4;
+
+ // Apply the normalization factor
+ input1 = _mm_mul_ps(input1, invNormalizationFactor);
+ input2 = _mm_mul_ps(input2, invNormalizationFactor);
+
+ // Multiply each value by itself
+ // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
+ input1 = _mm_mul_ps(input1, input1);
+ // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
+ input2 = _mm_mul_ps(input2, input2);
+
+ // Horizontal add, to add (r*r) + (i*i) for each complex value
+ // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
+ power = _mm_hadd_ps(input1, input2);
+
+ // Divide by the rbw
+ power = _mm_mul_ps(power, invRBW);
+
+ // Calculate the natural log power
+ power = logf4(power);
+
+ // Convert to log10 and multiply by 10.0
+ power = _mm_mul_ps(power, magScalar);
+
+ // Store the floating point results
+ _mm_store_ps(destPtr, power);
+
+ destPtr += 4;
+ }
+
+ number = quarterPoints*4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+  // Calculate the log power for any remaining points
+ for(; number < num_points; number++){
+ // Calculate dBm
+ // 50 ohm load assumption
+ // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+ // 75 ohm load assumption
+ // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+
+ const float real = *inputPtr++ * iNormalizationFactor;
+ const float imag = *inputPtr++ * iNormalizationFactor;
+
+ *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW);
+ destPtr++;
+ }
+
+}
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32fc_s32f_x2_power_spectral_density_32f_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput,
+ const float normalizationFactor, const float rbw,
+ unsigned int num_points)
+{
+ // Calculate the Power of the complex point
+  const float* inputPtr = (const float*)complexFFTInput;
+ float* realFFTDataPointsPtr = logPowerOutput;
+ unsigned int point;
+ const float invRBW = 1.0 / rbw;
+ const float iNormalizationFactor = 1.0 / normalizationFactor;
+
+ for(point = 0; point < num_points; point++){
+ // Calculate dBm
+ // 50 ohm load assumption
+ // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+ // 75 ohm load assumption
+ // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+
+ const float real = *inputPtr++ * iNormalizationFactor;
+ const float imag = *inputPtr++ * iNormalizationFactor;
+
+ *realFFTDataPointsPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW);
+
+ realFFTDataPointsPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32fc_s32fc_multiply_32fc
+ *
+ * \b Overview
+ *
+ * Multiplies the input complex vector by a complex scalar and returns
+ * the results.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_s32fc_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points);
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector to be multiplied.
+ * \li scalar: The complex scalar to multiply against aVector.
+ * \li num_points: The number of complex values in aVector.
+ *
+ * \b Outputs
+ * \li cVector: The vector where the results will be stored.
+ *
+ * \b Example
+ * Generate points around the unit circle and shift the phase pi/3 rad.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t scalar = lv_cmake((float)std::cos(M_PI/3.f), (float)std::sin(M_PI/3.f));
+ *
+ * float delta = 2.f*M_PI / (float)N;
+ * for(unsigned int ii = 0; ii < N/2; ++ii){
+ * // Generate points around the unit circle
+ * float real = std::cos(delta * (float)ii);
+ * float imag = std::sin(delta * (float)ii);
+ * in[ii] = lv_cmake(real, imag);
+ * in[ii+N/2] = lv_cmake(-real, -imag);
+ * }
+ *
+ * volk_32fc_s32fc_multiply_32fc(out, in, scalar, N);
+ *
+ * printf(" mag phase | mag phase\n");
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("%+1.2f %+1.2f | %+1.2f %+1.2f\n",
+ * std::abs(in[ii]), std::arg(in[ii]),
+ * std::abs(out[ii]), std::arg(out[ii]));
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
+#define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+ unsigned int number = 0;
+ unsigned int i = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ unsigned int isodd = num_points & 3;
+ __m256 x, yl, yh, z, tmp1, tmp2;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ // Set up constant scalar vector
+ yl = _mm256_set1_ps(lv_creal(scalar));
+ yh = _mm256_set1_ps(lv_cimag(scalar));
+
+ for(;number < quarterPoints; number++){
+ x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+
+ tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ _mm256_storeu_ps((float*)c,z); // Store the results back into the C container
+
+ a += 4;
+ c += 4;
+ }
+
+ for(i = num_points-isodd; i < num_points; i++) {
+ *c++ = (*a++) * scalar;
+ }
+
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ __m128 x, yl, yh, z, tmp1, tmp2;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ // Set up constant scalar vector
+ yl = _mm_set_ps1(lv_creal(scalar));
+ yh = _mm_set_ps1(lv_cimag(scalar));
+
+ for(;number < halfPoints; number++){
+
+ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+
+ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ _mm_storeu_ps((float*)c,z); // Store the results back into the C container
+
+ a += 2;
+ c += 2;
+ }
+
+ if((num_points % 2) != 0) {
+ *c = (*a) * scalar;
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ unsigned int number = num_points;
+
+  // unrolled loop: process 8 values per iteration
+ while (number >= 8){
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ number -= 8;
+ }
+
+ // clean up any remaining
+ while (number-- > 0)
+ *cPtr++ = *aPtr++ * scalar;
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H */
+#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
+#define INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+ unsigned int number = 0;
+ unsigned int i = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ unsigned int isodd = num_points & 3;
+ __m256 x, yl, yh, z, tmp1, tmp2;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ // Set up constant scalar vector
+ yl = _mm256_set1_ps(lv_creal(scalar));
+ yh = _mm256_set1_ps(lv_cimag(scalar));
+
+ for(;number < quarterPoints; number++){
+ x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+
+ tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ _mm256_store_ps((float*)c,z); // Store the results back into the C container
+
+ a += 4;
+ c += 4;
+ }
+
+ for(i = num_points-isodd; i < num_points; i++) {
+ *c++ = (*a++) * scalar;
+ }
+
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ __m128 x, yl, yh, z, tmp1, tmp2;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ // Set up constant scalar vector
+ yl = _mm_set_ps1(lv_creal(scalar));
+ yh = _mm_set_ps1(lv_cimag(scalar));
+
+ for(;number < halfPoints; number++){
+
+ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+
+ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ _mm_store_ps((float*)c,z); // Store the results back into the C container
+
+ a += 2;
+ c += 2;
+ }
+
+ if((num_points % 2) != 0) {
+ *c = (*a) * scalar;
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ unsigned int number = num_points;
+ unsigned int quarter_points = num_points / 4;
+
+ float32x4x2_t a_val, scalar_val;
+ float32x4x2_t tmp_imag;
+
+ scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar);
+ scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1);
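+  // scalar_val.val[0] holds the scalar's real part in every lane, val[1] its imaginary part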
+ for(number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)aPtr);
+ tmp_imag.val[1] = vmulq_f32(a_val.val[1], scalar_val.val[0]);
+ tmp_imag.val[0] = vmulq_f32(a_val.val[0], scalar_val.val[0]);
+
+ tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], scalar_val.val[1]);
+ tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], scalar_val.val[1]);
+
+    vst2q_f32((float*)cPtr, tmp_imag);
+    aPtr += 4;
+    cPtr += 4;
+  }
+
+  for(number = quarter_points*4; number < num_points; number++){
+    *cPtr++ = *aPtr++ * scalar;
+  }
+}
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ unsigned int number = num_points;
+
+  // unrolled loop: process 8 values per iteration
+ while (number >= 8){
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ number -= 8;
+ }
+
+ // clean up any remaining
+ while (number-- > 0)
+ *cPtr++ = *aPtr++ * scalar;
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+
+#ifndef INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H
+#define INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H
+
+
+#include <volk/volk_complex.h>
+#include <stdio.h>
+#include <volk/volk_32fc_s32fc_x2_rotator_32fc.h>
+
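+// Each puppet variant below seeds the rotator with a fixed, approximately unit-magnitude
+// initial phase and forwards the remaining arguments to the corresponding rotator kernel.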
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
+ lv_32fc_t phase[1] = {lv_cmake(.3, 0.95393)};
+ volk_32fc_s32fc_x2_rotator_32fc_generic(outVector, inVector, phase_inc, phase, num_points);
+
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
+ lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
+ volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(outVector, inVector, phase_inc, phase, num_points);
+
+}
+
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
+ lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
+ volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(outVector, inVector, phase_inc, phase, num_points);
+
+}
+
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
+ lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
+ volk_32fc_s32fc_x2_rotator_32fc_a_avx(outVector, inVector, phase_inc, phase, num_points);
+}
+
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
+ lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
+ volk_32fc_s32fc_x2_rotator_32fc_u_avx(outVector, inVector, phase_inc, phase, num_points);
+}
+
+#endif /* LV_HAVE_AVX */
+
+#endif /* INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32fc_s32fc_x2_rotator_32fc
+ *
+ * \b Overview
+ *
+ * Rotates the input vector at a fixed rate per sample, starting from the
+ * given initial phase offset.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_s32fc_x2_rotator_32fc(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li inVector: Vector to be rotated.
+ * \li phase_inc: The rotational velocity, i.e. the complex phase increment applied per sample.
+ * \li phase: The initial phase offset.
+ * \li num_points: The number of values in inVector to be rotated and stored into outVector.
+ *
+ * \b Outputs
+ * \li outVector: The vector where the results will be stored.
+ *
+ * \b Example
+ * Generate a tone at f=0.3 (normalized frequency) and use the rotator with
+ * f=0.1 to shift the tone to f=0.4. Change this example to start with a DC
+ * tone (initialize in with lv_cmake(1, 0)) to observe rotator signal generation.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * // Generate a tone at f=0.3
+ * float real = std::cos(0.3f * (float)ii);
+ * float imag = std::sin(0.3f * (float)ii);
+ * in[ii] = lv_cmake(real, imag);
+ * }
+ * // The oscillator rotates at f=0.1
+ * float frequency = 0.1f;
+ * lv_32fc_t phase_increment = lv_cmake(std::cos(frequency), std::sin(frequency));
+ * lv_32fc_t phase= lv_cmake(1.f, 0.0f); // start at 1 (0 rad phase)
+ *
+ * // rotate so the output is a tone at f=0.4
+ * volk_32fc_s32fc_x2_rotator_32fc(out, in, phase_increment, &phase, N);
+ *
+ * // print results for inspection
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %+1.2f %+1.2fj\n",
+ * ii, lv_creal(out[ii]), lv_cimag(out[ii]));
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
+#define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
+
+
+#include <volk/volk_complex.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
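+// Renormalize the accumulated phase after every ROTATOR_RELOAD samples (per lane) so that
+// floating-point rounding error does not let its magnitude drift away from 1.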
+#define ROTATOR_RELOAD 512
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+ unsigned int i = 0;
+ int j = 0;
+ for(i = 0; i < (unsigned int)(num_points/ROTATOR_RELOAD); ++i) {
+ for(j = 0; j < ROTATOR_RELOAD; ++j) {
+ *outVector++ = *inVector++ * (*phase);
+ (*phase) *= phase_inc;
+ }
+#ifdef __cplusplus
+ (*phase) /= std::abs((*phase));
+#else
+ //(*phase) /= cabsf((*phase));
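+        // hypotf(re, im) gives the magnitude without relying on the C99 complex cabsf()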
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+#endif
+ }
+ for(i = 0; i < num_points%ROTATOR_RELOAD; ++i) {
+ *outVector++ = *inVector++ * (*phase);
+ (*phase) *= phase_inc;
+ }
+
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+ lv_32fc_t* cPtr = outVector;
+ const lv_32fc_t* aPtr = inVector;
+ lv_32fc_t incr = 1;
+ lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)};
+
+ unsigned int i, j = 0;
+
+ for(i = 0; i < 2; ++i) {
+ phase_Ptr[i] *= incr;
+ incr *= (phase_inc);
+ }
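+    // phase_Ptr now holds {phase, phase * phase_inc}; incr ends up as phase_inc^2,
+    // the amount by which each lane's phase advances per 2-sample iteration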
+
+ /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0]));
+ printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1]));
+ printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
+ __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
+
+ phase_Val = _mm_loadu_ps((float*)phase_Ptr);
+ inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
+
+ const unsigned int halfPoints = num_points / 2;
+
+
+ for(i = 0; i < (unsigned int)(halfPoints/ROTATOR_RELOAD); i++) {
+ for(j = 0; j < ROTATOR_RELOAD; ++j) {
+
+ aVal = _mm_load_ps((float*)aPtr);
+
+ yl = _mm_moveldup_ps(phase_Val);
+ yh = _mm_movehdup_ps(phase_Val);
+ ylp = _mm_moveldup_ps(inc_Val);
+ yhp = _mm_movehdup_ps(inc_Val);
+
+ tmp1 = _mm_mul_ps(aVal, yl);
+ tmp1p = _mm_mul_ps(phase_Val, ylp);
+
+ aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
+ phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
+ tmp2 = _mm_mul_ps(aVal, yh);
+ tmp2p = _mm_mul_ps(phase_Val, yhp);
+
+ z = _mm_addsub_ps(tmp1, tmp2);
+ phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
+
+ _mm_store_ps((float*)cPtr, z);
+
+ aPtr += 2;
+ cPtr += 2;
+ }
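+    // after each ROTATOR_RELOAD block, scale both phase values back to unit magnitude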
+ tmp1 = _mm_mul_ps(phase_Val, phase_Val);
+ tmp2 = _mm_hadd_ps(tmp1, tmp1);
+ tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
+ tmp2 = _mm_sqrt_ps(tmp1);
+ phase_Val = _mm_div_ps(phase_Val, tmp2);
+ }
+ for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) {
+ aVal = _mm_load_ps((float*)aPtr);
+
+ yl = _mm_moveldup_ps(phase_Val);
+ yh = _mm_movehdup_ps(phase_Val);
+ ylp = _mm_moveldup_ps(inc_Val);
+ yhp = _mm_movehdup_ps(inc_Val);
+
+ tmp1 = _mm_mul_ps(aVal, yl);
+
+ tmp1p = _mm_mul_ps(phase_Val, ylp);
+
+ aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
+ phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
+ tmp2 = _mm_mul_ps(aVal, yh);
+ tmp2p = _mm_mul_ps(phase_Val, yhp);
+
+ z = _mm_addsub_ps(tmp1, tmp2);
+ phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
+
+ _mm_store_ps((float*)cPtr, z);
+
+ aPtr += 2;
+ cPtr += 2;
+ }
+ if (i) {
+ tmp1 = _mm_mul_ps(phase_Val, phase_Val);
+ tmp2 = _mm_hadd_ps(tmp1, tmp1);
+ tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
+ tmp2 = _mm_sqrt_ps(tmp1);
+ phase_Val = _mm_div_ps(phase_Val, tmp2);
+ }
+
+ _mm_storeu_ps((float*)phase_Ptr, phase_Val);
+ for(i = 0; i < num_points%2; ++i) {
+ *cPtr++ = *aPtr++ * phase_Ptr[0];
+ phase_Ptr[0] *= (phase_inc);
+ }
+
+ (*phase) = phase_Ptr[0];
+
+}
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+ lv_32fc_t* cPtr = outVector;
+ const lv_32fc_t* aPtr = inVector;
+ lv_32fc_t incr = 1;
+ lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)};
+
+ unsigned int i, j = 0;
+
+ for(i = 0; i < 2; ++i) {
+ phase_Ptr[i] *= incr;
+ incr *= (phase_inc);
+ }
+
+ /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0]));
+ printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1]));
+ printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
+ __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
+
+ phase_Val = _mm_loadu_ps((float*)phase_Ptr);
+ inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
+
+ const unsigned int halfPoints = num_points / 2;
+
+
+ for(i = 0; i < (unsigned int)(halfPoints/ROTATOR_RELOAD); i++) {
+ for(j = 0; j < ROTATOR_RELOAD; ++j) {
+
+ aVal = _mm_loadu_ps((float*)aPtr);
+
+ yl = _mm_moveldup_ps(phase_Val);
+ yh = _mm_movehdup_ps(phase_Val);
+ ylp = _mm_moveldup_ps(inc_Val);
+ yhp = _mm_movehdup_ps(inc_Val);
+
+ tmp1 = _mm_mul_ps(aVal, yl);
+ tmp1p = _mm_mul_ps(phase_Val, ylp);
+
+ aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
+ phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
+ tmp2 = _mm_mul_ps(aVal, yh);
+ tmp2p = _mm_mul_ps(phase_Val, yhp);
+
+ z = _mm_addsub_ps(tmp1, tmp2);
+ phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
+
+ _mm_storeu_ps((float*)cPtr, z);
+
+ aPtr += 2;
+ cPtr += 2;
+ }
+ tmp1 = _mm_mul_ps(phase_Val, phase_Val);
+ tmp2 = _mm_hadd_ps(tmp1, tmp1);
+ tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
+ tmp2 = _mm_sqrt_ps(tmp1);
+ phase_Val = _mm_div_ps(phase_Val, tmp2);
+ }
+ for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) {
+ aVal = _mm_loadu_ps((float*)aPtr);
+
+ yl = _mm_moveldup_ps(phase_Val);
+ yh = _mm_movehdup_ps(phase_Val);
+ ylp = _mm_moveldup_ps(inc_Val);
+ yhp = _mm_movehdup_ps(inc_Val);
+
+ tmp1 = _mm_mul_ps(aVal, yl);
+
+ tmp1p = _mm_mul_ps(phase_Val, ylp);
+
+ aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
+ phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
+ tmp2 = _mm_mul_ps(aVal, yh);
+ tmp2p = _mm_mul_ps(phase_Val, yhp);
+
+ z = _mm_addsub_ps(tmp1, tmp2);
+ phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
+
+ _mm_storeu_ps((float*)cPtr, z);
+
+ aPtr += 2;
+ cPtr += 2;
+ }
+ if (i) {
+ tmp1 = _mm_mul_ps(phase_Val, phase_Val);
+ tmp2 = _mm_hadd_ps(tmp1, tmp1);
+ tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
+ tmp2 = _mm_sqrt_ps(tmp1);
+ phase_Val = _mm_div_ps(phase_Val, tmp2);
+ }
+
+ _mm_storeu_ps((float*)phase_Ptr, phase_Val);
+ for(i = 0; i < num_points%2; ++i) {
+ *cPtr++ = *aPtr++ * phase_Ptr[0];
+ phase_Ptr[0] *= (phase_inc);
+ }
+
+ (*phase) = phase_Ptr[0];
+
+}
+
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+ lv_32fc_t* cPtr = outVector;
+ const lv_32fc_t* aPtr = inVector;
+ lv_32fc_t incr = 1;
+ lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};
+
+ unsigned int i, j = 0;
+
+ for(i = 0; i < 4; ++i) {
+ phase_Ptr[i] *= incr;
+ incr *= (phase_inc);
+ }
+
+ /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0]));
+ printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1]));
+ printf("%f, %f\n", lv_creal(phase_Ptr[2]), lv_cimag(phase_Ptr[2]));
+ printf("%f, %f\n", lv_creal(phase_Ptr[3]), lv_cimag(phase_Ptr[3]));
+ printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
+ __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
+
+ phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
+ inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
+ const unsigned int fourthPoints = num_points / 4;
+
+
+ for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) {
+ for(j = 0; j < ROTATOR_RELOAD; ++j) {
+
+ aVal = _mm256_load_ps((float*)aPtr);
+
+ yl = _mm256_moveldup_ps(phase_Val);
+ yh = _mm256_movehdup_ps(phase_Val);
+ ylp = _mm256_moveldup_ps(inc_Val);
+ yhp = _mm256_movehdup_ps(inc_Val);
+
+ tmp1 = _mm256_mul_ps(aVal, yl);
+ tmp1p = _mm256_mul_ps(phase_Val, ylp);
+
+ aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
+ phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
+ tmp2 = _mm256_mul_ps(aVal, yh);
+ tmp2p = _mm256_mul_ps(phase_Val, yhp);
+
+ z = _mm256_addsub_ps(tmp1, tmp2);
+ phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);
+
+ _mm256_store_ps((float*)cPtr, z);
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+ tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
+ tmp2 = _mm256_hadd_ps(tmp1, tmp1);
+ tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
+ tmp2 = _mm256_sqrt_ps(tmp1);
+ phase_Val = _mm256_div_ps(phase_Val, tmp2);
+ }
+ for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
+ aVal = _mm256_load_ps((float*)aPtr);
+
+ yl = _mm256_moveldup_ps(phase_Val);
+ yh = _mm256_movehdup_ps(phase_Val);
+ ylp = _mm256_moveldup_ps(inc_Val);
+ yhp = _mm256_movehdup_ps(inc_Val);
+
+ tmp1 = _mm256_mul_ps(aVal, yl);
+
+ tmp1p = _mm256_mul_ps(phase_Val, ylp);
+
+ aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
+ phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
+ tmp2 = _mm256_mul_ps(aVal, yh);
+ tmp2p = _mm256_mul_ps(phase_Val, yhp);
+
+ z = _mm256_addsub_ps(tmp1, tmp2);
+ phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);
+
+ _mm256_store_ps((float*)cPtr, z);
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+ if (i) {
+ tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
+ tmp2 = _mm256_hadd_ps(tmp1, tmp1);
+ tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
+ tmp2 = _mm256_sqrt_ps(tmp1);
+ phase_Val = _mm256_div_ps(phase_Val, tmp2);
+ }
+
+ _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
+ for(i = 0; i < num_points%4; ++i) {
+ *cPtr++ = *aPtr++ * phase_Ptr[0];
+ phase_Ptr[0] *= (phase_inc);
+ }
+
+ (*phase) = phase_Ptr[0];
+
+}
+
+#endif /* LV_HAVE_AVX for aligned */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+ lv_32fc_t* cPtr = outVector;
+ const lv_32fc_t* aPtr = inVector;
+ lv_32fc_t incr = 1;
+ lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};
+
+ unsigned int i, j = 0;
+
+ for(i = 0; i < 4; ++i) {
+ phase_Ptr[i] *= incr;
+ incr *= (phase_inc);
+ }
+
+ /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0]));
+ printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1]));
+ printf("%f, %f\n", lv_creal(phase_Ptr[2]), lv_cimag(phase_Ptr[2]));
+ printf("%f, %f\n", lv_creal(phase_Ptr[3]), lv_cimag(phase_Ptr[3]));
+ printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
+ __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
+
+ phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
+ inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
+ const unsigned int fourthPoints = num_points / 4;
+
+
+ for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) {
+ for(j = 0; j < ROTATOR_RELOAD; ++j) {
+
+ aVal = _mm256_loadu_ps((float*)aPtr);
+
+ yl = _mm256_moveldup_ps(phase_Val);
+ yh = _mm256_movehdup_ps(phase_Val);
+ ylp = _mm256_moveldup_ps(inc_Val);
+ yhp = _mm256_movehdup_ps(inc_Val);
+
+ tmp1 = _mm256_mul_ps(aVal, yl);
+ tmp1p = _mm256_mul_ps(phase_Val, ylp);
+
+ aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
+ phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
+ tmp2 = _mm256_mul_ps(aVal, yh);
+ tmp2p = _mm256_mul_ps(phase_Val, yhp);
+
+ z = _mm256_addsub_ps(tmp1, tmp2);
+ phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);
+
+ _mm256_storeu_ps((float*)cPtr, z);
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+ tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
+ tmp2 = _mm256_hadd_ps(tmp1, tmp1);
+ tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
+ tmp2 = _mm256_sqrt_ps(tmp1);
+ phase_Val = _mm256_div_ps(phase_Val, tmp2);
+ }
+ for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
+ aVal = _mm256_loadu_ps((float*)aPtr);
+
+ yl = _mm256_moveldup_ps(phase_Val);
+ yh = _mm256_movehdup_ps(phase_Val);
+ ylp = _mm256_moveldup_ps(inc_Val);
+ yhp = _mm256_movehdup_ps(inc_Val);
+
+ tmp1 = _mm256_mul_ps(aVal, yl);
+
+ tmp1p = _mm256_mul_ps(phase_Val, ylp);
+
+ aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
+ phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
+ tmp2 = _mm256_mul_ps(aVal, yh);
+ tmp2p = _mm256_mul_ps(phase_Val, yhp);
+
+ z = _mm256_addsub_ps(tmp1, tmp2);
+ phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);
+
+ _mm256_storeu_ps((float*)cPtr, z);
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+ if (i) {
+ tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
+ tmp2 = _mm256_hadd_ps(tmp1, tmp1);
+ tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
+ tmp2 = _mm256_sqrt_ps(tmp1);
+ phase_Val = _mm256_div_ps(phase_Val, tmp2);
+ }
+
+ _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
+ for(i = 0; i < num_points%4; ++i) {
+ *cPtr++ = *aPtr++ * phase_Ptr[0];
+ phase_Ptr[0] *= (phase_inc);
+ }
+
+ (*phase) = phase_Ptr[0];
+
+}
+
+#endif /* LV_HAVE_AVX */
+
+#endif /* INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32fc_x2_conjugate_dot_prod_32fc
+ *
+ * \b Overview
+ *
+ * This block computes the conjugate dot product (or inner product)
+ * between two vectors, the \p input and \p taps vectors. Given a set
+ * of \p num_points taps, the result is the sum of products between
+ * the input vector and the conjugate of the taps. The result is a
+ * single value stored in the \p result address and is returned as a
+ * complex float.
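+ * In other words, result = sum over n of input[n] * conj(taps[n]).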
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_x2_conjugate_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li input: vector of complex floats.
+ * \li taps: complex float taps.
+ * \li num_points: number of samples in both \p input and \p taps.
+ *
+ * \b Outputs
+ * \li result: pointer to a complex float value to hold the dot product result.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* input = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t* taps = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t result = lv_cmake(0.f, 0.f);
+ * // ... fill input and taps ...
+ *
+ * volk_32fc_x2_conjugate_dot_prod_32fc(&result, input, taps, N);
+ *
+ * volk_free(input);
+ * volk_free(taps);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
+#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
+
+
+#include<volk/volk_complex.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ const unsigned int num_bytes = num_points*8;
+
+ float * res = (float*) result;
+  const float * in = (const float*) input;
+  const float * tp = (const float*) taps;
+ unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
+ unsigned int isodd = (num_bytes >> 3) &1;
+
+
+
+ float sum0[2] = {0,0};
+ float sum1[2] = {0,0};
+ unsigned int i = 0;
+
+
+ for(i = 0; i < n_2_ccomplex_blocks; ++i) {
+
+ sum0[0] += in[0] * tp[0] + in[1] * tp[1];
+ sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
+ sum1[0] += in[2] * tp[2] + in[3] * tp[3];
+ sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
+
+
+ in += 4;
+ tp += 4;
+
+ }
+
+
+ res[0] = sum0[0] + sum1[0];
+ res[1] = sum0[1] + sum1[1];
+
+
+
+ for(i = 0; i < isodd; ++i) {
+
+
+ *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
+
+ }
+ /*
+ for(i = 0; i < num_bytes >> 3; ++i) {
+ *result += input[i] * conjf(taps[i]);
+ }
+ */
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#ifdef LV_HAVE_SSE3
+
+#include <xmmintrin.h>
+#include <pmmintrin.h>
+#include <mmintrin.h>
+
+
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ unsigned int num_bytes = num_points*8;
+
+ // Variable never used?
+ //__VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
+
+ union HalfMask {
+ uint32_t intRep[4];
+ __m128 vec;
+ } halfMask;
+
+ union NegMask {
+ int intRep[4];
+ __m128 vec;
+ } negMask;
+
+ unsigned int offset = 0;
+ float Rsum=0, Isum=0;
+ float Im,Re;
+
+ __m128 in1, in2, Rv, fehg, Iv, Rs, Ivm, Is;
+ __m128 zv = {0,0,0,0};
+
+ halfMask.intRep[0] = halfMask.intRep[1] = 0xFFFFFFFF;
+ halfMask.intRep[2] = halfMask.intRep[3] = 0x00000000;
+
+ negMask.intRep[0] = negMask.intRep[2] = 0x80000000;
+ negMask.intRep[1] = negMask.intRep[3] = 0;
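+  // halfMask keeps only the lower complex value of a vector (used for a trailing odd sample);
+  // negMask negates the re*im cross terms so the horizontal adds yield Im(input * conj(taps))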
+
+ // main loop
+ while(num_bytes >= 4*sizeof(float)){
+
+ in1 = _mm_loadu_ps( (float*) (input+offset) );
+ in2 = _mm_loadu_ps( (float*) (taps+offset) );
+ Rv = _mm_mul_ps(in1, in2);
+ fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1));
+ Iv = _mm_mul_ps(in1, fehg);
+ Rs = _mm_hadd_ps( _mm_hadd_ps(Rv, zv) ,zv);
+ Ivm = _mm_xor_ps( negMask.vec, Iv );
+ Is = _mm_hadd_ps( _mm_hadd_ps(Ivm, zv) ,zv);
+ _mm_store_ss( &Im, Is );
+ _mm_store_ss( &Re, Rs );
+ num_bytes -= 4*sizeof(float);
+ offset += 2;
+ Rsum += Re;
+ Isum += Im;
+ }
+
+ // handle the last complex case ...
+ if(num_bytes > 0){
+
+ if(num_bytes != 4){
+ // bad things are happening
+ }
+
+ in1 = _mm_loadu_ps( (float*) (input+offset) );
+ in2 = _mm_loadu_ps( (float*) (taps+offset) );
+ Rv = _mm_and_ps(_mm_mul_ps(in1, in2), halfMask.vec);
+ fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1));
+ Iv = _mm_and_ps(_mm_mul_ps(in1, fehg), halfMask.vec);
+ Rs = _mm_hadd_ps(_mm_hadd_ps(Rv, zv),zv);
+ Ivm = _mm_xor_ps( negMask.vec, Iv );
+ Is = _mm_hadd_ps(_mm_hadd_ps(Ivm, zv),zv);
+ _mm_store_ss( &Im, Is );
+ _mm_store_ss( &Re, Rs );
+ Rsum += Re;
+ Isum += Im;
+ }
+
+ result[0] = lv_cmake(Rsum,Isum);
+ return;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+
+ lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
+ lv_32fc_t* b_ptr = (lv_32fc_t*) input;
+  // in the de-interleaved float32x4x2_t values, val[0] holds the real parts
+  // and val[1] holds the imaginary parts
+ float32x4x2_t a_val, b_val, accumulator;
+ float32x4x2_t tmp_imag;
+ accumulator.val[0] = vdupq_n_f32(0);
+ accumulator.val[1] = vdupq_n_f32(0);
+
+ for(number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ __VOLK_PREFETCH(a_ptr+8);
+ __VOLK_PREFETCH(b_ptr+8);
+
+ // do the first multiply
+ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+ tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+
+ // use multiply accumulate/subtract to get result
+ tmp_imag.val[1] = vmlsq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
+ tmp_imag.val[0] = vmlaq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);
+
+ accumulator.val[0] = vaddq_f32(accumulator.val[0], tmp_imag.val[0]);
+ accumulator.val[1] = vaddq_f32(accumulator.val[1], tmp_imag.val[1]);
+
+ // increment pointers
+ a_ptr += 4;
+ b_ptr += 4;
+ }
+ lv_32fc_t accum_result[4];
+ vst2q_f32((float*)accum_result, accumulator);
+ *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
+
+ // tail case
+ for(number = quarter_points*4; number < num_points; ++number) {
+ *result += (*a_ptr++) * lv_conj(*b_ptr++);
+ }
+ *result = lv_conj(*result);
+
+}
+#endif /*LV_HAVE_NEON*/
+
+#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/
+
+#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
+#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
+
+#include <volk/volk_common.h>
+#include<volk/volk_complex.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ const unsigned int num_bytes = num_points*8;
+
+ float * res = (float*) result;
+  const float * in = (const float*) input;
+  const float * tp = (const float*) taps;
+ unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
+ unsigned int isodd = (num_bytes >> 3) &1;
+
+
+
+ float sum0[2] = {0,0};
+ float sum1[2] = {0,0};
+ unsigned int i = 0;
+
+
+ for(i = 0; i < n_2_ccomplex_blocks; ++i) {
+
+
+ sum0[0] += in[0] * tp[0] + in[1] * tp[1];
+ sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
+ sum1[0] += in[2] * tp[2] + in[3] * tp[3];
+ sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
+
+
+ in += 4;
+ tp += 4;
+
+ }
+
+
+ res[0] = sum0[0] + sum1[0];
+ res[1] = sum0[1] + sum1[1];
+
+
+
+ for(i = 0; i < isodd; ++i) {
+
+
+ *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
+
+ }
+ /*
+ for(i = 0; i < num_bytes >> 3; ++i) {
+ *result += input[i] * conjf(taps[i]);
+ }
+ */
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE && LV_HAVE_64
+
+
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ const unsigned int num_bytes = num_points*8;
+
+ __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
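+  // xor-ing a pair of complex floats with this mask flips the sign of their imaginary
+  // parts, i.e. conjugates them; the asm below applies it to the taps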
+
+
+
+
+ asm volatile
+ (
+ "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t"
+ "# const float *taps, unsigned num_bytes)\n\t"
+ "# float sum0 = 0;\n\t"
+ "# float sum1 = 0;\n\t"
+ "# float sum2 = 0;\n\t"
+ "# float sum3 = 0;\n\t"
+ "# do {\n\t"
+ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+ "# input += 4;\n\t"
+ "# taps += 4; \n\t"
+ "# } while (--n_2_ccomplex_blocks != 0);\n\t"
+ "# result[0] = sum0 + sum2;\n\t"
+ "# result[1] = sum1 + sum3;\n\t"
+ "# TODO: prefetch and better scheduling\n\t"
+ " xor %%r9, %%r9\n\t"
+ " xor %%r10, %%r10\n\t"
+ " movq %[conjugator], %%r9\n\t"
+ " movq %%rcx, %%rax\n\t"
+ " movaps 0(%%r9), %%xmm8\n\t"
+ " movq %%rcx, %%r8\n\t"
+ " movq %[rsi], %%r9\n\t"
+ " movq %[rdx], %%r10\n\t"
+ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+ " movaps 0(%%r9), %%xmm0\n\t"
+ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+ " movups 0(%%r10), %%xmm2\n\t"
+ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
+ " shr $4, %%r8\n\t"
+ " xorps %%xmm8, %%xmm2\n\t"
+ " jmp .%=L1_test\n\t"
+ " # 4 taps / loop\n\t"
+ " # something like ?? cycles / loop\n\t"
+ ".%=Loop1: \n\t"
+ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+ "# movaps (%%r9), %%xmmA\n\t"
+ "# movaps (%%r10), %%xmmB\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+ "# mulps %%xmmB, %%xmmA\n\t"
+ "# mulps %%xmmZ, %%xmmB\n\t"
+ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+ "# xorps %%xmmPN, %%xmmA\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# unpcklps %%xmmB, %%xmmA\n\t"
+ "# unpckhps %%xmmB, %%xmmZ\n\t"
+ "# movaps %%xmmZ, %%xmmY\n\t"
+ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+ "# addps %%xmmZ, %%xmmA\n\t"
+ "# addps %%xmmA, %%xmmC\n\t"
+ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+ " movaps 16(%%r9), %%xmm1\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " movaps 16(%%r10), %%xmm3\n\t"
+ " movaps %%xmm1, %%xmm5\n\t"
+ " xorps %%xmm8, %%xmm3\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm3, %%xmm1\n\t"
+ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+ " addps %%xmm1, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " movaps 32(%%r9), %%xmm0\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ " mulps %%xmm5, %%xmm3\n\t"
+ " add $32, %%r9\n\t"
+ " movaps 32(%%r10), %%xmm2\n\t"
+ " addps %%xmm3, %%xmm7\n\t"
+ " add $32, %%r10\n\t"
+ " xorps %%xmm8, %%xmm2\n\t"
+ ".%=L1_test:\n\t"
+ " dec %%rax\n\t"
+ " jge .%=Loop1\n\t"
+ " # We've handled the bulk of multiplies up to here.\n\t"
+    " # Let's see if original n_2_ccomplex_blocks was odd.\n\t"
+ " # If so, we've got 2 more taps to do.\n\t"
+ " and $1, %%r8\n\t"
+ " je .%=Leven\n\t"
+ " # The count was odd, do 2 more taps.\n\t"
+ " # Note that we've already got mm0/mm2 preloaded\n\t"
+ " # from the main loop.\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ ".%=Leven:\n\t"
+ " # neg inversor\n\t"
+ " xorps %%xmm1, %%xmm1\n\t"
+ " mov $0x80000000, %%r9\n\t"
+ " movd %%r9, %%xmm1\n\t"
+ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+ " # pfpnacc\n\t"
+ " xorps %%xmm1, %%xmm6\n\t"
+ " movaps %%xmm6, %%xmm2\n\t"
+ " unpcklps %%xmm7, %%xmm6\n\t"
+ " unpckhps %%xmm7, %%xmm2\n\t"
+ " movaps %%xmm2, %%xmm3\n\t"
+ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+ " addps %%xmm2, %%xmm6\n\t"
+ " # xmm6 = r1 i2 r3 i4\n\t"
+ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
+ :
+ :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator)
+ :"rax", "r8", "r9", "r10"
+ );
+
+
+ int getem = num_bytes % 16;
+
+
+ for(; getem > 0; getem -= 8) {
+
+
+ *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]));
+
+ }
+
+ return;
+}
+#endif
+
+#if LV_HAVE_SSE && LV_HAVE_32
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ const unsigned int num_bytes = num_points*8;
+
+ __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
+
+ int bound = num_bytes >> 4;
+ int leftovers = num_bytes % 16;
+
+
+ asm volatile
+ (
+ " #pushl %%ebp\n\t"
+ " #movl %%esp, %%ebp\n\t"
+ " #movl 12(%%ebp), %%eax # input\n\t"
+ " #movl 16(%%ebp), %%edx # taps\n\t"
+ " #movl 20(%%ebp), %%ecx # n_bytes\n\t"
+ " movaps 0(%[conjugator]), %%xmm1\n\t"
+ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+ " movaps 0(%[eax]), %%xmm0\n\t"
+ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+ " movaps 0(%[edx]), %%xmm2\n\t"
+ " movl %[ecx], (%[out])\n\t"
+ " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t"
+
+ " xorps %%xmm1, %%xmm2\n\t"
+ " jmp .%=L1_test\n\t"
+ " # 4 taps / loop\n\t"
+ " # something like ?? cycles / loop\n\t"
+ ".%=Loop1: \n\t"
+ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+ "# movaps (%[eax]), %%xmmA\n\t"
+ "# movaps (%[edx]), %%xmmB\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+ "# mulps %%xmmB, %%xmmA\n\t"
+ "# mulps %%xmmZ, %%xmmB\n\t"
+ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+ "# xorps %%xmmPN, %%xmmA\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# unpcklps %%xmmB, %%xmmA\n\t"
+ "# unpckhps %%xmmB, %%xmmZ\n\t"
+ "# movaps %%xmmZ, %%xmmY\n\t"
+ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+ "# addps %%xmmZ, %%xmmA\n\t"
+ "# addps %%xmmA, %%xmmC\n\t"
+ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+ " movaps 16(%[edx]), %%xmm3\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " xorps %%xmm1, %%xmm3\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " movaps 16(%[eax]), %%xmm1\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " movaps %%xmm1, %%xmm5\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm3, %%xmm1\n\t"
+ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+ " addps %%xmm1, %%xmm6\n\t"
+ " movaps 0(%[conjugator]), %%xmm1\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " movaps 32(%[eax]), %%xmm0\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ " mulps %%xmm5, %%xmm3\n\t"
+ " addl $32, %[eax]\n\t"
+ " movaps 32(%[edx]), %%xmm2\n\t"
+ " addps %%xmm3, %%xmm7\n\t"
+ " xorps %%xmm1, %%xmm2\n\t"
+ " addl $32, %[edx]\n\t"
+ ".%=L1_test:\n\t"
+ " decl %[ecx]\n\t"
+ " jge .%=Loop1\n\t"
+ " # We've handled the bulk of multiplies up to here.\n\t"
+    " # Let's see if original n_2_ccomplex_blocks was odd.\n\t"
+ " # If so, we've got 2 more taps to do.\n\t"
+ " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t"
+ " shrl $4, %[ecx]\n\t"
+ " andl $1, %[ecx]\n\t"
+ " je .%=Leven\n\t"
+ " # The count was odd, do 2 more taps.\n\t"
+ " # Note that we've already got mm0/mm2 preloaded\n\t"
+ " # from the main loop.\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ ".%=Leven:\n\t"
+ " # neg inversor\n\t"
+ " #movl 8(%%ebp), %[eax] \n\t"
+ " xorps %%xmm1, %%xmm1\n\t"
+ " movl $0x80000000, (%[out])\n\t"
+ " movss (%[out]), %%xmm1\n\t"
+ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+ " # pfpnacc\n\t"
+ " xorps %%xmm1, %%xmm6\n\t"
+ " movaps %%xmm6, %%xmm2\n\t"
+ " unpcklps %%xmm7, %%xmm6\n\t"
+ " unpckhps %%xmm7, %%xmm2\n\t"
+ " movaps %%xmm2, %%xmm3\n\t"
+ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+ " addps %%xmm2, %%xmm6\n\t"
+ " # xmm6 = r1 i2 r3 i4\n\t"
+ " #movl 8(%%ebp), %[eax] # @result\n\t"
+ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+ " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) to memory\n\t"
+ " #popl %%ebp\n\t"
+ :
+ : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator)
+ );
+
+
+
+
+ printf("%d, %d\n", leftovers, bound);
+
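+ // Scalar tail: when num_points is odd the asm leaves one complex sample; bound << 1 is its index.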
+ for(; leftovers > 0; leftovers -= 8) {
+
+
+ *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)]));
+
+ }
+
+ return;
+
+
+
+
+
+
+}
+
+#endif /*LV_HAVE_SSE && LV_HAVE_32*/
+
+
+
+#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2016 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32fc_x2_divide_32fc
+ *
+ * \b Overview
+ *
+ * Divides the first vector of complex values element-wise by the second.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_x2_divide_32fc(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector, const lv_32fc_t* denumeratorVector, unsigned int num_points);
+ * \endcode
+ *
+ * \b Inputs
+ * \li numeratorVector: The numerator (dividend) complex values.
+ * \li denumeratorVector: The denominator (divisor) complex values.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li cVector: The output vector of complex floats.
+ *
+ * \b Example
+ * Divide a complex vector by itself; every element of the result should be very close to 1+0j.
+ *
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* input_vector = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ *
+ * float delta = 2.f*M_PI / (float)N;
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * float real_1 = std::cos(0.3f * (float)ii);
+ * float imag_1 = std::sin(0.3f * (float)ii);
+ * input_vector[ii] = lv_cmake(real_1, imag_1);
+ * }
+ *
+ * volk_32fc_x2_divide_32fc(out, input_vector, input_vector, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("%1.4f%+1.4fj,", lv_creal(out[ii]), lv_cimag(out[ii]));
+ * }
+ * printf("\n");
+ *
+ * volk_free(input_vector);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H
+#define INCLUDED_volk_32fc_x2_divide_32fc_u_H
+
+#include <inttypes.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <volk/volk_sse3_intrinsics.h>
+
+static inline void
+volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
+ const lv_32fc_t* denumeratorVector, unsigned int num_points)
+{
+ /*
+ * we'll do the "classical"
+ * a a b*
+ * --- = -------
+ * b |b|^2
+ * */
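+ /* Scalar form of what each SIMD iteration computes, for reference:
+  *   *c = (*a) * lv_conj(*b) / (lv_creal(*b)*lv_creal(*b) + lv_cimag(*b)*lv_cimag(*b));
+  */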
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 num01, num23, den01, den23, norm, result;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = numeratorVector;
+ const lv_32fc_t* b = denumeratorVector;
+
+ for(; number < quarterPoints; number++){
+ num01 = _mm_loadu_ps((float*) a); // first pair
+ den01 = _mm_loadu_ps((float*) b); // first pair
+ num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
+ a += 2;
+ b += 2;
+
+ num23 = _mm_loadu_ps((float*) a); // second pair
+ den23 = _mm_loadu_ps((float*) b); // second pair
+ num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
+ a += 2;
+ b += 2;
+
+ norm = _mm_magnitudesquared_ps_sse3(den01, den23);
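+ // norm now holds |b0|^2,|b1|^2,|b2|^2,|b3|^2; duplicate each magnitude across the
+ // real/imag slots of its complex value before dividing: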
+ den01 = _mm_unpacklo_ps(norm,norm);
+ den23 = _mm_unpackhi_ps(norm,norm);
+
+ result = _mm_div_ps(num01, den01);
+ _mm_storeu_ps((float*) c, result); // Store the results back into the C container
+ c += 2;
+ result = _mm_div_ps(num23, den23);
+ _mm_storeu_ps((float*) c, result); // Store the results back into the C container
+ c += 2;
+ }
+
+ number *= 4;
+ for(;number < num_points; number++){
+ *c = (*a) / (*b);
+ a++; b++; c++;
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void
+volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
+ const lv_32fc_t* denumeratorVector, unsigned int num_points)
+{
+ /*
+ * we'll do the "classical"
+ * a a b*
+ * --- = -------
+ * b |b|^2
+ * */
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = numeratorVector;
+ const lv_32fc_t* b = denumeratorVector;
+
+ for(; number < quarterPoints; number++){
+ num = _mm256_loadu_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+ denum = _mm256_loadu_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+ mul_conj = _mm256_complexconjugatemul_ps(num, denum);
+ sq = _mm256_mul_ps(denum, denum); // Square the values
+ mag_sq_un = _mm256_hadd_ps(sq,sq); // sum each re^2/im^2 pair: squared magnitudes, per 128-bit lane and out of order
+ mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // reorder so each |b|^2 is duplicated next to its own complex value
+ // best guide I found on using these functions: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
+ div = _mm256_div_ps(mul_conj,mag_sq);
+
+ _mm256_storeu_ps((float*) c, div); // Store the results back into the C container
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+ number = quarterPoints * 4;
+
+ for(; number < num_points; number++){
+ *c++ = (*a++) / (*b++);
+ }
+
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector, unsigned int num_points)
+{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++) / (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+#endif /* INCLUDED_volk_32fc_x2_divide_32fc_u_H */
+
+
+#ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H
+#define INCLUDED_volk_32fc_x2_divide_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <volk/volk_sse3_intrinsics.h>
+
+static inline void
+volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
+ const lv_32fc_t* denumeratorVector, unsigned int num_points)
+{
+ /*
+ * we'll do the "classical"
+ * a a b*
+ * --- = -------
+ * b |b|^2
+ * */
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 num01, num23, den01, den23, norm, result;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = numeratorVector;
+ const lv_32fc_t* b = denumeratorVector;
+
+ for(; number < quarterPoints; number++){
+ num01 = _mm_load_ps((float*) a); // first pair
+ den01 = _mm_load_ps((float*) b); // first pair
+ num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
+ a += 2;
+ b += 2;
+
+ num23 = _mm_load_ps((float*) a); // second pair
+ den23 = _mm_load_ps((float*) b); // second pair
+ num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
+ a += 2;
+ b += 2;
+
+ norm = _mm_magnitudesquared_ps_sse3(den01, den23);
+
+ den01 = _mm_unpacklo_ps(norm,norm); // select the lower floats twice
+ den23 = _mm_unpackhi_ps(norm,norm); // select the upper floats twice
+
+ result = _mm_div_ps(num01, den01);
+ _mm_store_ps((float*) c, result); // Store the results back into the C container
+ c += 2;
+ result = _mm_div_ps(num23, den23);
+ _mm_store_ps((float*) c, result); // Store the results back into the C container
+ c += 2;
+ }
+
+ number *= 4;
+ for(;number < num_points; number++){
+ *c = (*a) / (*b);
+ a++; b++; c++;
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void
+volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
+ const lv_32fc_t* denumeratorVector, unsigned int num_points)
+{
+ /*
+ * we'll do the "classical"
+ * a a b*
+ * --- = -------
+ * b |b|^2
+ * */
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = numeratorVector;
+ const lv_32fc_t* b = denumeratorVector;
+
+ for(; number < quarterPoints; number++){
+ num = _mm256_load_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+ denum = _mm256_load_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+ mul_conj = _mm256_complexconjugatemul_ps(num, denum);
+ sq = _mm256_mul_ps(denum, denum); // Square the values
+ mag_sq_un = _mm256_hadd_ps(sq,sq); // sum each re^2/im^2 pair: squared magnitudes, per 128-bit lane and out of order
+ mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // reorder so each |b|^2 is duplicated next to its own complex value
+ // best guide I found on using these functions: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
+ div = _mm256_div_ps(mul_conj,mag_sq);
+
+ _mm256_store_ps((float*) c, div); // Store the results back into the C container
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+ number = quarterPoints * 4;
+
+ for(; number < num_points; number++){
+ *c++ = (*a++) / (*b++);
+ }
+
+
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32fc_x2_divide_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector, unsigned int num_points)
+{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++) / (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_x2_divide_32fc_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32fc_x2_dot_prod_32fc
+ *
+ * \b Overview
+ *
+ * This block computes the dot product (or inner product) between two
+ * vectors, the \p input and \p taps vectors. Given a set of \p
+ * num_points taps, the result is the sum of products between the two
+ * vectors. The result is a single value stored in the \p result
+ * address and is returned as a complex float.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_x2_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li input: vector of complex floats.
+ * \li taps: complex float taps.
+ * \li num_points: number of samples in both \p input and \p taps.
+ *
+ * \b Outputs
+ * \li result: pointer to a complex float value to hold the dot product result.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* input = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t* taps = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t result;
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ *     input[ii] = lv_cmake(std::cos(0.3f * (float)ii), std::sin(0.3f * (float)ii));
+ *     taps[ii] = lv_cmake(1.f / (float)N, 0.f);
+ * }
+ *
+ * volk_32fc_x2_dot_prod_32fc(&result, input, taps, N);
+ * printf("%1.4f%+1.4fj\n", lv_creal(result), lv_cimag(result));
+ * volk_free(input); volk_free(taps);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
+#define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
+
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
+#include <stdio.h>
+#include <string.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ float * res = (float*) result;
+ float * in = (float*) input;
+ float * tp = (float*) taps;
+ unsigned int n_2_ccomplex_blocks = num_points/2;
+ unsigned int isodd = num_points & 1;
+
+ float sum0[2] = {0,0};
+ float sum1[2] = {0,0};
+ unsigned int i = 0;
+
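+ // Unrolled by two: sum0 and sum1 are independent accumulators, letting the compiler
+ // overlap the floating-point adds across iterations.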
+ for(i = 0; i < n_2_ccomplex_blocks; ++i) {
+ sum0[0] += in[0] * tp[0] - in[1] * tp[1];
+ sum0[1] += in[0] * tp[1] + in[1] * tp[0];
+ sum1[0] += in[2] * tp[2] - in[3] * tp[3];
+ sum1[1] += in[2] * tp[3] + in[3] * tp[2];
+
+ in += 4;
+ tp += 4;
+ }
+
+ res[0] = sum0[0] + sum1[0];
+ res[1] = sum0[1] + sum1[1];
+
+ // Cleanup if we had an odd number of points
+ for(i = 0; i < isodd; ++i) {
+ *result += input[num_points - 1] * taps[num_points - 1];
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+
+#if LV_HAVE_SSE && LV_HAVE_64
+
+static inline void volk_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ const unsigned int num_bytes = num_points*8;
+ unsigned int isodd = num_points & 1;
+
+ asm
+ (
+ "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
+ "# const float *taps, unsigned num_bytes)\n\t"
+ "# float sum0 = 0;\n\t"
+ "# float sum1 = 0;\n\t"
+ "# float sum2 = 0;\n\t"
+ "# float sum3 = 0;\n\t"
+ "# do {\n\t"
+ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+ "# input += 4;\n\t"
+ "# taps += 4; \n\t"
+ "# } while (--n_2_ccomplex_blocks != 0);\n\t"
+ "# result[0] = sum0 + sum2;\n\t"
+ "# result[1] = sum1 + sum3;\n\t"
+ "# TODO: prefetch and better scheduling\n\t"
+ " xor %%r9, %%r9\n\t"
+ " xor %%r10, %%r10\n\t"
+ " movq %%rcx, %%rax\n\t"
+ " movq %%rcx, %%r8\n\t"
+ " movq %[rsi], %%r9\n\t"
+ " movq %[rdx], %%r10\n\t"
+ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+ " movups 0(%%r9), %%xmm0\n\t"
+ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+ " movups 0(%%r10), %%xmm2\n\t"
+ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
+ " shr $4, %%r8\n\t"
+ " jmp .%=L1_test\n\t"
+ " # 4 taps / loop\n\t"
+ " # something like ?? cycles / loop\n\t"
+ ".%=Loop1: \n\t"
+ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+ "# movups (%%r9), %%xmmA\n\t"
+ "# movups (%%r10), %%xmmB\n\t"
+ "# movups %%xmmA, %%xmmZ\n\t"
+ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+ "# mulps %%xmmB, %%xmmA\n\t"
+ "# mulps %%xmmZ, %%xmmB\n\t"
+ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+ "# xorps %%xmmPN, %%xmmA\n\t"
+ "# movups %%xmmA, %%xmmZ\n\t"
+ "# unpcklps %%xmmB, %%xmmA\n\t"
+ "# unpckhps %%xmmB, %%xmmZ\n\t"
+ "# movups %%xmmZ, %%xmmY\n\t"
+ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+ "# addps %%xmmZ, %%xmmA\n\t"
+ "# addps %%xmmA, %%xmmC\n\t"
+ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+ " movups 16(%%r9), %%xmm1\n\t"
+ " movups %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " movups 16(%%r10), %%xmm3\n\t"
+ " movups %%xmm1, %%xmm5\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm3, %%xmm1\n\t"
+ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+ " addps %%xmm1, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " movups 32(%%r9), %%xmm0\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ " mulps %%xmm5, %%xmm3\n\t"
+ " add $32, %%r9\n\t"
+ " movups 32(%%r10), %%xmm2\n\t"
+ " addps %%xmm3, %%xmm7\n\t"
+ " add $32, %%r10\n\t"
+ ".%=L1_test:\n\t"
+ " dec %%rax\n\t"
+ " jge .%=Loop1\n\t"
+ " # We've handled the bulk of multiplies up to here.\n\t"
+ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+ " # If so, we've got 2 more taps to do.\n\t"
+ " and $1, %%r8\n\t"
+ " je .%=Leven\n\t"
+ " # The count was odd, do 2 more taps.\n\t"
+ " # Note that we've already got mm0/mm2 preloaded\n\t"
+ " # from the main loop.\n\t"
+ " movups %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ ".%=Leven:\n\t"
+ " # neg inversor\n\t"
+ " xorps %%xmm1, %%xmm1\n\t"
+ " mov $0x80000000, %%r9\n\t"
+ " movd %%r9, %%xmm1\n\t"
+ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+ " # pfpnacc\n\t"
+ " xorps %%xmm1, %%xmm6\n\t"
+ " movups %%xmm6, %%xmm2\n\t"
+ " unpcklps %%xmm7, %%xmm6\n\t"
+ " unpckhps %%xmm7, %%xmm2\n\t"
+ " movups %%xmm2, %%xmm3\n\t"
+ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+ " addps %%xmm2, %%xmm6\n\t"
+ " # xmm6 = r1 i2 r3 i4\n\t"
+ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
+ :
+ :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
+ :"rax", "r8", "r9", "r10"
+ );
+
+
+ if(isodd) {
+ *result += input[num_points - 1] * taps[num_points - 1];
+ }
+
+ return;
+
+}
+
+#endif /* LV_HAVE_SSE && LV_HAVE_64 */
+
+
+
+
+#ifdef LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ lv_32fc_t dotProduct;
+ memset(&dotProduct, 0x0, 2*sizeof(float));
+
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points/2;
+ unsigned int isodd = num_points & 1;
+
+ __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+
+ const lv_32fc_t* a = input;
+ const lv_32fc_t* b = taps;
+
+ dotProdVal = _mm_setzero_ps();
+
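+ // Each iteration forms two complex products using
+ //   (ar + j*ai)*(cr + j*ci) = (ar*cr - ai*ci) + j*(ai*cr + ar*ci),
+ // built from moveldup/movehdup, two multiplies and one addsub, then accumulates them.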
+ for(;number < halfPoints; number++){
+
+ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
+
+ a += 2;
+ b += 2;
+ }
+
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
+
+ _mm_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+ dotProduct += ( dotProductVector[0] + dotProductVector[1] );
+
+ if(isodd) {
+ dotProduct += input[num_points - 1] * taps[num_points - 1];
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ unsigned int i = 0;
+ const unsigned int qtr_points = num_points/4;
+ const unsigned int isodd = num_points & 3;
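+ // isodd is really num_points % 4: the scalar loop at the end finishes the last 0-3 samples.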
+
+ __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
+ float *p_input, *p_taps;
+ __m64 *p_result;
+
+ p_result = (__m64*)result;
+ p_input = (float*)input;
+ p_taps = (float*)taps;
+
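+ // neg sets only the sign bit of lane 0; XORing it into real1 below negates the accumulated
+ // imag*imag sum so that Re(result) = sum(ar*br) - sum(ai*bi).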
+ static const __m128i neg = {0x000000000000000080000000};
+
+ real0 = _mm_setzero_ps();
+ real1 = _mm_setzero_ps();
+ im0 = _mm_setzero_ps();
+ im1 = _mm_setzero_ps();
+
+ for(; i < qtr_points; ++i) {
+ xmm0 = _mm_loadu_ps(p_input);
+ xmm1 = _mm_loadu_ps(p_taps);
+
+ p_input += 4;
+ p_taps += 4;
+
+ xmm2 = _mm_loadu_ps(p_input);
+ xmm3 = _mm_loadu_ps(p_taps);
+
+ p_input += 4;
+ p_taps += 4;
+
+ xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
+ xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
+ xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
+ xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
+
+ //imaginary vector from input
+ xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
+ //real vector from input
+ xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
+ //imaginary vector from taps
+ xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
+ //real vector from taps
+ xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
+
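+ // _mm_dp_ps immediate: the high nibble (0xf) multiplies and sums all four lanes, the low
+ // nibble picks the destination lane (0x1 -> lane 0, 0x2 -> lane 1), so the real-part and
+ // imaginary-part partial sums land in different lanes.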
+ xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
+ xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
+
+ xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
+ xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
+
+ real0 = _mm_add_ps(xmm4, real0);
+ real1 = _mm_add_ps(xmm5, real1);
+ im0 = _mm_add_ps(xmm6, im0);
+ im1 = _mm_add_ps(xmm7, im1);
+ }
+
+ real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);
+
+ im0 = _mm_add_ps(im0, im1);
+ real0 = _mm_add_ps(real0, real1);
+
+ im0 = _mm_add_ps(im0, real0);
+
+ _mm_storel_pi(p_result, im0);
+
+ for(i = num_points-isodd; i < num_points; i++) {
+ *result += input[i] * taps[i];
+ }
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ unsigned int isodd = num_points & 3;
+ unsigned int i = 0;
+ lv_32fc_t dotProduct;
+ memset(&dotProduct, 0x0, 2*sizeof(float));
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+
+ const lv_32fc_t* a = input;
+ const lv_32fc_t* b = taps;
+
+ dotProdVal = _mm256_setzero_ps();
+
+ for(;number < quarterPoints; number++){
+ x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
+ y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
+
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
+
+ tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
+
+ x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
+
+ tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
+
+ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex multiplication results together
+
+ a += 4;
+ b += 4;
+ }
+
+ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
+
+ _mm256_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+ dotProduct += ( dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + dotProductVector[3]);
+
+ for(i = num_points-isodd; i < num_points; i++) {
+ dotProduct += input[i] * taps[i];
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_AVX*/
+
+
+#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H*/
+
+#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
+#define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
+
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
+#include <stdio.h>
+#include <string.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ const unsigned int num_bytes = num_points*8;
+
+ float * res = (float*) result;
+ float * in = (float*) input;
+ float * tp = (float*) taps;
+ unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
+ unsigned int isodd = num_points & 1;
+
+ float sum0[2] = {0,0};
+ float sum1[2] = {0,0};
+ unsigned int i = 0;
+
+ for(i = 0; i < n_2_ccomplex_blocks; ++i) {
+ sum0[0] += in[0] * tp[0] - in[1] * tp[1];
+ sum0[1] += in[0] * tp[1] + in[1] * tp[0];
+ sum1[0] += in[2] * tp[2] - in[3] * tp[3];
+ sum1[1] += in[2] * tp[3] + in[3] * tp[2];
+
+ in += 4;
+ tp += 4;
+ }
+
+ res[0] = sum0[0] + sum1[0];
+ res[1] = sum0[1] + sum1[1];
+
+ for(i = 0; i < isodd; ++i) {
+ *result += input[num_points - 1] * taps[num_points - 1];
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE && LV_HAVE_64
+
+
+static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ const unsigned int num_bytes = num_points*8;
+ unsigned int isodd = num_points & 1;
+
+ asm
+ (
+ "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
+ "# const float *taps, unsigned num_bytes)\n\t"
+ "# float sum0 = 0;\n\t"
+ "# float sum1 = 0;\n\t"
+ "# float sum2 = 0;\n\t"
+ "# float sum3 = 0;\n\t"
+ "# do {\n\t"
+ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+ "# input += 4;\n\t"
+ "# taps += 4; \n\t"
+ "# } while (--n_2_ccomplex_blocks != 0);\n\t"
+ "# result[0] = sum0 + sum2;\n\t"
+ "# result[1] = sum1 + sum3;\n\t"
+ "# TODO: prefetch and better scheduling\n\t"
+ " xor %%r9, %%r9\n\t"
+ " xor %%r10, %%r10\n\t"
+ " movq %%rcx, %%rax\n\t"
+ " movq %%rcx, %%r8\n\t"
+ " movq %[rsi], %%r9\n\t"
+ " movq %[rdx], %%r10\n\t"
+ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+ " movaps 0(%%r9), %%xmm0\n\t"
+ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+ " movaps 0(%%r10), %%xmm2\n\t"
+ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
+ " shr $4, %%r8\n\t"
+ " jmp .%=L1_test\n\t"
+ " # 4 taps / loop\n\t"
+ " # something like ?? cycles / loop\n\t"
+ ".%=Loop1: \n\t"
+ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+ "# movaps (%%r9), %%xmmA\n\t"
+ "# movaps (%%r10), %%xmmB\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+ "# mulps %%xmmB, %%xmmA\n\t"
+ "# mulps %%xmmZ, %%xmmB\n\t"
+ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+ "# xorps %%xmmPN, %%xmmA\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# unpcklps %%xmmB, %%xmmA\n\t"
+ "# unpckhps %%xmmB, %%xmmZ\n\t"
+ "# movaps %%xmmZ, %%xmmY\n\t"
+ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+ "# addps %%xmmZ, %%xmmA\n\t"
+ "# addps %%xmmA, %%xmmC\n\t"
+ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+ " movaps 16(%%r9), %%xmm1\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " movaps 16(%%r10), %%xmm3\n\t"
+ " movaps %%xmm1, %%xmm5\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm3, %%xmm1\n\t"
+ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+ " addps %%xmm1, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " movaps 32(%%r9), %%xmm0\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ " mulps %%xmm5, %%xmm3\n\t"
+ " add $32, %%r9\n\t"
+ " movaps 32(%%r10), %%xmm2\n\t"
+ " addps %%xmm3, %%xmm7\n\t"
+ " add $32, %%r10\n\t"
+ ".%=L1_test:\n\t"
+ " dec %%rax\n\t"
+ " jge .%=Loop1\n\t"
+ " # We've handled the bulk of multiplies up to here.\n\t"
+ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+ " # If so, we've got 2 more taps to do.\n\t"
+ " and $1, %%r8\n\t"
+ " je .%=Leven\n\t"
+ " # The count was odd, do 2 more taps.\n\t"
+ " # Note that we've already got mm0/mm2 preloaded\n\t"
+ " # from the main loop.\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ ".%=Leven:\n\t"
+ " # neg inversor\n\t"
+ " xorps %%xmm1, %%xmm1\n\t"
+ " mov $0x80000000, %%r9\n\t"
+ " movd %%r9, %%xmm1\n\t"
+ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+ " # pfpnacc\n\t"
+ " xorps %%xmm1, %%xmm6\n\t"
+ " movaps %%xmm6, %%xmm2\n\t"
+ " unpcklps %%xmm7, %%xmm6\n\t"
+ " unpckhps %%xmm7, %%xmm2\n\t"
+ " movaps %%xmm2, %%xmm3\n\t"
+ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+ " addps %%xmm2, %%xmm6\n\t"
+ " # xmm6 = r1 i2 r3 i4\n\t"
+ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
+ :
+ :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
+ :"rax", "r8", "r9", "r10"
+ );
+
+
+ if(isodd) {
+ *result += input[num_points - 1] * taps[num_points - 1];
+ }
+
+ return;
+
+}
+
+#endif
+
+#if LV_HAVE_SSE && LV_HAVE_32
+
+static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ volk_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points);
+
+#if 0
+ const unsigned int num_bytes = num_points*8;
+ unsigned int isodd = num_points & 1;
+
+ asm volatile
+ (
+ " #pushl %%ebp\n\t"
+ " #movl %%esp, %%ebp\n\t"
+ " movl 12(%%ebp), %%eax # input\n\t"
+ " movl 16(%%ebp), %%edx # taps\n\t"
+ " movl 20(%%ebp), %%ecx # n_bytes\n\t"
+ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+ " movaps 0(%%eax), %%xmm0\n\t"
+ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+ " movaps 0(%%edx), %%xmm2\n\t"
+ " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t"
+ " jmp .%=L1_test\n\t"
+ " # 4 taps / loop\n\t"
+ " # something like ?? cycles / loop\n\t"
+ ".%=Loop1: \n\t"
+ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+ "# movaps (%%eax), %%xmmA\n\t"
+ "# movaps (%%edx), %%xmmB\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+ "# mulps %%xmmB, %%xmmA\n\t"
+ "# mulps %%xmmZ, %%xmmB\n\t"
+ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+ "# xorps %%xmmPN, %%xmmA\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# unpcklps %%xmmB, %%xmmA\n\t"
+ "# unpckhps %%xmmB, %%xmmZ\n\t"
+ "# movaps %%xmmZ, %%xmmY\n\t"
+ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+ "# addps %%xmmZ, %%xmmA\n\t"
+ "# addps %%xmmA, %%xmmC\n\t"
+ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+ " movaps 16(%%eax), %%xmm1\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " movaps 16(%%edx), %%xmm3\n\t"
+ " movaps %%xmm1, %%xmm5\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm3, %%xmm1\n\t"
+ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+ " addps %%xmm1, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " movaps 32(%%eax), %%xmm0\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ " mulps %%xmm5, %%xmm3\n\t"
+ " addl $32, %%eax\n\t"
+ " movaps 32(%%edx), %%xmm2\n\t"
+ " addps %%xmm3, %%xmm7\n\t"
+ " addl $32, %%edx\n\t"
+ ".%=L1_test:\n\t"
+ " decl %%ecx\n\t"
+ " jge .%=Loop1\n\t"
+ " # We've handled the bulk of multiplies up to here.\n\t"
+ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+ " # If so, we've got 2 more taps to do.\n\t"
+ " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t"
+ " shrl $4, %%ecx\n\t"
+ " andl $1, %%ecx\n\t"
+ " je .%=Leven\n\t"
+ " # The count was odd, do 2 more taps.\n\t"
+ " # Note that we've already got mm0/mm2 preloaded\n\t"
+ " # from the main loop.\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ ".%=Leven:\n\t"
+ " # neg inversor\n\t"
+ " movl 8(%%ebp), %%eax \n\t"
+ " xorps %%xmm1, %%xmm1\n\t"
+ " movl $0x80000000, (%%eax)\n\t"
+ " movss (%%eax), %%xmm1\n\t"
+ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+ " # pfpnacc\n\t"
+ " xorps %%xmm1, %%xmm6\n\t"
+ " movaps %%xmm6, %%xmm2\n\t"
+ " unpcklps %%xmm7, %%xmm6\n\t"
+ " unpckhps %%xmm7, %%xmm2\n\t"
+ " movaps %%xmm2, %%xmm3\n\t"
+ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+ " addps %%xmm2, %%xmm6\n\t"
+ " # xmm6 = r1 i2 r3 i4\n\t"
+ " #movl 8(%%ebp), %%eax # @result\n\t"
+ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+ " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t"
+ " #popl %%ebp\n\t"
+ :
+ :
+ : "eax", "ecx", "edx"
+ );
+
+
+ int getem = num_bytes % 16;
+
+ if(isodd) {
+ *result += (input[num_points - 1] * taps[num_points - 1]);
+ }
+
+ return;
+#endif
+}
+
+#endif /*LV_HAVE_SSE && LV_HAVE_32*/
+
+#ifdef LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ const unsigned int num_bytes = num_points*8;
+ unsigned int isodd = num_points & 1;
+
+ lv_32fc_t dotProduct;
+ memset(&dotProduct, 0x0, 2*sizeof(float));
+
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_bytes >> 4;
+
+ __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+
+ const lv_32fc_t* a = input;
+ const lv_32fc_t* b = taps;
+
+ dotProdVal = _mm_setzero_ps();
+
+ for(;number < halfPoints; number++){
+
+ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
+
+ a += 2;
+ b += 2;
+ }
+
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
+
+ _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+ dotProduct += ( dotProductVector[0] + dotProductVector[1] );
+
+ if(isodd) {
+ dotProduct += input[num_points - 1] * taps[num_points - 1];
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+
+#ifdef LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ unsigned int i = 0;
+ const unsigned int qtr_points = num_points/4;
+ const unsigned int isodd = num_points & 3;
+
+ __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
+ float *p_input, *p_taps;
+ __m64 *p_result;
+
+ static const __m128i neg = {0x000000000000000080000000};
+
+ p_result = (__m64*)result;
+ p_input = (float*)input;
+ p_taps = (float*)taps;
+
+ real0 = _mm_setzero_ps();
+ real1 = _mm_setzero_ps();
+ im0 = _mm_setzero_ps();
+ im1 = _mm_setzero_ps();
+
+ for(; i < qtr_points; ++i) {
+ xmm0 = _mm_load_ps(p_input);
+ xmm1 = _mm_load_ps(p_taps);
+
+ p_input += 4;
+ p_taps += 4;
+
+ xmm2 = _mm_load_ps(p_input);
+ xmm3 = _mm_load_ps(p_taps);
+
+ p_input += 4;
+ p_taps += 4;
+
+ xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
+ xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
+ xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
+ xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
+
+ //imaginary vector from input
+ xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
+ //real vector from input
+ xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
+ //imaginary vector from taps
+ xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
+ //real vector from taps
+ xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
+
+ xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
+ xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
+
+ xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
+ xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
+
+ real0 = _mm_add_ps(xmm4, real0);
+ real1 = _mm_add_ps(xmm5, real1);
+ im0 = _mm_add_ps(xmm6, im0);
+ im1 = _mm_add_ps(xmm7, im1);
+ }
+
+ real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);
+
+ im0 = _mm_add_ps(im0, im1);
+ real0 = _mm_add_ps(real0, real1);
+
+ im0 = _mm_add_ps(im0, real0);
+
+ _mm_storel_pi(p_result, im0);
+
+ for(i = num_points-isodd; i < num_points; i++) {
+ *result += input[i] * taps[i];
+ }
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+
+ lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
+ lv_32fc_t* b_ptr = (lv_32fc_t*) input;
+ // for 2-lane vectors, 1st lane holds the real part,
+ // 2nd lane holds the imaginary part
+ float32x4x2_t a_val, b_val, c_val, accumulator;
+ float32x4x2_t tmp_real, tmp_imag;
+ accumulator.val[0] = vdupq_n_f32(0);
+ accumulator.val[1] = vdupq_n_f32(0);
+
+ for(number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ __VOLK_PREFETCH(a_ptr+8);
+ __VOLK_PREFETCH(b_ptr+8);
+
+ // multiply the real*real and imag*imag to get real result
+ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
+ tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+ // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
+ tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
+
+ // Multiply cross terms to get the imaginary result
+ // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
+ tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
+ // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
+ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+
+ c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
+ c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
+
+ accumulator.val[0] = vaddq_f32(accumulator.val[0], c_val.val[0]);
+ accumulator.val[1] = vaddq_f32(accumulator.val[1], c_val.val[1]);
+
+ a_ptr += 4;
+ b_ptr += 4;
+ }
+ lv_32fc_t accum_result[4];
+ vst2q_f32((float*)accum_result, accumulator);
+ *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
+
+ // tail case
+ for(number = quarter_points*4; number < num_points; ++number) {
+ *result += (*a_ptr++) * (*b_ptr++);
+ }
+
+}
+#endif /*LV_HAVE_NEON*/
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+
+ lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
+ lv_32fc_t* b_ptr = (lv_32fc_t*) input;
+ // for 2-lane vectors, 1st lane holds the real part,
+ // 2nd lane holds the imaginary part
+ float32x4x2_t a_val, b_val, accumulator;
+ float32x4x2_t tmp_imag;
+ accumulator.val[0] = vdupq_n_f32(0);
+ accumulator.val[1] = vdupq_n_f32(0);
+
+ for(number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ __VOLK_PREFETCH(a_ptr+8);
+ __VOLK_PREFETCH(b_ptr+8);
+
+ // do the first multiply
+ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+ tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+
+ // use multiply accumulate/subtract to get result
+ tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
+ tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);
+
+ accumulator.val[0] = vaddq_f32(accumulator.val[0], tmp_imag.val[0]);
+ accumulator.val[1] = vaddq_f32(accumulator.val[1], tmp_imag.val[1]);
+
+ // increment pointers
+ a_ptr += 4;
+ b_ptr += 4;
+ }
+ lv_32fc_t accum_result[4];
+ vst2q_f32((float*)accum_result, accumulator);
+ *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
+
+ // tail case
+ for(number = quarter_points*4; number < num_points; ++number) {
+ *result += (*a_ptr++) * (*b_ptr++);
+ }
+
+}
+#endif /*LV_HAVE_NEON*/
+
+#ifdef LV_HAVE_NEON
+static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+
+ lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
+ lv_32fc_t* b_ptr = (lv_32fc_t*) input;
+ // for 2-lane vectors, 1st lane holds the real part,
+ // 2nd lane holds the imaginary part
+ float32x4x2_t a_val, b_val, accumulator1, accumulator2;
+ accumulator1.val[0] = vdupq_n_f32(0);
+ accumulator1.val[1] = vdupq_n_f32(0);
+ accumulator2.val[0] = vdupq_n_f32(0);
+ accumulator2.val[1] = vdupq_n_f32(0);
+
+ for(number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ __VOLK_PREFETCH(a_ptr+8);
+ __VOLK_PREFETCH(b_ptr+8);
+
+ // use 2 accumulators to remove inter-instruction data dependencies
+ accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
+ accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]);
+ accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]);
+ accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]);
+ // increment pointers
+ a_ptr += 4;
+ b_ptr += 4;
+ }
+ accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]);
+ accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]);
+ lv_32fc_t accum_result[4];
+ vst2q_f32((float*)accum_result, accumulator1);
+ *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
+
+ // tail case
+ for(number = quarter_points*4; number < num_points; ++number) {
+ *result += (*a_ptr++) * (*b_ptr++);
+ }
+
+}
+#endif /*LV_HAVE_NEON*/
+
+#ifdef LV_HAVE_NEON
+static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+// NOTE: GCC does a poor job with this kernel, but the equivalent ASM code is very fast
+
+ unsigned int quarter_points = num_points / 8;
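+ // Despite the name, each iteration consumes 8 complex samples (vld4q_f32 reads 16 floats per pointer).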
+ unsigned int number;
+
+ lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
+ lv_32fc_t* b_ptr = (lv_32fc_t*) input;
+ // for 2-lane vectors, 1st lane holds the real part,
+ // 2nd lane holds the imaginary part
+ float32x4x4_t a_val, b_val, accumulator1, accumulator2;
+ float32x4x2_t reduced_accumulator;
+ accumulator1.val[0] = vdupq_n_f32(0);
+ accumulator1.val[1] = vdupq_n_f32(0);
+ accumulator1.val[2] = vdupq_n_f32(0);
+ accumulator1.val[3] = vdupq_n_f32(0);
+ accumulator2.val[0] = vdupq_n_f32(0);
+ accumulator2.val[1] = vdupq_n_f32(0);
+ accumulator2.val[2] = vdupq_n_f32(0);
+ accumulator2.val[3] = vdupq_n_f32(0);
+
+ // 8 input regs, 8 accumulators -> 16/16 neon regs are used
+ for(number = 0; number < quarter_points; ++number) {
+ a_val = vld4q_f32((float*)a_ptr); // 4-way deinterleave: a0r|a2r|a4r|a6r || a0i|a2i|a4i|a6i || a1r|a3r|a5r|a7r || a1i|a3i|a5i|a7i
+ b_val = vld4q_f32((float*)b_ptr); // same deinterleaved layout for the taps
+ __VOLK_PREFETCH(a_ptr+8);
+ __VOLK_PREFETCH(b_ptr+8);
+
+ // use 2 accumulators to remove inter-instruction data dependencies
+ accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
+ accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]);
+
+ accumulator1.val[2] = vmlaq_f32(accumulator1.val[2], a_val.val[2], b_val.val[2]);
+ accumulator1.val[3] = vmlaq_f32(accumulator1.val[3], a_val.val[2], b_val.val[3]);
+
+ accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]);
+ accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]);
+
+ accumulator2.val[2] = vmlsq_f32(accumulator2.val[2], a_val.val[3], b_val.val[3]);
+ accumulator2.val[3] = vmlaq_f32(accumulator2.val[3], a_val.val[3], b_val.val[2]);
+ // increment pointers
+ a_ptr += 8;
+ b_ptr += 8;
+ }
+ // reduce 8 accumulator lanes down to 2 (1 real and 1 imag)
+ accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator1.val[2]);
+ accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator1.val[3]);
+ accumulator2.val[0] = vaddq_f32(accumulator2.val[0], accumulator2.val[2]);
+ accumulator2.val[1] = vaddq_f32(accumulator2.val[1], accumulator2.val[3]);
+ reduced_accumulator.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]);
+ reduced_accumulator.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]);
+ // now reduce accumulators to scalars
+ lv_32fc_t accum_result[4];
+ vst2q_f32((float*)accum_result, reduced_accumulator);
+ *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
+
+ // tail case
+ for(number = quarter_points*8; number < num_points; ++number) {
+ *result += (*a_ptr++) * (*b_ptr++);
+ }
+
+}
+#endif /*LV_HAVE_NEON*/
+
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ unsigned int isodd = num_points & 3;
+ unsigned int i = 0;
+ lv_32fc_t dotProduct;
+ memset(&dotProduct, 0x0, 2*sizeof(float));
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+
+ const lv_32fc_t* a = input;
+ const lv_32fc_t* b = taps;
+
+ dotProdVal = _mm256_setzero_ps();
+
+ for(;number < quarterPoints; number++){
+
+ x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
+ y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
+
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
+
+ tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
+
+ x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
+
+ tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
+
+ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex multiplication results together
+
+ a += 4;
+ b += 4;
+ }
+
+ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
+
+ _mm256_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+ dotProduct += ( dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + dotProductVector[3]);
+
+ for(i = num_points-isodd; i < num_points; i++) {
+ dotProduct += input[i] * taps[i];
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_AVX*/
+
+#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32fc_x2_multiply_32fc
+ *
+ * \b Overview
+ *
+ * Multiplies two complex vectors and returns the complex result.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_x2_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points);
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: The first input vector of complex floats.
+ * \li bVector: The second input vector of complex floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li cVector: The output vector of complex floats.
+ *
+ * \b Example
+ * Mix two complex tones with phase increments of 0.3 and 0.1 radians per sample.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* sig_1 = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t* sig_2 = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * // Generate two tones
+ * float real_1 = std::cos(0.3f * (float)ii);
+ * float imag_1 = std::sin(0.3f * (float)ii);
+ * sig_1[ii] = lv_cmake(real_1, imag_1);
+ * float real_2 = std::cos(0.1f * (float)ii);
+ * float imag_2 = std::sin(0.1f * (float)ii);
+ * sig_2[ii] = lv_cmake(real_2, imag_2);
+ * }
+ *
+ * volk_32fc_x2_multiply_32fc(out, sig_1, sig_2, N);
+ *
+ * volk_free(sig_1);
+ * volk_free(sig_2);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H
+#define INCLUDED_volk_32fc_x2_multiply_32fc_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#if LV_HAVE_AVX2 && LV_HAVE_FMA
+#include <immintrin.h>
+/*!
+ \brief Multiplies the two input complex vectors and stores their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32fc_x2_multiply_32fc_u_avx2_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ for(;number < quarterPoints; number++){
+
+ const __m256 x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ const __m256 y = _mm256_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ const __m256 tmp2x = _mm256_permute_ps(x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ const __m256 z = _mm256_fmaddsub_ps(x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ _mm256_storeu_ps((float*)c,z); // Store the results back into the C container
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+ _mm256_zeroupper();
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *c++ = (*a++) * (*b++);
+ }
+}
+#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void
+volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 x, y, z;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ for(; number < quarterPoints; number++){
+ x = _mm256_loadu_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+ y = _mm256_loadu_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+ z = _mm256_complexmul_ps(x, y);
+ _mm256_storeu_ps((float*) c, z); // Store the results back into the C container
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+ number = quarterPoints * 4;
+
+ for(; number < num_points; number++){
+ *c++ = (*a++) * (*b++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <volk/volk_sse3_intrinsics.h>
+
+static inline void
+volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ __m128 x, y, z;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ for(; number < halfPoints; number++){
+ x = _mm_loadu_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ y = _mm_loadu_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di
+ z = _mm_complexmul_ps(x, y);
+ _mm_storeu_ps((float*) c, z); // Store the results back into the C container
+
+ a += 2;
+ b += 2;
+ c += 2;
+ }
+
+ if((num_points % 2) != 0){
+ *c = (*a) * (*b);
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector, unsigned int num_points)
+{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */
+#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H
+#define INCLUDED_volk_32fc_x2_multiply_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#if LV_HAVE_AVX2 && LV_HAVE_FMA
+#include <immintrin.h>
+/*!
+ \brief Multiplies the two input complex vectors and stores their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32fc_x2_multiply_32fc_a_avx2_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ for(;number < quarterPoints; number++){
+
+ const __m256 x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ const __m256 y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ const __m256 tmp2x = _mm256_permute_ps(x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ const __m256 z = _mm256_fmaddsub_ps(x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ _mm256_store_ps((float*)c,z); // Store the results back into the C container
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+ _mm256_zeroupper();
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *c++ = (*a++) * (*b++);
+ }
+}
+#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void
+volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 x, y, z;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ for(; number < quarterPoints; number++){
+ x = _mm256_load_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+ y = _mm256_load_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+ z = _mm256_complexmul_ps(x, y);
+ _mm256_store_ps((float*) c, z); // Store the results back into the C container
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+ number = quarterPoints * 4;
+
+ for(; number < num_points; number++){
+ *c++ = (*a++) * (*b++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <volk/volk_sse3_intrinsics.h>
+
+static inline void
+volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ __m128 x, y, z;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ for(; number < halfPoints; number++){
+ x = _mm_load_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ y = _mm_load_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di
+ z = _mm_complexmul_ps(x, y);
+ _mm_store_ps((float*) c, z); // Store the results back into the C container
+
+ a += 2;
+ b += 2;
+ c += 2;
+ }
+
+ if((num_points % 2) != 0){
+ *c = (*a) * (*b);
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector, unsigned int num_points)
+{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector, unsigned int num_points)
+{
+ lv_32fc_t *a_ptr = (lv_32fc_t*) aVector;
+ lv_32fc_t *b_ptr = (lv_32fc_t*) bVector;
+ unsigned int quarter_points = num_points / 4;
+ float32x4x2_t a_val, b_val, c_val;
+ float32x4x2_t tmp_real, tmp_imag;
+ unsigned int number = 0;
+
+ for(number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ __VOLK_PREFETCH(a_ptr+4);
+ __VOLK_PREFETCH(b_ptr+4);
+
+ // multiply the real*real and imag*imag to get real result
+ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
+ tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+ // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
+ tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
+
+ // Multiply cross terms to get the imaginary result
+ // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
+ tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
+ // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
+ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+
+ // store the results
+ c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
+ c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
+ vst2q_f32((float*)cVector, c_val);
+
+ a_ptr += 4;
+ b_ptr += 4;
+ cVector += 4;
+ }
+
+ for(number = quarter_points*4; number < num_points; number++){
+ *cVector++ = (*a_ptr++) * (*b_ptr++);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_NEON
+
+static inline void
+volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector, unsigned int num_points)
+{
+ lv_32fc_t *a_ptr = (lv_32fc_t*) aVector;
+ lv_32fc_t *b_ptr = (lv_32fc_t*) bVector;
+ unsigned int quarter_points = num_points / 4;
+ float32x4x2_t a_val, b_val;
+ float32x4x2_t tmp_imag;
+ unsigned int number = 0;
+
+ for(number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ __VOLK_PREFETCH(a_ptr+4);
+ __VOLK_PREFETCH(b_ptr+4);
+
+ // do the first multiply
+ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+ tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+
+ // use multiply accumulate/subtract to get result
+ tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
+ tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);
+
+ // store
+ vst2q_f32((float*)cVector, tmp_imag);
+ // increment pointers
+ a_ptr += 4;
+ b_ptr += 4;
+ cVector += 4;
+ }
+
+ for(number = quarter_points*4; number < num_points; number++){
+ *cVector++ = (*a_ptr++) * (*b_ptr++);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_NEON
+
+extern void
+volk_32fc_x2_multiply_32fc_neonasm(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector, unsigned int num_points);
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_ORC
+
+extern void
+volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector, unsigned int num_points);
+
+static inline void
+volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector, unsigned int num_points)
+{
+ volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+
+#endif /* LV_HAVE_ORC */
+
+#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32fc_x2_multiply_conjugate_32fc
+ *
+ * \b Overview
+ *
+ * Multiplies a complex vector by the conjugate of a second complex
+ * vector and returns the complex result.
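+ *
+ * Concretely, for each index i: cVector[i] = aVector[i] * conj(bVector[i]).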
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_x2_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points);
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: The first input vector of complex floats.
+ * \li bVector: The second input vector of complex floats that is conjugated.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li cVector: The output vector of complex floats.
+ *
+ * \b Example
+ * Calculate mag^2 of a signal using x * conj(x).
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* sig_1 = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * float real_1 = std::cos(0.3f * (float)ii);
+ * float imag_1 = std::sin(0.3f * (float)ii);
+ * sig_1[ii] = lv_cmake(real_1, imag_1);
+ * }
+ *
+ * volk_32fc_x2_multiply_conjugate_32fc(out, sig_1, sig_1, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("%1.4f%+1.4fj,", lv_creal(out[ii]), lv_cimag(out[ii]));
+ * }
+ * printf("\n");
+ *
+ * volk_free(sig_1);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
+#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void
+volk_32fc_x2_multiply_conjugate_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 x, y, z;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ for(; number < quarterPoints; number++){
+ x = _mm256_loadu_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+ y = _mm256_loadu_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+ z = _mm256_complexconjugatemul_ps(x, y);
+ _mm256_storeu_ps((float*) c, z); // Store the results back into the C container
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+ number = quarterPoints * 4;
+
+ for(; number < num_points; number++){
+ *c++ = (*a++) * lv_conj(*b++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <volk/volk_sse3_intrinsics.h>
+
+static inline void
+volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ __m128 x, y, z;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ for(; number < halfPoints; number++){
+ x = _mm_loadu_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ y = _mm_loadu_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di
+ z = _mm_complexconjugatemul_ps(x, y);
+ _mm_storeu_ps((float*) c, z); // Store the results back into the C container
+
+ a += 2;
+ b += 2;
+ c += 2;
+ }
+
+ if((num_points % 2) != 0){
+ *c = (*a) * lv_conj(*b);
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32fc_x2_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector, unsigned int num_points)
+{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H */
+#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
+#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void
+volk_32fc_x2_multiply_conjugate_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 x, y, z;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ for(; number < quarterPoints; number++){
+ x = _mm256_load_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+ y = _mm256_load_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+ z = _mm256_complexconjugatemul_ps(x, y);
+ _mm256_store_ps((float*) c, z); // Store the results back into the C container
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+ number = quarterPoints * 4;
+
+ for(; number < num_points; number++){
+ *c++ = (*a++) * lv_conj(*b++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <volk/volk_sse3_intrinsics.h>
+
+static inline void
+volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ __m128 x, y, z;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ for(; number < halfPoints; number++){
+ x = _mm_load_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ y = _mm_load_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di
+ z = _mm_complexconjugatemul_ps(x, y);
+ _mm_store_ps((float*) c, z); // Store the results back into the C container
+
+ a += 2;
+ b += 2;
+ c += 2;
+ }
+
+ if((num_points % 2) != 0){
+ *c = (*a) * lv_conj(*b);
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector, unsigned int num_points)
+{
+ lv_32fc_t *a_ptr = (lv_32fc_t*) aVector;
+ lv_32fc_t *b_ptr = (lv_32fc_t*) bVector;
+ unsigned int quarter_points = num_points / 4;
+ float32x4x2_t a_val, b_val, c_val;
+ float32x4x2_t tmp_real, tmp_imag;
+ unsigned int number = 0;
+
+ for(number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ b_val.val[1] = vnegq_f32(b_val.val[1]);
+ __VOLK_PREFETCH(a_ptr+4);
+ __VOLK_PREFETCH(b_ptr+4);
+
+ // multiply the real*real and imag*imag to get real result
+ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
+ tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+ // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
+ tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
+
+ // Multiply cross terms to get the imaginary result
+ // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
+ tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
+ // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
+ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+
+ // store the results
+ c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
+ c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
+ vst2q_f32((float*)cVector, c_val);
+
+ a_ptr += 4;
+ b_ptr += 4;
+ cVector += 4;
+ }
+
+ for(number = quarter_points*4; number < num_points; number++){
+ *cVector++ = (*a_ptr++) * lv_conj(*b_ptr++);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector, unsigned int num_points)
+{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32fc_x2_s32f_square_dist_scalar_mult_32f
+ *
+ * \b Overview
+ *
+ * Calculates the square distance between a single complex input and each
+ * point in a complex vector, scaled by a scalar value.
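+ *
+ * That is, for each index i:
+ * target[i] = scalar * ((re(src0[0]) - re(points[i]))^2 + (im(src0[0]) - im(points[i]))^2).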
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_x2_s32f_square_dist_scalar_mult_32f(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li src0: The complex input. Only the first point is used.
+ * \li points: A complex vector of reference points.
+ * \li scalar: A float to scale the distances by.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li target: A vector of distances between src0 and the vector of points.
+ *
+ * \b Example
+ * Calculate the distance between an input and reference points in a square
+ * 16-qam constellation. Normalize distances by the area of the constellation.
+ * \code
+ * int N = 16;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* constellation = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t* rx = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float const_vals[] = {-3, -1, 1, 3};
+ *
+ * unsigned int jj = 0;
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * constellation[ii] = lv_cmake(const_vals[ii%4], const_vals[jj]);
+ * if((ii+1)%4 == 0) ++jj;
+ * }
+ *
+ * *rx = lv_cmake(0.5f, 2.f);
+ * float scale = 1.f/64.f; // 1 / constellation area
+ *
+ * volk_32fc_x2_s32f_square_dist_scalar_mult_32f(out, rx, constellation, scale, N);
+ *
+ * printf("Distance from each constellation point:\n");
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("%.4f ", out[ii]);
+ * if((ii+1)%4 == 0) printf("\n");
+ * }
+ *
+ * volk_free(rx);
+ * volk_free(constellation);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
+#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <string.h>
+
+#ifdef LV_HAVE_SSE3
+#include <xmmintrin.h>
+#include <pmmintrin.h>
+
+static inline void
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points,
+ float scalar, unsigned int num_points)
+{
+ const unsigned int num_bytes = num_points*8;
+
+ __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
+
+ lv_32fc_t diff;
+ memset(&diff, 0x0, 2*sizeof(float));
+
+ float sq_dist = 0.0;
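+ // num_bytes is 8*num_points, so bound = num_points/4 full 4-point iterations;
+ // leftovers0 flags a remaining pair of points, leftovers1 a final single point.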
+ int bound = num_bytes >> 5;
+ int leftovers0 = (num_bytes >> 4) & 1;
+ int leftovers1 = (num_bytes >> 3) & 1;
+ int i = 0;
+
+ xmm1 = _mm_setzero_ps();
+ xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
+ xmm2 = _mm_load_ps((float*)&points[0]);
+ xmm8 = _mm_load1_ps(&scalar);
+ xmm1 = _mm_movelh_ps(xmm1, xmm1);
+ xmm3 = _mm_load_ps((float*)&points[2]);
+
+ for(; i < bound - 1; ++i) {
+ xmm4 = _mm_sub_ps(xmm1, xmm2);
+ xmm5 = _mm_sub_ps(xmm1, xmm3);
+ points += 4;
+ xmm6 = _mm_mul_ps(xmm4, xmm4);
+ xmm7 = _mm_mul_ps(xmm5, xmm5);
+
+ xmm2 = _mm_load_ps((float*)&points[0]);
+
+ xmm4 = _mm_hadd_ps(xmm6, xmm7);
+
+ xmm3 = _mm_load_ps((float*)&points[2]);
+
+ xmm4 = _mm_mul_ps(xmm4, xmm8);
+
+ _mm_store_ps(target, xmm4);
+
+ target += 4;
+ }
+
+ xmm4 = _mm_sub_ps(xmm1, xmm2);
+ xmm5 = _mm_sub_ps(xmm1, xmm3);
+
+ points += 4;
+ xmm6 = _mm_mul_ps(xmm4, xmm4);
+ xmm7 = _mm_mul_ps(xmm5, xmm5);
+
+ xmm4 = _mm_hadd_ps(xmm6, xmm7);
+
+ xmm4 = _mm_mul_ps(xmm4, xmm8);
+
+ _mm_store_ps(target, xmm4);
+
+ target += 4;
+
+ for(i = 0; i < leftovers0; ++i) {
+ xmm2 = _mm_load_ps((float*)&points[0]);
+
+ xmm4 = _mm_sub_ps(xmm1, xmm2);
+
+ points += 2;
+
+ xmm6 = _mm_mul_ps(xmm4, xmm4);
+
+ xmm4 = _mm_hadd_ps(xmm6, xmm6);
+
+ xmm4 = _mm_mul_ps(xmm4, xmm8);
+
+ _mm_storeh_pi((__m64*)target, xmm4);
+
+ target += 2;
+ }
+
+ for(i = 0; i < leftovers1; ++i) {
+
+ diff = src0[0] - points[0];
+
+ sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
+
+ target[0] = sq_dist;
+ }
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points,
+ float scalar, unsigned int num_points)
+{
+ const unsigned int num_bytes = num_points*8;
+
+ lv_32fc_t diff;
+ float sq_dist;
+ unsigned int i = 0;
+
+ for(; i < num_bytes >> 3; ++i) {
+ diff = src0[0] - points[i];
+
+ sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
+
+ target[i] = sq_dist;
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32fc_x2_square_dist_32f
+ *
+ * \b Overview
+ *
+ * Calculates the square distance between a single complex input and each
+ * point in a complex vector.
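+ *
+ * That is, for each index i:
+ * target[i] = (re(src0[0]) - re(points[i]))^2 + (im(src0[0]) - im(points[i]))^2.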
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_x2_square_dist_32f(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li src0: The complex input. Only the first point is used.
+ * \li points: A complex vector of reference points.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li target: A vector of distances between src0 and the vector of points.
+ *
+ * \b Example
+ * Calculate the distance between an input and reference points in a square
+ * 16-qam constellation.
+ * \code
+ * int N = 16;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* constellation = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t* rx = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float const_vals[] = {-3, -1, 1, 3};
+ *
+ * // Generate 16-QAM constellation points
+ * unsigned int jj = 0;
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * constellation[ii] = lv_cmake(const_vals[ii%4], const_vals[jj]);
+ * if((ii+1)%4 == 0) ++jj;
+ * }
+ *
+ * *rx = lv_cmake(0.5f, 2.f);
+ *
+ * volk_32fc_x2_square_dist_32f(out, rx, constellation, N);
+ *
+ * printf("Distance from each constellation point:\n");
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("%.4f ", out[ii]);
+ * if((ii+1)%4 == 0) printf("\n");
+ * }
+ *
+ * volk_free(rx);
+ * volk_free(constellation);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H
+#define INCLUDED_volk_32fc_x2_square_dist_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_SSE3
+#include <xmmintrin.h>
+#include <pmmintrin.h>
+
+static inline void
+volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points,
+ unsigned int num_points)
+{
+ const unsigned int num_bytes = num_points*8;
+
+ __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+
+ lv_32fc_t diff;
+ float sq_dist;
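+ // num_bytes is 8*num_points: bound counts full 4-point iterations,
+ // leftovers0 flags a remaining pair of points, leftovers1 a final single point.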
+ int bound = num_bytes >> 5;
+ int leftovers0 = (num_bytes >> 4) & 1;
+ int leftovers1 = (num_bytes >> 3) & 1;
+ int i = 0;
+
+ xmm1 = _mm_setzero_ps();
+ xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
+ xmm2 = _mm_load_ps((float*)&points[0]);
+ xmm1 = _mm_movelh_ps(xmm1, xmm1);
+ xmm3 = _mm_load_ps((float*)&points[2]);
+
+ for(; i < bound - 1; ++i) {
+ xmm4 = _mm_sub_ps(xmm1, xmm2);
+ xmm5 = _mm_sub_ps(xmm1, xmm3);
+ points += 4;
+ xmm6 = _mm_mul_ps(xmm4, xmm4);
+ xmm7 = _mm_mul_ps(xmm5, xmm5);
+
+ xmm2 = _mm_load_ps((float*)&points[0]);
+
+ xmm4 = _mm_hadd_ps(xmm6, xmm7);
+
+ xmm3 = _mm_load_ps((float*)&points[2]);
+
+ _mm_store_ps(target, xmm4);
+
+ target += 4;
+ }
+
+ xmm4 = _mm_sub_ps(xmm1, xmm2);
+ xmm5 = _mm_sub_ps(xmm1, xmm3);
+
+ points += 4;
+ xmm6 = _mm_mul_ps(xmm4, xmm4);
+ xmm7 = _mm_mul_ps(xmm5, xmm5);
+
+ xmm4 = _mm_hadd_ps(xmm6, xmm7);
+
+ _mm_store_ps(target, xmm4);
+
+ target += 4;
+
+ for(i = 0; i < leftovers0; ++i) {
+
+ xmm2 = _mm_load_ps((float*)&points[0]);
+
+ xmm4 = _mm_sub_ps(xmm1, xmm2);
+
+ points += 2;
+
+ xmm6 = _mm_mul_ps(xmm4, xmm4);
+
+ xmm4 = _mm_hadd_ps(xmm6, xmm6);
+
+ _mm_storeh_pi((__m64*)target, xmm4);
+
+ target += 2;
+ }
+
+ for(i = 0; i < leftovers1; ++i) {
+
+ diff = src0[0] - points[0];
+
+ sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+
+ target[0] = sq_dist;
+ }
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+static inline void
+volk_32fc_x2_square_dist_32f_neon(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points)
+{
+ const unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+
+ float32x4x2_t a_vec, b_vec;
+ float32x4x2_t diff_vec;
+ float32x4_t tmp, tmp1, dist_sq;
+ a_vec.val[0] = vdupq_n_f32( lv_creal(src0[0]) );
+ a_vec.val[1] = vdupq_n_f32( lv_cimag(src0[0]) );
+ for(number=0; number < quarter_points; ++number) {
+ b_vec = vld2q_f32((float*)points);
+ diff_vec.val[0] = vsubq_f32(a_vec.val[0], b_vec.val[0]);
+ diff_vec.val[1] = vsubq_f32(a_vec.val[1], b_vec.val[1]);
+ tmp = vmulq_f32(diff_vec.val[0], diff_vec.val[0]);
+ tmp1 = vmulq_f32(diff_vec.val[1], diff_vec.val[1]);
+
+ dist_sq = vaddq_f32(tmp, tmp1);
+ vst1q_f32(target, dist_sq);
+ points += 4;
+ target += 4;
+ }
+ for(number=quarter_points*4; number < num_points; ++number) {
+ lv_32fc_t diff = src0[0] - *points++;
+ *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_32fc_x2_square_dist_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points,
+ unsigned int num_points)
+{
+ const unsigned int num_bytes = num_points*8;
+
+ lv_32fc_t diff;
+ float sq_dist;
+ unsigned int i = 0;
+
+ for(; i < num_bytes >> 3; ++i) {
+ diff = src0[0] - points[i];
+
+ sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+
+ target[i] = sq_dist;
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32i_s32f_convert_32f
+ *
+ * \b Overview
+ *
+ * Converts the samples in the inputVector from 32-bit integers into
+ * floating point values and then divides them by the input scalar.
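+ *
+ * That is, outputVector[i] = ((float)inputVector[i]) / scalar.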
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32i_s32f_convert_32f(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The vector of 32-bit integers.
+ * \li scalar: The value that the output is divided by after being converted to a float.
+ * \li num_points: The number of values.
+ *
+ * \b Outputs
+ * \li outputVector: The output vector of floats.
+ *
+ * \b Example
+ * Convert integers to floats scaled to the range [0, 1).
+ * \code
+ * int N = 1<<8;
+ * unsigned int alignment = volk_get_alignment();
+ *
+ * int32_t* x = (int32_t*)volk_malloc(N*sizeof(int32_t), alignment);
+ * float* z = (float*)volk_malloc(N*sizeof(float), alignment);
+ * float scale = (float)N;
+ * for(unsigned int ii=0; ii<N; ++ii){
+ * x[ii] = ii;
+ * }
+ *
+ * volk_32i_s32f_convert_32f(z, x, scale, N);
+ *
+ * volk_free(x);
+ * volk_free(z);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32i_s32f_convert_32f_u_H
+#define INCLUDED_volk_32i_s32f_convert_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void
+volk_32i_s32f_convert_32f_u_sse2(float* outputVector, const int32_t* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* outputVectorPtr = outputVector;
+ const float iScalar = 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ int32_t* inputPtr = (int32_t*)inputVector;
+ __m128i inputVal;
+ __m128 ret;
+
+ for(;number < quarterPoints; number++){
+ // Load the 4 values
+ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+ ret = _mm_cvtepi32_ps(inputVal);
+ ret = _mm_mul_ps(ret, invScalar);
+
+ _mm_storeu_ps(outputVectorPtr, ret);
+
+ outputVectorPtr += 4;
+ inputPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ outputVector[number] =((float)(inputVector[number])) * iScalar;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32i_s32f_convert_32f_generic(float* outputVector, const int32_t* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ float* outputVectorPtr = outputVector;
+ const int32_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ const float iScalar = 1.0 / scalar;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32i_s32f_convert_32f_u_H */
+
+
+
+#ifndef INCLUDED_volk_32i_s32f_convert_32f_a_H
+#define INCLUDED_volk_32i_s32f_convert_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void
+volk_32i_s32f_convert_32f_a_sse2(float* outputVector, const int32_t* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* outputVectorPtr = outputVector;
+ const float iScalar = 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ int32_t* inputPtr = (int32_t*)inputVector;
+ __m128i inputVal;
+ __m128 ret;
+
+ for(;number < quarterPoints; number++){
+ // Load the 4 values
+ inputVal = _mm_load_si128((__m128i*)inputPtr);
+
+ ret = _mm_cvtepi32_ps(inputVal);
+ ret = _mm_mul_ps(ret, invScalar);
+
+ _mm_store_ps(outputVectorPtr, ret);
+
+ outputVectorPtr += 4;
+ inputPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ outputVector[number] =((float)(inputVector[number])) * iScalar;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32i_s32f_convert_32f_a_generic(float* outputVector, const int32_t* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ float* outputVectorPtr = outputVector;
+ const int32_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ const float iScalar = 1.0 / scalar;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32i_s32f_convert_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32i_x2_and_32i
+ *
+ * \b Overview
+ *
+ * Computes the bitwise AND of two input 32-bit integer vectors.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32i_x2_and_32i(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: Input vector of samples.
+ * \li bVector: Input vector of samples.
+ * \li num_points: The number of values.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ * This example generates a Karnaugh map for the lower two bits of x AND y.
+ * \code
+ * int N = 1<<4;
+ * unsigned int alignment = volk_get_alignment();
+ *
+ * int32_t* x = (int32_t*)volk_malloc(N*sizeof(int32_t), alignment);
+ * int32_t* y = (int32_t*)volk_malloc(N*sizeof(int32_t), alignment);
+ * int32_t* z = (int32_t*)volk_malloc(N*sizeof(int32_t), alignment);
+ * int32_t in_seq[] = {0,1,3,2};
+ * unsigned int jj=0;
+ * for(unsigned int ii=0; ii<N; ++ii){
+ * x[ii] = in_seq[ii%4];
+ * y[ii] = in_seq[jj];
+ * if(((ii+1) % 4) == 0) jj++;
+ * }
+ *
+ * volk_32i_x2_and_32i(z, x, y, N);
+ *
+ * printf("Karnaugh map for x AND y\n");
+ * printf("y\\x|");
+ * for(unsigned int ii=0; ii<4; ++ii){
+ * printf(" %.2x ", in_seq[ii]);
+ * }
+ * printf("\n---|---------------\n");
+ * jj = 0;
+ * for(unsigned int ii=0; ii<N; ++ii){
+ * if(((ii+1) % 4) == 1){
+ * printf("%.2x | ", in_seq[jj++]);
+ * }
+ * printf("%.2x ", z[ii]);
+ * if(!((ii+1) % 4)){
+ * printf("\n");
+ * }
+ * }
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32i_x2_and_32i_a_H
+#define INCLUDED_volk_32i_x2_and_32i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32i_x2_and_32i_a_sse(int32_t* cVector, const int32_t* aVector,
+ const int32_t* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
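+ // The integer data is reinterpreted as floats so _mm_and_ps can be used;
+ // a bitwise AND only touches bit patterns, so the values are unaffected.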
+ float* cPtr = (float*)cVector;
+ const float* aPtr = (float*)aVector;
+ const float* bPtr = (float*)bVector;
+
+ __m128 aVal, bVal, cVal;
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_and_ps(aVal, bVal);
+
+ _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ cVector[number] = aVector[number] & bVector[number];
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_32i_x2_and_32i_neon(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points)
+{
+ int32_t* cPtr = cVector;
+ const int32_t* aPtr = aVector;
+ const int32_t* bPtr= bVector;
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+
+ int32x4_t a_val, b_val, c_val;
+
+ for(number = 0; number < quarter_points; number++){
+ a_val = vld1q_s32(aPtr);
+ b_val = vld1q_s32(bPtr);
+ c_val = vandq_s32(a_val, b_val);
+ vst1q_s32(cPtr, c_val);
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ for(number = quarter_points * 4; number < num_points; number++){
+ *cPtr++ = (*aPtr++) & (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32i_x2_and_32i_generic(int32_t* cVector, const int32_t* aVector,
+ const int32_t* bVector, unsigned int num_points)
+{
+ int32_t* cPtr = cVector;
+ const int32_t* aPtr = aVector;
+ const int32_t* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++) & (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_ORC
+extern void
+volk_32i_x2_and_32i_a_orc_impl(int32_t* cVector, const int32_t* aVector,
+ const int32_t* bVector, unsigned int num_points);
+
+static inline void
+volk_32i_x2_and_32i_u_orc(int32_t* cVector, const int32_t* aVector,
+ const int32_t* bVector, unsigned int num_points)
+{
+ volk_32i_x2_and_32i_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32i_x2_and_32i_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32i_x2_or_32i
+ *
+ * \b Overview
+ *
+ * Computes the bitwise OR of two input 32-bit integer vectors.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32i_x2_or_32i(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: Input vector of samples.
+ * \li bVector: Input vector of samples.
+ * \li num_points: The number of values.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ * This example generates a Karnaugh map for the lower two bits of x OR y.
+ * \code
+ * int N = 1<<4;
+ * unsigned int alignment = volk_get_alignment();
+ *
+ * int32_t* x = (int32_t*)volk_malloc(N*sizeof(int32_t), alignment);
+ * int32_t* y = (int32_t*)volk_malloc(N*sizeof(int32_t), alignment);
+ * int32_t* z = (int32_t*)volk_malloc(N*sizeof(int32_t), alignment);
+ * int32_t in_seq[] = {0,1,3,2};
+ * unsigned int jj=0;
+ * for(unsigned int ii=0; ii<N; ++ii){
+ * x[ii] = in_seq[ii%4];
+ * y[ii] = in_seq[jj];
+ * if(((ii+1) % 4) == 0) jj++;
+ * }
+ *
+ * volk_32i_x2_or_32i(z, x, y, N);
+ *
+ * printf("Karnaugh map for x OR y\n");
+ * printf("y\\x|");
+ * for(unsigned int ii=0; ii<4; ++ii){
+ * printf(" %.2x ", in_seq[ii]);
+ * }
+ * printf("\n---|---------------\n");
+ * jj = 0;
+ * for(unsigned int ii=0; ii<N; ++ii){
+ * if(((ii+1) % 4) == 1){
+ * printf("%.2x | ", in_seq[jj++]);
+ * }
+ * printf("%.2x ", z[ii]);
+ * if(!((ii+1) % 4)){
+ * printf("\n");
+ * }
+ * }
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32i_x2_or_32i_a_H
+#define INCLUDED_volk_32i_x2_or_32i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32i_x2_or_32i_a_sse(int32_t* cVector, const int32_t* aVector,
+ const int32_t* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
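+ // The integer data is reinterpreted as floats so _mm_or_ps can be used;
+ // a bitwise OR only touches bit patterns, so the values are unaffected.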
+ float* cPtr = (float*)cVector;
+ const float* aPtr = (float*)aVector;
+ const float* bPtr = (float*)bVector;
+
+ __m128 aVal, bVal, cVal;
+ for(;number < quarterPoints; number++){
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_or_ps(aVal, bVal);
+
+ _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ cVector[number] = aVector[number] | bVector[number];
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_32i_x2_or_32i_neon(int32_t* cVector, const int32_t* aVector,
+ const int32_t* bVector, unsigned int num_points)
+{
+ int32_t* cPtr = cVector;
+ const int32_t* aPtr = aVector;
+ const int32_t* bPtr= bVector;
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+
+ int32x4_t a_val, b_val, c_val;
+
+ for(number = 0; number < quarter_points; number++){
+ a_val = vld1q_s32(aPtr);
+ b_val = vld1q_s32(bPtr);
+ c_val = vorrq_s32(a_val, b_val);
+ vst1q_s32(cPtr, c_val);
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ for(number = quarter_points * 4; number < num_points; number++){
+ *cPtr++ = (*aPtr++) | (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32i_x2_or_32i_generic(int32_t* cVector, const int32_t* aVector,
+ const int32_t* bVector, unsigned int num_points)
+{
+ int32_t* cPtr = cVector;
+ const int32_t* aPtr = aVector;
+ const int32_t* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++) | (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_ORC
+extern void
+volk_32i_x2_or_32i_a_orc_impl(int32_t* cVector, const int32_t* aVector,
+ const int32_t* bVector, unsigned int num_points);
+
+static inline void
+volk_32i_x2_or_32i_u_orc(int32_t* cVector, const int32_t* aVector,
+ const int32_t* bVector, unsigned int num_points)
+{
+ volk_32i_x2_or_32i_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32i_x2_or_32i_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32u_byteswap
+ *
+ * \b Overview
+ *
+ * Byteswaps (in place) a vector of uint32_t values.
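+ *
+ * For example, 0x01020304 becomes 0x04030201.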
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32u_byteswap(uint32_t* intsToSwap, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li intsToSwap: The vector of data to byte swap.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li intsToSwap: The byteswapped data, returned in place.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ *
+ * uint32_t bitstring[] = {0x0, 0x1, 0xf, 0xffffffff,
+ * 0x5a5a5a5a, 0xa5a5a5a5, 0x2a2a2a2a,
+ * 0xffffffff, 0x32, 0x64};
+ *
+ * printf("byteswap vector =\n");
+ * for(unsigned int ii=0; ii<N; ++ii){
+ * printf(" %.8x\n", bitstring[ii]);
+ * }
+ *
+ * volk_32u_byteswap(bitstring, N);
+ *
+ * printf("byteswapped vector =\n");
+ * for(unsigned int ii=0; ii<N; ++ii){
+ * printf(" %.8x\n", bitstring[ii]);
+ * }
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32u_byteswap_u_H
+#define INCLUDED_volk_32u_byteswap_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_AVX2
+#include <immintrin.h>
+static inline void volk_32u_byteswap_u_avx2(uint32_t* intsToSwap, unsigned int num_points){
+
+ unsigned int number;
+
+ const unsigned int nPerSet = 8;
+ const uint64_t nSets = num_points / nPerSet;
+
+ uint32_t* inputPtr = intsToSwap;
+
+ const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 };
+
+ const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector);
+
+ for (number = 0 ;number < nSets; number++) {
+
+ // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
+ const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
+ const __m256i output = _mm256_shuffle_epi8(input,myShuffle);
+
+ // Store the results
+ _mm256_storeu_si256((__m256i*)inputPtr, output);
+ inputPtr += nPerSet;
+ }
+ _mm256_zeroupper();
+
+ // Byteswap any remaining points:
+ for(number = nSets * nPerSet; number < num_points; number++){
+ uint32_t outputVal = *inputPtr;
+ outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
+ *inputPtr = outputVal;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points){
+ unsigned int number = 0;
+
+ uint32_t* inputPtr = intsToSwap;
+ __m128i input, byte1, byte2, byte3, byte4, output;
+ __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
+ __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
+
+ const uint64_t quarterPoints = num_points / 4;
+ for(;number < quarterPoints; number++){
+ // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
+ input = _mm_loadu_si128((__m128i*)inputPtr);
+ // Do the four shifts
+ byte1 = _mm_slli_epi32(input, 24);
+ byte2 = _mm_slli_epi32(input, 8);
+ byte3 = _mm_srli_epi32(input, 8);
+ byte4 = _mm_srli_epi32(input, 24);
+ // Or bytes together
+ output = _mm_or_si128(byte1, byte4);
+ byte2 = _mm_and_si128(byte2, byte2mask);
+ output = _mm_or_si128(output, byte2);
+ byte3 = _mm_and_si128(byte3, byte3mask);
+ output = _mm_or_si128(output, byte3);
+ // Store the results
+ _mm_storeu_si128((__m128i*)inputPtr, output);
+ inputPtr += 4;
+ }
+
+ // Byteswap any remaining points:
+ number = quarterPoints*4;
+ for(; number < num_points; number++){
+ uint32_t outputVal = *inputPtr;
+ outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
+ *inputPtr = outputVal;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32u_byteswap_neon(uint32_t* intsToSwap, unsigned int num_points){
+ uint32_t* inputPtr = intsToSwap;
+ unsigned int number = 0;
+ unsigned int n8points = num_points / 8;
+
+ uint8x8x4_t input_table;
+ uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
+ uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
+
+ /* These magic numbers are used as byte indices into the LUT.
+    They are pre-computed to save time. A small C snippet can
+    calculate them; for example, for lookup01:
+      uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1};
+      uint64_t index = 0;
+      unsigned int ii;
+      for(ii = 0; ii < 8; ++ii) {
+        index += ((uint64_t)chars[ii]) << (ii*8);
+      }
+ */
+ int_lookup01 = vcreate_u8(74609667900706840);
+ int_lookup23 = vcreate_u8(219290013576860186);
+ int_lookup45 = vcreate_u8(363970359253013532);
+ int_lookup67 = vcreate_u8(508650704929166878);
+
+ for(number = 0; number < n8points; ++number){
+ input_table = vld4_u8((uint8_t*) inputPtr);
+ swapped_int01 = vtbl4_u8(input_table, int_lookup01);
+ swapped_int23 = vtbl4_u8(input_table, int_lookup23);
+ swapped_int45 = vtbl4_u8(input_table, int_lookup45);
+ swapped_int67 = vtbl4_u8(input_table, int_lookup67);
+ vst1_u8((uint8_t*) inputPtr, swapped_int01);
+ vst1_u8((uint8_t*) (inputPtr+2), swapped_int23);
+ vst1_u8((uint8_t*) (inputPtr+4), swapped_int45);
+ vst1_u8((uint8_t*) (inputPtr+6), swapped_int67);
+
+ inputPtr += 8;
+ }
+
+ for(number = n8points * 8; number < num_points; ++number){
+ uint32_t output = *inputPtr;
+ output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
+
+ *inputPtr = output;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap, unsigned int num_points){
+ uint32_t* inputPtr = intsToSwap;
+
+ unsigned int point;
+ for(point = 0; point < num_points; point++){
+ uint32_t output = *inputPtr;
+ output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
+
+ *inputPtr = output;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32u_byteswap_u_H */
+#ifndef INCLUDED_volk_32u_byteswap_a_H
+#define INCLUDED_volk_32u_byteswap_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+
+#if LV_HAVE_AVX2
+#include <immintrin.h>
+static inline void volk_32u_byteswap_a_avx2(uint32_t* intsToSwap, unsigned int num_points){
+
+ unsigned int number;
+
+ const unsigned int nPerSet = 8;
+ const uint64_t nSets = num_points / nPerSet;
+
+ uint32_t* inputPtr = intsToSwap;
+
+ const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 };
+
+ const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector);
+
+ for (number = 0 ;number < nSets; number++) {
+
+ // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
+ const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
+ const __m256i output = _mm256_shuffle_epi8(input,myShuffle);
+
+ // Store the results
+ _mm256_store_si256((__m256i*)inputPtr, output);
+ inputPtr += nPerSet;
+ }
+ _mm256_zeroupper();
+
+ // Byteswap any remaining points:
+ for(number = nSets * nPerSet; number < num_points; number++){
+ uint32_t outputVal = *inputPtr;
+ outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
+ *inputPtr = outputVal;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+
+static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int num_points){
+ unsigned int number = 0;
+
+ uint32_t* inputPtr = intsToSwap;
+ __m128i input, byte1, byte2, byte3, byte4, output;
+ __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
+ __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
+
+ const uint64_t quarterPoints = num_points / 4;
+ for(;number < quarterPoints; number++){
+ // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
+ input = _mm_load_si128((__m128i*)inputPtr);
+ // Do the four shifts
+ byte1 = _mm_slli_epi32(input, 24);
+ byte2 = _mm_slli_epi32(input, 8);
+ byte3 = _mm_srli_epi32(input, 8);
+ byte4 = _mm_srli_epi32(input, 24);
+ // Or bytes together
+ output = _mm_or_si128(byte1, byte4);
+ byte2 = _mm_and_si128(byte2, byte2mask);
+ output = _mm_or_si128(output, byte2);
+ byte3 = _mm_and_si128(byte3, byte3mask);
+ output = _mm_or_si128(output, byte3);
+ // Store the results
+ _mm_store_si128((__m128i*)inputPtr, output);
+ inputPtr += 4;
+ }
+
+ // Byteswap any remaining points:
+ number = quarterPoints*4;
+ for(; number < num_points; number++){
+ uint32_t outputVal = *inputPtr;
+ outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
+ *inputPtr = outputVal;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32u_byteswap_a_generic(uint32_t* intsToSwap, unsigned int num_points){
+ uint32_t* inputPtr = intsToSwap;
+
+ unsigned int point;
+ for(point = 0; point < num_points; point++){
+ uint32_t output = *inputPtr;
+ output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
+
+ *inputPtr = output;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32u_byteswap_a_H */
--- /dev/null
+#ifndef INCLUDED_volk_32u_byteswappuppet_32u_H
+#define INCLUDED_volk_32u_byteswappuppet_32u_H
+
+#include <volk/volk_32u_byteswap.h>
+#include <stdint.h>
+#include <string.h>
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_32u_byteswappuppet_32u_generic(uint32_t*output, uint32_t* intsToSwap, unsigned int num_points){
+
+ volk_32u_byteswap_generic((uint32_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_NEON
+static inline void volk_32u_byteswappuppet_32u_neon(uint32_t*output, uint32_t* intsToSwap, unsigned int num_points){
+
+ volk_32u_byteswap_neon((uint32_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_SSE2
+static inline void volk_32u_byteswappuppet_32u_u_sse2(uint32_t *output, uint32_t* intsToSwap, unsigned int num_points){
+
+ volk_32u_byteswap_u_sse2((uint32_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_SSE2
+static inline void volk_32u_byteswappuppet_32u_a_sse2(uint32_t* output, uint32_t* intsToSwap, unsigned int num_points){
+
+ volk_32u_byteswap_a_sse2((uint32_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_AVX2
+static inline void volk_32u_byteswappuppet_32u_u_avx2(uint32_t* output, uint32_t* intsToSwap, unsigned int num_points){
+
+ volk_32u_byteswap_u_avx2((uint32_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_AVX2
+static inline void volk_32u_byteswappuppet_32u_a_avx2(uint32_t* output, uint32_t* intsToSwap, unsigned int num_points){
+
+ volk_32u_byteswap_a_avx2((uint32_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
+
+}
+#endif
+
+#endif
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32u_popcnt
+ *
+ * \b Overview
+ *
+ * Computes the population count (popcnt), i.e. the Hamming weight, of a
+ * binary string. This kernel takes a single unsigned 32-bit value and
+ * returns the count of 1 bits that the value contains.
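+ * For example, the popcnt of 0x55555555 is 16.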
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32u_popcnt(uint32_t* ret, const uint32_t value)
+ * \endcode
+ *
+ * \b Inputs
+ * \li value: The input value.
+ *
+ * \b Outputs
+ * \li ret: The return value containing the popcnt.
+ *
+ * \b Example
+ * \code
+ *   uint32_t bitstring = 0x55555555;
+ *   uint32_t hamming_distance = 0;
+ *
+ *   volk_32u_popcnt(&hamming_distance, bitstring);
+ *   printf("popcnt of 0x%x = %u\n", bitstring, hamming_distance);
+ * \endcode
+ */
+
+#ifndef INCLUDED_VOLK_32u_POPCNT_A16_H
+#define INCLUDED_VOLK_32u_POPCNT_A16_H
+
+#include <stdio.h>
+#include <inttypes.h>
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32u_popcnt_generic(uint32_t* ret, const uint32_t value)
+{
+ // This is faster than a lookup table
+ uint32_t retVal = value;
+
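+ // Parallel (SWAR) bit count: sum adjacent bit pairs, then nibbles, then
+ // bytes and halfwords; the final mask keeps the 6-bit result (0..32).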
+ retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
+ retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
+ retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
+ retVal = (retVal + (retVal >> 8));
+ retVal = (retVal + (retVal >> 16)) & 0x0000003F;
+
+ *ret = retVal;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE4_2
+
+#include <nmmintrin.h>
+
+static inline void
+volk_32u_popcnt_a_sse4_2(uint32_t* ret, const uint32_t value)
+{
+ *ret = _mm_popcnt_u32(value);
+}
+
+#endif /*LV_HAVE_SSE4_2*/
+
+#endif /*INCLUDED_VOLK_32u_POPCNT_A16_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef INCLUDED_volk_32u_popcntpuppet_32u_H
+#define INCLUDED_volk_32u_popcntpuppet_32u_H
+
+#include <stdint.h>
+#include <volk/volk_32u_popcnt.h>
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_32u_popcntpuppet_32u_generic(uint32_t* outVector, const uint32_t* inVector, unsigned int num_points){
+ unsigned int ii;
+ for(ii=0; ii < num_points; ++ii) {
+ volk_32u_popcnt_generic(outVector+ii, *(inVector+ii) );
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_SSE4_2
+static inline void volk_32u_popcntpuppet_32u_a_sse4_2(uint32_t* outVector, const uint32_t* inVector, unsigned int num_points){
+ unsigned int ii;
+ for(ii=0; ii < num_points; ++ii) {
+ volk_32u_popcnt_a_sse4_2(outVector+ii, *(inVector+ii) );
+ }
+}
+#endif /* LV_HAVE_SSE4_2 */
+
+#endif /* INCLUDED_volk_32u_popcntpuppet_32u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_64f_convert_32f
+ *
+ * \b Overview
+ *
+ * Converts doubles into floats.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_64f_convert_32f(float* outputVector, const double* inputVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The vector of doubles to convert to floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li outputVector: returns the converted floats.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * double* increasing = (double*)volk_malloc(sizeof(double)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = (double)ii;
+ * }
+ *
+ * volk_64f_convert_32f(out, increasing, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %1.2f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_64f_convert_32f_u_H
+#define INCLUDED_volk_64f_convert_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_64f_convert_32f_u_sse2(float* outputVector, const double* inputVector, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const double* inputVectorPtr = (const double*)inputVector;
+ float* outputVectorPtr = outputVector;
+ __m128 ret, ret2;
+ __m128d inputVal1, inputVal2;
+
+ for(;number < quarterPoints; number++){
+ inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2;
+ inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2;
+
+ ret = _mm_cvtpd_ps(inputVal1);
+ ret2 = _mm_cvtpd_ps(inputVal2);
+
+ ret = _mm_movelh_ps(ret, ret2);
+
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ outputVector[number] = (float)(inputVector[number]);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_64f_convert_32f_generic(float* outputVector, const double* inputVector, unsigned int num_points){
+ float* outputVectorPtr = outputVector;
+ const double* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_64f_convert_32f_u_H */
+#ifndef INCLUDED_volk_64f_convert_32f_a_H
+#define INCLUDED_volk_64f_convert_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_64f_convert_32f_a_sse2(float* outputVector, const double* inputVector, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const double* inputVectorPtr = (const double*)inputVector;
+ float* outputVectorPtr = outputVector;
+ __m128 ret, ret2;
+ __m128d inputVal1, inputVal2;
+
+ for(;number < quarterPoints; number++){
+ inputVal1 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2;
+ inputVal2 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2;
+
+ ret = _mm_cvtpd_ps(inputVal1);
+ ret2 = _mm_cvtpd_ps(inputVal2);
+
+ ret = _mm_movelh_ps(ret, ret2);
+
+ _mm_store_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ outputVector[number] = (float)(inputVector[number]);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_64f_convert_32f_a_generic(float* outputVector, const double* inputVector, unsigned int num_points){
+ float* outputVectorPtr = outputVector;
+ const double* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_64f_convert_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_64f_x2_max_64f
+ *
+ * \b Overview
+ *
+ * Selects the maximum value from each entry of bVector and aVector
+ * and stores the results in cVector.
+ *
+ * c[i] = max(a[i], b[i])
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_64f_x2_max_64f(double* cVector, const double* aVector, const double* bVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: First input vector.
+ * \li bVector: Second input vector.
+ * \li num_points: The number of values in both input vectors.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * double* increasing = (double*)volk_malloc(sizeof(double)*N, alignment);
+ * double* decreasing = (double*)volk_malloc(sizeof(double)*N, alignment);
+ * double* out = (double*)volk_malloc(sizeof(double)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = (double)ii;
+ * decreasing[ii] = 10.f - (double)ii;
+ * }
+ *
+ * volk_64f_x2_max_64f(out, increasing, decreasing, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %1.2g\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(decreasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_64f_x2_max_64f_a_H
+#define INCLUDED_volk_64f_x2_max_64f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void
+volk_64f_x2_max_64f_a_sse2(double* cVector, const double* aVector,
+ const double* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr= bVector;
+
+ __m128d aVal, bVal, cVal;
+ for(;number < halfPoints; number++){
+
+ aVal = _mm_load_pd(aPtr);
+ bVal = _mm_load_pd(bPtr);
+
+ cVal = _mm_max_pd(aVal, bVal);
+
+ _mm_store_pd(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 2;
+ bPtr += 2;
+ cPtr += 2;
+ }
+
+ number = halfPoints * 2;
+ for(;number < num_points; number++){
+ const double a = *aPtr++;
+ const double b = *bPtr++;
+ *cPtr++ = ( a > b ? a : b);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_64f_x2_max_64f_generic(double* cVector, const double* aVector,
+ const double* bVector, unsigned int num_points)
+{
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ const double a = *aPtr++;
+ const double b = *bPtr++;
+ *cPtr++ = ( a > b ? a : b);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_64f_x2_max_64f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_64f_x2_min_64f
+ *
+ * \b Overview
+ *
+ * Selects the minimum value from each entry of bVector and aVector
+ * and stores the results in cVector.
+ *
+ * c[i] = min(a[i], b[i])
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_64f_x2_min_64f(double* cVector, const double* aVector, const double* bVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: First input vector.
+ * \li bVector: Second input vector.
+ * \li num_points: The number of values in both input vectors.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * double* increasing = (double*)volk_malloc(sizeof(double)*N, alignment);
+ * double* decreasing = (double*)volk_malloc(sizeof(double)*N, alignment);
+ * double* out = (double*)volk_malloc(sizeof(double)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ *     increasing[ii] = (double)ii;
+ *     decreasing[ii] = 10.f - (double)ii;
+ * }
+ *
+ * volk_64f_x2_min_64f(out, increasing, decreasing, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ *     printf("out[%u] = %1.2g\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(decreasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_64f_x2_min_64f_a_H
+#define INCLUDED_volk_64f_x2_min_64f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void
+volk_64f_x2_min_64f_a_sse2(double* cVector, const double* aVector,
+ const double* bVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr= bVector;
+
+ __m128d aVal, bVal, cVal;
+ for(;number < halfPoints; number++){
+
+ aVal = _mm_load_pd(aPtr);
+ bVal = _mm_load_pd(bPtr);
+
+ cVal = _mm_min_pd(aVal, bVal);
+
+ _mm_store_pd(cPtr,cVal); // Store the results back into the C container
+
+ aPtr += 2;
+ bPtr += 2;
+ cPtr += 2;
+ }
+
+ number = halfPoints * 2;
+ for(;number < num_points; number++){
+ const double a = *aPtr++;
+ const double b = *bPtr++;
+ *cPtr++ = ( a < b ? a : b);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_64f_x2_min_64f_generic(double* cVector, const double* aVector,
+ const double* bVector, unsigned int num_points)
+{
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ const double a = *aPtr++;
+ const double b = *bPtr++;
+ *cPtr++ = ( a < b ? a : b);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_64f_x2_min_64f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_64u_byteswap
+ *
+ * \b Overview
+ *
+ * Byteswaps (in place) a vector of uint64_t values.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_64u_byteswap(uint64_t* intsToSwap, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li intsToSwap: The vector of data to byte swap
+ * \li num_points: The number of data points
+ *
+ * \b Outputs
+ * \li intsToSwap: The byteswapped data, computed in place.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ *
+ * uint64_t bitstring[] = {0x0, 0x1, 0xf, 0xffffffffffffffff,
+ * 0x5a5a5a5a5a5a5a5a, 0xa5a5a5a5a5a5a5a5, 0x2a2a2a2a2a2a2a2a,
+ * 0xffffffff, 0x32, 0x64};
+ *
+ * printf("byteswap vector =\n");
+ * for(unsigned int ii=0; ii<N; ++ii){
+ * printf(" %.16lx\n", bitstring[ii]);
+ * }
+ *
+ * volk_64u_byteswap(bitstring, N);
+ *
+ * printf("byteswapped vector =\n");
+ * for(unsigned int ii=0; ii<N; ++ii){
+ * printf(" %.16lx\n", bitstring[ii]);
+ * }
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_64u_byteswap_u_H
+#define INCLUDED_volk_64u_byteswap_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points){
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+ __m128i input, byte1, byte2, byte3, byte4, output;
+ __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
+ __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
+ uint64_t number = 0;
+ const unsigned int halfPoints = num_points / 2;
+ for(;number < halfPoints; number++){
+ // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
+ input = _mm_loadu_si128((__m128i*)inputPtr);
+
+ // Do the four shifts
+ byte1 = _mm_slli_epi32(input, 24);
+ byte2 = _mm_slli_epi32(input, 8);
+ byte3 = _mm_srli_epi32(input, 8);
+ byte4 = _mm_srli_epi32(input, 24);
+ // Or bytes together
+ output = _mm_or_si128(byte1, byte4);
+ byte2 = _mm_and_si128(byte2, byte2mask);
+ output = _mm_or_si128(output, byte2);
+ byte3 = _mm_and_si128(byte3, byte3mask);
+ output = _mm_or_si128(output, byte3);
+
+ // Reorder the two words
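+    // (each 32-bit half was byteswapped above; swapping the adjacent words completes the 64-bit swap)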
+ output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
+
+ // Store the results
+ _mm_storeu_si128((__m128i*)inputPtr, output);
+ inputPtr += 4;
+ }
+
+ // Byteswap any remaining points:
+ number = halfPoints*2;
+ for(; number < num_points; number++){
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
+
+ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+
+ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+
+ *inputPtr++ = output2;
+ *inputPtr++ = output1;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap, unsigned int num_points){
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+ unsigned int point;
+ for(point = 0; point < num_points; point++){
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
+
+ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+
+ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+
+ *inputPtr++ = output2;
+ *inputPtr++ = output1;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_AVX2
+#include <immintrin.h>
+static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap, unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int nPerSet = 4;
+ const uint64_t nSets = num_points / nPerSet;
+
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+
+ const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
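+  // Each group of 8 descending indices reverses one uint64_t's bytes;
+  // _mm256_shuffle_epi8 applies the indices within each 128-bit lane.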
+
+ const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector[0]);
+
+ for ( ;number < nSets; number++ ) {
+
+ // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
+ const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
+ const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
+
+ // Store the results
+ _mm256_store_si256((__m256i*)inputPtr, output);
+
+ /* inputPtr is 32bit so increment twice */
+ inputPtr += 2 * nPerSet;
+ }
+ _mm256_zeroupper();
+
+ // Byteswap any remaining points:
+ for(number = nSets * nPerSet; number < num_points; ++number ) {
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
+ uint32_t out1 = ((((output1) >> 24) & 0x000000ff) |
+ (((output1) >> 8) & 0x0000ff00) |
+ (((output1) << 8) & 0x00ff0000) |
+ (((output1) << 24) & 0xff000000) );
+
+ uint32_t out2 = ((((output2) >> 24) & 0x000000ff) |
+ (((output2) >> 8) & 0x0000ff00) |
+ (((output2) << 8) & 0x00ff0000) |
+ (((output2) << 24) & 0xff000000) );
+ *inputPtr++ = out2;
+ *inputPtr++ = out1;
+ }
+}
+
+#endif /* LV_HAVE_AVX2 */
+
+
+#if LV_HAVE_SSSE3
+#include <tmmintrin.h>
+static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap, unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int nPerSet = 2;
+ const uint64_t nSets = num_points / nPerSet;
+
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+
+ uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+
+ const __m128i myShuffle = _mm_loadu_si128((__m128i*) &shuffleVector);
+
+ for ( ;number < nSets; number++ ) {
+
+ // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
+ const __m128i input = _mm_load_si128((__m128i*)inputPtr);
+ const __m128i output = _mm_shuffle_epi8(input,myShuffle);
+
+ // Store the results
+ _mm_store_si128((__m128i*)inputPtr, output);
+
+ /* inputPtr is 32bit so increment twice */
+ inputPtr += 2 * nPerSet;
+ }
+
+ // Byteswap any remaining points:
+ for(number = nSets * nPerSet; number < num_points; ++number ) {
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
+ uint32_t out1 = ((((output1) >> 24) & 0x000000ff) |
+ (((output1) >> 8) & 0x0000ff00) |
+ (((output1) << 8) & 0x00ff0000) |
+ (((output1) << 24) & 0xff000000) );
+
+ uint32_t out2 = ((((output2) >> 24) & 0x000000ff) |
+ (((output2) >> 8) & 0x0000ff00) |
+ (((output2) << 8) & 0x00ff0000) |
+ (((output2) << 24) & 0xff000000) );
+ *inputPtr++ = out2;
+ *inputPtr++ = out1;
+ }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num_points){
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+ unsigned int number = 0;
+ unsigned int n8points = num_points / 4;
+
+ uint8x8x4_t input_table;
+ uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
+ uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
+
+ /* these magic numbers are used as byte-indices in the LUT.
+ they are pre-computed to save time. A simple C program
+ can calculate them; for example for lookup01:
+ uint64_t index = 0;
+ uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1};
+ for(unsigned int ii = 0; ii < 8; ++ii) {
+ index += ((uint64_t)(*(chars+ii))) << (ii*8);
+ }
+ */
+ int_lookup01 = vcreate_u8(2269495096316185);
+ int_lookup23 = vcreate_u8(146949840772469531);
+ int_lookup45 = vcreate_u8(291630186448622877);
+ int_lookup67 = vcreate_u8(436310532124776223);
+
+ for(number = 0; number < n8points; ++number){
+ input_table = vld4_u8((uint8_t*) inputPtr);
+ swapped_int01 = vtbl4_u8(input_table, int_lookup01);
+ swapped_int23 = vtbl4_u8(input_table, int_lookup23);
+ swapped_int45 = vtbl4_u8(input_table, int_lookup45);
+ swapped_int67 = vtbl4_u8(input_table, int_lookup67);
+ vst1_u8((uint8_t*) inputPtr, swapped_int01);
+ vst1_u8((uint8_t*) (inputPtr+2), swapped_int23);
+ vst1_u8((uint8_t*) (inputPtr+4), swapped_int45);
+ vst1_u8((uint8_t*) (inputPtr+6), swapped_int67);
+
+ inputPtr += 8; // 32 bytes (8 uint32_t) were byteswapped this iteration
+ }
+
+ for(number = n8points * 4; number < num_points; ++number){
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
+
+ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+
+ *inputPtr++ = output2;
+ *inputPtr++ = output1;
+ }
+
+}
+#endif /* LV_HAVE_NEON */
+
+
+#endif /* INCLUDED_volk_64u_byteswap_u_H */
+#ifndef INCLUDED_volk_64u_byteswap_a_H
+#define INCLUDED_volk_64u_byteswap_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int num_points){
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+ __m128i input, byte1, byte2, byte3, byte4, output;
+ __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
+ __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
+ uint64_t number = 0;
+ const unsigned int halfPoints = num_points / 2;
+ for(;number < halfPoints; number++){
+ // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
+ input = _mm_load_si128((__m128i*)inputPtr);
+
+ // Do the four shifts
+ byte1 = _mm_slli_epi32(input, 24);
+ byte2 = _mm_slli_epi32(input, 8);
+ byte3 = _mm_srli_epi32(input, 8);
+ byte4 = _mm_srli_epi32(input, 24);
+ // Or bytes together
+ output = _mm_or_si128(byte1, byte4);
+ byte2 = _mm_and_si128(byte2, byte2mask);
+ output = _mm_or_si128(output, byte2);
+ byte3 = _mm_and_si128(byte3, byte3mask);
+ output = _mm_or_si128(output, byte3);
+
+ // Reorder the two words
+ output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
+
+ // Store the results
+ _mm_store_si128((__m128i*)inputPtr, output);
+ inputPtr += 4;
+ }
+
+ // Byteswap any remaining points:
+ number = halfPoints*2;
+ for(; number < num_points; number++){
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
+
+ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+
+ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+
+ *inputPtr++ = output2;
+ *inputPtr++ = output1;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_AVX2
+#include <immintrin.h>
+static inline void volk_64u_byteswap_u_avx2(uint64_t* intsToSwap, unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int nPerSet = 4;
+ const uint64_t nSets = num_points / nPerSet;
+
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+
+ const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
+
+ const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector[0]);
+
+ for ( ;number < nSets; number++ ) {
+ // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
+ const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
+ const __m256i output = _mm256_shuffle_epi8(input,myShuffle);
+
+ // Store the results
+ _mm256_storeu_si256((__m256i*)inputPtr, output);
+
+ /* inputPtr is 32bit so increment twice */
+ inputPtr += 2 * nPerSet;
+ }
+ _mm256_zeroupper();
+
+ // Byteswap any remaining points:
+ for(number = nSets * nPerSet; number < num_points; ++number ) {
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
+ uint32_t out1 = ((((output1) >> 24) & 0x000000ff) |
+ (((output1) >> 8) & 0x0000ff00) |
+ (((output1) << 8) & 0x00ff0000) |
+ (((output1) << 24) & 0xff000000) );
+
+ uint32_t out2 = ((((output2) >> 24) & 0x000000ff) |
+ (((output2) >> 8) & 0x0000ff00) |
+ (((output2) << 8) & 0x00ff0000) |
+ (((output2) << 24) & 0xff000000) );
+ *inputPtr++ = out2;
+ *inputPtr++ = out1;
+ }
+}
+
+#endif /* LV_HAVE_AVX2 */
+
+
+#if LV_HAVE_SSSE3
+#include <tmmintrin.h>
+static inline void volk_64u_byteswap_u_ssse3(uint64_t* intsToSwap, unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int nPerSet = 2;
+ const uint64_t nSets = num_points / nPerSet;
+
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+
+ uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+
+ const __m128i myShuffle = _mm_loadu_si128((__m128i*) &shuffleVector);
+
+ for ( ;number < nSets; number++ ) {
+ // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
+ const __m128i input = _mm_loadu_si128((__m128i*)inputPtr);
+ const __m128i output = _mm_shuffle_epi8(input,myShuffle);
+
+ // Store the results
+ _mm_storeu_si128((__m128i*)inputPtr, output);
+
+ /* inputPtr is 32bit so increment twice */
+ inputPtr += 2 * nPerSet;
+ }
+
+ // Byteswap any remaining points:
+ for(number = nSets * nPerSet; number < num_points; ++number ) {
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
+ uint32_t out1 = ((((output1) >> 24) & 0x000000ff) |
+ (((output1) >> 8) & 0x0000ff00) |
+ (((output1) << 8) & 0x00ff0000) |
+ (((output1) << 24) & 0xff000000) );
+
+ uint32_t out2 = ((((output2) >> 24) & 0x000000ff) |
+ (((output2) >> 8) & 0x0000ff00) |
+ (((output2) << 8) & 0x00ff0000) |
+ (((output2) << 24) & 0xff000000) );
+ *inputPtr++ = out2;
+ *inputPtr++ = out1;
+ }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap, unsigned int num_points){
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+ unsigned int point;
+ for(point = 0; point < num_points; point++){
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
+
+ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+
+ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+
+ *inputPtr++ = output2;
+ *inputPtr++ = output1;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_64u_byteswap_a_H */
--- /dev/null
+#ifndef INCLUDED_volk_64u_byteswappuppet_64u_H
+#define INCLUDED_volk_64u_byteswappuppet_64u_H
+
+
+#include <stdint.h>
+#include <volk/volk_64u_byteswap.h>
+#include <string.h>
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_64u_byteswappuppet_64u_generic(uint64_t*output, uint64_t* intsToSwap, unsigned int num_points){
+
+ volk_64u_byteswap_generic((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_NEON
+static inline void volk_64u_byteswappuppet_64u_neon(uint64_t*output, uint64_t* intsToSwap, unsigned int num_points){
+
+ volk_64u_byteswap_neon((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_SSE2
+static inline void volk_64u_byteswappuppet_64u_u_sse2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){
+
+ volk_64u_byteswap_u_sse2((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_SSE2
+static inline void volk_64u_byteswappuppet_64u_a_sse2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){
+
+ volk_64u_byteswap_a_sse2((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_SSSE3
+static inline void volk_64u_byteswappuppet_64u_u_ssse3(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){
+
+ volk_64u_byteswap_u_ssse3((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_SSSE3
+static inline void volk_64u_byteswappuppet_64u_a_ssse3(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){
+
+ volk_64u_byteswap_a_ssse3((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_AVX2
+static inline void volk_64u_byteswappuppet_64u_u_avx2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){
+
+ volk_64u_byteswap_u_avx2((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_AVX2
+static inline void volk_64u_byteswappuppet_64u_a_avx2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){
+
+ volk_64u_byteswap_a_avx2((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+
+}
+#endif
+
+#endif
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_64u_popcnt
+ *
+ * \b Overview
+ *
+ * Computes the population count (popcnt), i.e. the Hamming weight of a
+ * binary string. This kernel takes in a single unsigned 64-bit value
+ * and returns the count of 1's that the value contains.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_64u_popcnt(uint64_t* ret, const uint64_t value)
+ * \endcode
+ *
+ * \b Inputs
+ * \li value: The input value.
+ *
+ * \b Outputs
+ * \li ret: The return value containing the popcnt.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ *
+ * uint64_t bitstring[] = {0x0, 0x1, 0xf, 0xffffffffffffffff,
+ * 0x5555555555555555, 0xaaaaaaaaaaaaaaaa, 0x2a2a2a2a2a2a2a2a,
+ * 0xffffffff, 0x32, 0x64};
+ * uint64_t hamming_distance = 0;
+ *
+ * for(unsigned int ii=0; ii<N; ++ii){
+ * volk_64u_popcnt(&hamming_distance, bitstring[ii]);
+ * printf("hamming distance of %lx = %li\n", bitstring[ii], hamming_distance);
+ * }
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_64u_popcnt_a_H
+#define INCLUDED_volk_64u_popcnt_a_H
+
+#include <stdio.h>
+#include <inttypes.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void
+volk_64u_popcnt_generic(uint64_t* ret, const uint64_t value)
+{
+ //const uint32_t* valueVector = (const uint32_t*)&value;
+
+ // This is faster than a lookup table
+ //uint32_t retVal = valueVector[0];
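+  // Count the set bits in the low 32-bit half first, then the high half, and add the two counts.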
+ uint32_t retVal = (uint32_t)(value & 0x00000000FFFFFFFFull);
+
+ retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
+ retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
+ retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
+ retVal = (retVal + (retVal >> 8));
+ retVal = (retVal + (retVal >> 16)) & 0x0000003F;
+ uint64_t retVal64 = retVal;
+
+ //retVal = valueVector[1];
+ retVal = (uint32_t)((value & 0xFFFFFFFF00000000ull) >> 32);
+ retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
+ retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
+ retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
+ retVal = (retVal + (retVal >> 8));
+ retVal = (retVal + (retVal >> 16)) & 0x0000003F;
+ retVal64 += retVal;
+
+ *ret = retVal64;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE4_2 && LV_HAVE_64
+
+#include <nmmintrin.h>
+
+static inline void volk_64u_popcnt_a_sse4_2(uint64_t* ret, const uint64_t value)
+{
+ *ret = _mm_popcnt_u64(value);
+}
+
+#endif /*LV_HAVE_SSE4_2*/
+
+
+#if LV_HAVE_NEON
+#include <arm_neon.h>
+static inline void volk_64u_popcnt_neon(uint64_t* ret, const uint64_t value)
+{
+ uint8x8_t input_val, count8x8_val;
+ uint16x4_t count16x4_val;
+ uint32x2_t count32x2_val;
+ uint64x1_t count64x1_val;
+
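+  // vcnt_u8 gives a per-byte bit count; the pairwise add-long steps then
+  // widen and accumulate those counts into a single 64-bit total.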
+ input_val = vld1_u8((unsigned char *) &value);
+ count8x8_val = vcnt_u8(input_val);
+ count16x4_val = vpaddl_u8(count8x8_val);
+ count32x2_val = vpaddl_u16(count16x4_val);
+ count64x1_val = vpaddl_u32(count32x2_val);
+ vst1_u64(ret, count64x1_val);
+
+ //*ret = _mm_popcnt_u64(value);
+}
+#endif /*LV_HAVE_NEON*/
+
+
+#endif /*INCLUDED_volk_64u_popcnt_a_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef INCLUDED_volk_64u_popcntpuppet_64u_H
+#define INCLUDED_volk_64u_popcntpuppet_64u_H
+
+#include <volk/volk_64u_popcnt.h>
+#include <stdint.h>
+#include <string.h>
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_64u_popcntpuppet_64u_generic(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points){
+ unsigned int ii;
+ for(ii=0; ii < num_points; ++ii) {
+ volk_64u_popcnt_generic(outVector+ii, *(inVector+ii) );
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_SSE4_2 && LV_HAVE_64
+static inline void volk_64u_popcntpuppet_64u_a_sse4_2(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points){
+ unsigned int ii;
+ for(ii=0; ii < num_points; ++ii) {
+ volk_64u_popcnt_a_sse4_2(outVector+ii, *(inVector+ii) );
+ }
+}
+#endif /* LV_HAVE_SSE4_2 */
+
+#ifdef LV_HAVE_NEON
+static inline void volk_64u_popcntpuppet_64u_neon(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points){
+ unsigned int ii;
+ for(ii=0; ii < num_points; ++ii) {
+ volk_64u_popcnt_neon(outVector+ii, *(inVector+ii) );
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+#endif /* INCLUDED_volk_64u_popcntpuppet_64u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_8i_convert_16i
+ *
+ * \b Overview
+ *
+ * Converts the input vector of 8-bit chars to a vector of 16-bit
+ * shorts, scaling each value by 256 (left shift by 8 bits).
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_8i_convert_16i(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The input vector of 8-bit chars.
+ * \li num_points: The number of values.
+ *
+ * \b Outputs
+ * \li outputVector: The output 16-bit shorts.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * int8_t* in = (int8_t*)volk_malloc(sizeof(int8_t)*N, alignment);
+ * int16_t* out = (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment);
+ *
+ * // fill in[] with 8-bit samples, then convert (each output is input * 256)
+ * volk_8i_convert_16i(out, in, N);
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_8i_convert_16i_u_H
+#define INCLUDED_volk_8i_convert_16i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_8i_convert_16i_u_sse4_1(int16_t* outputVector, const int8_t* inputVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ const __m128i* inputVectorPtr = (const __m128i*)inputVector;
+ __m128i* outputVectorPtr = (__m128i*)outputVector;
+ __m128i inputVal;
+ __m128i ret;
+
+ for(;number < sixteenthPoints; number++){
+ inputVal = _mm_loadu_si128(inputVectorPtr);
+ ret = _mm_cvtepi8_epi16(inputVal);
+ ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+ _mm_storeu_si128(outputVectorPtr, ret);
+
+ outputVectorPtr++;
+
+ inputVal = _mm_srli_si128(inputVal, 8);
+ ret = _mm_cvtepi8_epi16(inputVal);
+ ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+ _mm_storeu_si128(outputVectorPtr, ret);
+
+ outputVectorPtr++;
+
+ inputVectorPtr++;
+ }
+
+ number = sixteenthPoints * 16;
+ for(; number < num_points; number++){
+ outputVector[number] = (int16_t)(inputVector[number])*256;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_8i_convert_16i_generic(int16_t* outputVector, const int8_t* inputVector,
+ unsigned int num_points)
+{
+ int16_t* outputVectorPtr = outputVector;
+ const int8_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_8i_convert_16i_u_H */
+
+
+
+#ifndef INCLUDED_volk_8i_convert_16i_a_H
+#define INCLUDED_volk_8i_convert_16i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_8i_convert_16i_a_sse4_1(int16_t* outputVector, const int8_t* inputVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ const __m128i* inputVectorPtr = (const __m128i*)inputVector;
+ __m128i* outputVectorPtr = (__m128i*)outputVector;
+ __m128i inputVal;
+ __m128i ret;
+
+ for(;number < sixteenthPoints; number++){
+ inputVal = _mm_load_si128(inputVectorPtr);
+ ret = _mm_cvtepi8_epi16(inputVal);
+ ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+ _mm_store_si128(outputVectorPtr, ret);
+
+ outputVectorPtr++;
+
+ inputVal = _mm_srli_si128(inputVal, 8);
+ ret = _mm_cvtepi8_epi16(inputVal);
+ ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+ _mm_store_si128(outputVectorPtr, ret);
+
+ outputVectorPtr++;
+
+ inputVectorPtr++;
+ }
+
+ number = sixteenthPoints * 16;
+ for(; number < num_points; number++){
+ outputVector[number] = (int16_t)(inputVector[number])*256;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_8i_convert_16i_a_generic(int16_t* outputVector, const int8_t* inputVector,
+ unsigned int num_points)
+{
+ int16_t* outputVectorPtr = outputVector;
+ const int8_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_8i_convert_16i_neon(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points)
+{
+ int16_t* outputVectorPtr = outputVector;
+ const int8_t* inputVectorPtr = inputVector;
+ unsigned int number;
+ const unsigned int eighth_points = num_points / 8;
+
+ int8x8_t input_vec ;
+ int16x8_t converted_vec;
+
+ // NEON doesn't have a concept of 8 bit registers, so we are really
+ // dealing with the low half of 16-bit registers. Since this requires
+ // a move instruction we likely do better with ASM here.
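+  // vmovl_s8 sign-extends eight chars to 16 bits; the left shift by 8 applies
+  // the same *256 scaling as the SSE and generic implementations.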
+ for(number = 0; number < eighth_points; ++number) {
+ input_vec = vld1_s8(inputVectorPtr);
+ converted_vec = vmovl_s8(input_vec);
+ //converted_vec = vmulq_s16(converted_vec, scale_factor);
+ converted_vec = vshlq_n_s16(converted_vec, 8);
+ vst1q_s16( outputVectorPtr, converted_vec);
+
+ inputVectorPtr += 8;
+ outputVectorPtr += 8;
+ }
+
+ for(number = eighth_points * 8; number < num_points; number++){
+ *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_ORC
+extern void
+volk_8i_convert_16i_a_orc_impl(int16_t* outputVector, const int8_t* inputVector,
+ unsigned int num_points);
+
+static inline void
+volk_8i_convert_16i_u_orc(int16_t* outputVector, const int8_t* inputVector,
+ unsigned int num_points)
+{
+ volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+#endif /* INCLUDED_volk_8i_convert_16i_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_8i_s32f_convert_32f
+ *
+ * \b Overview
+ *
+ * Converts the input vector of 8-bit chars to a vector of floats. The
+ * floats are then divided by the scalar factor.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_8i_s32f_convert_32f(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The input vector of 8-bit chars.
+ * \li scalar: the scaling factor used to divide the results of the conversion.
+ * \li num_points: The number of values.
+ *
+ * \b Outputs
+ * \li outputVector: The output vector of floats.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * int8_t* in = (int8_t*)volk_malloc(sizeof(int8_t)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * // fill in[] with 8-bit samples, then convert and scale by 1/128
+ * volk_8i_s32f_convert_32f(out, in, 128.f, N);
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H
+#define INCLUDED_volk_8i_s32f_convert_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector, const int8_t* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float* outputVectorPtr = outputVector;
+ const float iScalar = 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1( iScalar );
+ const int8_t* inputVectorPtr = inputVector;
+ __m128 ret;
+ __m128i inputVal;
+ __m128i interimVal;
+
+ for(;number < sixteenthPoints; number++){
+ inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr);
+
+ interimVal = _mm_cvtepi8_epi32(inputVal);
+ ret = _mm_cvtepi32_ps(interimVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ inputVal = _mm_srli_si128(inputVal, 4);
+ interimVal = _mm_cvtepi8_epi32(inputVal);
+ ret = _mm_cvtepi32_ps(interimVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ inputVal = _mm_srli_si128(inputVal, 4);
+ interimVal = _mm_cvtepi8_epi32(inputVal);
+ ret = _mm_cvtepi32_ps(interimVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ inputVal = _mm_srli_si128(inputVal, 4);
+ interimVal = _mm_cvtepi8_epi32(inputVal);
+ ret = _mm_cvtepi32_ps(interimVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ inputVectorPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for(; number < num_points; number++){
+ outputVector[number] = (float)(inputVector[number]) * iScalar;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_8i_s32f_convert_32f_generic(float* outputVector, const int8_t* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ float* outputVectorPtr = outputVector;
+ const int8_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ const float iScalar = 1.0 / scalar;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+#endif /* INCLUDED_volk_8i_s32f_convert_32f_u_H */
+#ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H
+#define INCLUDED_volk_8i_s32f_convert_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector, const int8_t* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float* outputVectorPtr = outputVector;
+ const float iScalar = 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ const int8_t* inputVectorPtr = inputVector;
+ __m128 ret;
+ __m128i inputVal;
+ __m128i interimVal;
+
+ for(;number < sixteenthPoints; number++){
+ inputVal = _mm_load_si128((__m128i*)inputVectorPtr);
+
+ interimVal = _mm_cvtepi8_epi32(inputVal);
+ ret = _mm_cvtepi32_ps(interimVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_store_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ inputVal = _mm_srli_si128(inputVal, 4);
+ interimVal = _mm_cvtepi8_epi32(inputVal);
+ ret = _mm_cvtepi32_ps(interimVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_store_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ inputVal = _mm_srli_si128(inputVal, 4);
+ interimVal = _mm_cvtepi8_epi32(inputVal);
+ ret = _mm_cvtepi32_ps(interimVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_store_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ inputVal = _mm_srli_si128(inputVal, 4);
+ interimVal = _mm_cvtepi8_epi32(inputVal);
+ ret = _mm_cvtepi32_ps(interimVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_store_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ inputVectorPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for(; number < num_points; number++){
+ outputVector[number] = (float)(inputVector[number]) * iScalar;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_8i_s32f_convert_32f_a_generic(float* outputVector, const int8_t* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ float* outputVectorPtr = outputVector;
+ const int8_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ const float iScalar = 1.0 / scalar;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_ORC
+extern void
+volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector, const int8_t* inputVector,
+ const float scalar, unsigned int num_points);
+
+static inline void
+volk_8i_s32f_convert_32f_u_orc(float* outputVector, const int8_t* inputVector,
+ const float scalar, unsigned int num_points)
+{
+ float invscalar = 1.0 / scalar;
+ volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, invscalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+#endif /* INCLUDED_volk_8i_s32f_convert_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_8ic_deinterleave_16i_x2
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex 8-bit char vector into I & Q vector data
+ * and converts them to 16-bit shorts.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_8ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li iBuffer: The I buffer output data.
+ * \li qBuffer: The Q buffer output data.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_8sc_t* cpx = (lv_8sc_t*)volk_malloc(sizeof(lv_8sc_t)*N, alignment);
+ * int16_t* i_out = (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment);
+ * int16_t* q_out = (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment);
+ *
+ * // fill cpx[] with interleaved 8-bit I/Q samples, then deinterleave
+ * volk_8ic_deinterleave_16i_x2(i_out, q_out, cpx, N);
+ *
+ * volk_free(cpx); volk_free(i_out); volk_free(q_out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_a_H
+#define INCLUDED_volk_8ic_deinterleave_16i_x2_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, int16_t* qBuffer,
+ const lv_8sc_t* complexVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+ __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); // set 16 byte values
+ __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
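+  // iMoveMask gathers the even (I) bytes and qMoveMask the odd (Q) bytes into
+  // the low 8 bytes; the 0x80 entries make _mm_shuffle_epi8 zero the upper half.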
+ __m128i complexVal, iOutputVal, qOutputVal;
+
+ unsigned int eighthPoints = num_points / 8;
+
+ for(number = 0; number < eighthPoints; number++){
+ complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; // aligned load
+
+ iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask); // shuffle 16 bytes of 128bit complexVal
+ qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask);
+
+ iOutputVal = _mm_cvtepi8_epi16(iOutputVal); // fills 2-byte sign extended versions of lower 8 bytes of input to output
+ iOutputVal = _mm_slli_epi16(iOutputVal, 8); // shift in left by 8 bits, each of the 8 16-bit integers, shift in with zeros
+
+ qOutputVal = _mm_cvtepi8_epi16(qOutputVal);
+ qOutputVal = _mm_slli_epi16(qOutputVal, 8);
+
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); // aligned store
+ _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
+ *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer, int16_t* qBuffer,
+ const lv_8sc_t* complexVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+ __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); // set 16 byte values
+ __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+ __m256i complexVal, iOutputVal, qOutputVal;
+ __m128i complexVal1, complexVal0;
+ __m128i iOutputVal1, iOutputVal0, qOutputVal1, qOutputVal0;
+
+ unsigned int sixteenthPoints = num_points / 16;
+
+ for(number = 0; number < sixteenthPoints; number++){
+ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; // aligned load
+
+ // Extract from complexVal to iOutputVal and qOutputVal
+ complexVal1 = _mm256_extractf128_si256(complexVal, 1);
+ complexVal0 = _mm256_extractf128_si256(complexVal, 0);
+
+ iOutputVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask); // shuffle 16 bytes of 128bit complexVal
+ iOutputVal0 = _mm_shuffle_epi8(complexVal0, iMoveMask);
+ qOutputVal1 = _mm_shuffle_epi8(complexVal1, qMoveMask);
+ qOutputVal0 = _mm_shuffle_epi8(complexVal0, qMoveMask);
+
+ iOutputVal1 = _mm_cvtepi8_epi16(iOutputVal1); // fills 2-byte sign extended versions of lower 8 bytes of input to output
+ iOutputVal1 = _mm_slli_epi16(iOutputVal1, 8); // shift in left by 8 bits, each of the 8 16-bit integers, shift in with zeros
+ iOutputVal0 = _mm_cvtepi8_epi16(iOutputVal0);
+ iOutputVal0 = _mm_slli_epi16(iOutputVal0, 8);
+
+ qOutputVal1 = _mm_cvtepi8_epi16(qOutputVal1);
+ qOutputVal1 = _mm_slli_epi16(qOutputVal1, 8);
+ qOutputVal0 = _mm_cvtepi8_epi16(qOutputVal0);
+ qOutputVal0 = _mm_slli_epi16(qOutputVal0, 8);
+
+ // Pack iOutputVal0,1 to iOutputVal
+ __m256i dummy = _mm256_setzero_si256();
+ iOutputVal = _mm256_insertf128_si256(dummy, iOutputVal0, 0);
+ iOutputVal = _mm256_insertf128_si256(iOutputVal, iOutputVal1, 1);
+ qOutputVal = _mm256_insertf128_si256(dummy, qOutputVal0, 0);
+ qOutputVal = _mm256_insertf128_si256(qOutputVal, qOutputVal1, 1);
+
+ _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); // aligned store
+ _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
+
+ iBufferPtr += 16;
+ qBufferPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
+ *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer, int16_t* qBuffer,
+ const lv_8sc_t* complexVector, unsigned int num_points)
+{
+ const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+ unsigned int number;
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = (int16_t)(*complexVectorPtr++)*256;
+ *qBufferPtr++ = (int16_t)(*complexVectorPtr++)*256;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+#endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_8ic_deinterleave_real_16i
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex 8-bit char vector into just the I (real)
+ * vector and converts it to 16-bit shorts (each sample is scaled by 128).
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_8ic_deinterleave_real_16i(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li iBuffer: The I buffer output data.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * lv_8sc_t* x = (lv_8sc_t*)volk_malloc(sizeof(lv_8sc_t)*N, volk_get_alignment());
+ * int16_t* re = (int16_t*)volk_malloc(sizeof(int16_t)*N, volk_get_alignment());
+ *
+ * // ... fill x, then extract the real (I) components scaled to 16 bits ...
+ * volk_8ic_deinterleave_real_16i(re, x, N);
+ *
+ * volk_free(x); volk_free(re);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_8ic_deinterleave_real_16i_a_H
+#define INCLUDED_volk_8ic_deinterleave_real_16i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_8ic_deinterleave_real_16i_a_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+ __m128i complexVal, outputVal;
+
+ unsigned int eighthPoints = num_points / 8;
+
+ for(number = 0; number < eighthPoints; number++){
+ complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+
+ complexVal = _mm_shuffle_epi8(complexVal, moveMask);
+
+ outputVal = _mm_cvtepi8_epi16(complexVal);
+ outputVal = _mm_slli_epi16(outputVal, 7);
+
+ _mm_store_si128((__m128i*)iBufferPtr, outputVal);
+ iBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_8ic_deinterleave_real_16i_a_avx(int16_t* iBuffer, const lv_8sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+ __m256i complexVal, outputVal;
+ __m128i complexVal1, complexVal0, outputVal1, outputVal0;
+
+ unsigned int sixteenthPoints = num_points / 16;
+
+ for(number = 0; number < sixteenthPoints; number++){
+ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
+
+ complexVal1 = _mm256_extractf128_si256(complexVal, 1);
+ complexVal0 = _mm256_extractf128_si256(complexVal, 0);
+
+ outputVal1 = _mm_shuffle_epi8(complexVal1, moveMask);
+ outputVal0 = _mm_shuffle_epi8(complexVal0, moveMask);
+
+ outputVal1 = _mm_cvtepi8_epi16(outputVal1);
+ outputVal1 = _mm_slli_epi16(outputVal1, 7);
+ outputVal0 = _mm_cvtepi8_epi16(outputVal0);
+ outputVal0 = _mm_slli_epi16(outputVal0, 7);
+
+ __m256i dummy = _mm256_setzero_si256();
+ outputVal = _mm256_insertf128_si256(dummy, outputVal0, 0);
+ outputVal = _mm256_insertf128_si256(outputVal, outputVal1, 1);
+ _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
+
+ iBufferPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_8ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_8sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_8ic_deinterleave_real_16i_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_8ic_deinterleave_real_8i
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex 8-bit char vector into just the I (real)
+ * vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_8ic_deinterleave_real_8i(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li iBuffer: The I buffer output data.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * lv_8sc_t* x = (lv_8sc_t*)volk_malloc(sizeof(lv_8sc_t)*N, volk_get_alignment());
+ * int8_t* re = (int8_t*)volk_malloc(sizeof(int8_t)*N, volk_get_alignment());
+ *
+ * // ... fill x, then extract the real (I) components ...
+ * volk_8ic_deinterleave_real_8i(re, x, N);
+ *
+ * volk_free(x); volk_free(re);
+ * \endcode
+ */
+
+#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
+#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSSE3
+#include <tmmintrin.h>
+
+static inline void
+volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ __m128i moveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+ __m128i moveMask2 = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+ __m128i complexVal1, complexVal2, outputVal;
+
+ unsigned int sixteenthPoints = num_points / 16;
+
+ for(number = 0; number < sixteenthPoints; number++){
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+
+ complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1);
+ complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2);
+
+ outputVal = _mm_or_si128(complexVal1, complexVal2);
+
+ _mm_store_si128((__m128i*)iBufferPtr, outputVal);
+ iBufferPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_8ic_deinterleave_real_8i_a_avx(int8_t* iBuffer, const lv_8sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ __m128i moveMaskL = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+ __m128i moveMaskH = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+ __m256i complexVal1, complexVal2, outputVal;
+ __m128i complexVal1H, complexVal1L, complexVal2H, complexVal2L, outputVal1, outputVal2;
+
+ unsigned int thirtysecondPoints = num_points / 32;
+
+ for(number = 0; number < thirtysecondPoints; number++){
+
+ complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+
+ complexVal1H = _mm256_extractf128_si256(complexVal1, 1);
+ complexVal1L = _mm256_extractf128_si256(complexVal1, 0);
+ complexVal2H = _mm256_extractf128_si256(complexVal2, 1);
+ complexVal2L = _mm256_extractf128_si256(complexVal2, 0);
+
+ complexVal1H = _mm_shuffle_epi8(complexVal1H, moveMaskH);
+ complexVal1L = _mm_shuffle_epi8(complexVal1L, moveMaskL);
+ outputVal1 = _mm_or_si128(complexVal1H, complexVal1L);
+
+
+ complexVal2H = _mm_shuffle_epi8(complexVal2H, moveMaskH);
+ complexVal2L = _mm_shuffle_epi8(complexVal2L, moveMaskL);
+ outputVal2 = _mm_or_si128(complexVal2H, complexVal2L);
+
+ __m256i dummy = _mm256_setzero_si256();
+ outputVal = _mm256_insertf128_si256(dummy, outputVal1, 0);
+ outputVal = _mm256_insertf128_si256(outputVal, outputVal2, 1);
+
+
+ _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
+ iBufferPtr += 32;
+ }
+
+ number = thirtysecondPoints * 32;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer, const lv_8sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_8ic_deinterleave_real_8i_neon(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points)
+{
+ unsigned int number;
+ unsigned int sixteenth_points = num_points / 16;
+
+ int8x16x2_t input_vector;
+ for(number=0; number < sixteenth_points; ++number) {
+ input_vector = vld2q_s8((int8_t*) complexVector );
+ vst1q_s8(iBuffer, input_vector.val[0]);
+ iBuffer += 16;
+ complexVector += 16;
+ }
+
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ for(number = sixteenth_points*16; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_8ic_s32f_deinterleave_32f_x2
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex 8-bit char vector into I & Q vector data,
+ * converts them to floats, and divides the results by the scalar
+ * factor.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_8ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li scalar: The scalar value used to divide the floating point results.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li iBuffer: The I buffer output data.
+ * \li qBuffer: The Q buffer output data.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * lv_8sc_t* x = (lv_8sc_t*)volk_malloc(sizeof(lv_8sc_t)*N, volk_get_alignment());
+ * float* i_out = (float*)volk_malloc(sizeof(float)*N, volk_get_alignment());
+ * float* q_out = (float*)volk_malloc(sizeof(float)*N, volk_get_alignment());
+ *
+ * // ... fill x, then deinterleave and scale each component by 1/128 ...
+ * volk_8ic_s32f_deinterleave_32f_x2(i_out, q_out, x, 128.f, N);
+ *
+ * volk_free(x); volk_free(i_out); volk_free(q_out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H
+#define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector,
+ const float scalar, unsigned int num_points)
+{
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+ __m128 iFloatValue, qFloatValue;
+
+ const float iScalar= 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
+ int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+ __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+ __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+
+ for(;number < eighthPoints; number++){
+ complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+ iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask);
+ qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask);
+
+ iIntVal = _mm_cvtepi8_epi32(iComplexVal);
+ iFloatValue = _mm_cvtepi32_ps(iIntVal);
+ iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+ _mm_store_ps(iBufferPtr, iFloatValue);
+ iBufferPtr += 4;
+
+ iComplexVal = _mm_srli_si128(iComplexVal, 4);
+
+ iIntVal = _mm_cvtepi8_epi32(iComplexVal);
+ iFloatValue = _mm_cvtepi32_ps(iIntVal);
+ iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+ _mm_store_ps(iBufferPtr, iFloatValue);
+ iBufferPtr += 4;
+
+ qIntVal = _mm_cvtepi8_epi32(qComplexVal);
+ qFloatValue = _mm_cvtepi32_ps(qIntVal);
+ qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
+ _mm_store_ps(qBufferPtr, qFloatValue);
+ qBufferPtr += 4;
+
+ qComplexVal = _mm_srli_si128(qComplexVal, 4);
+
+ qIntVal = _mm_cvtepi8_epi32(qComplexVal);
+ qFloatValue = _mm_cvtepi32_ps(qIntVal);
+ qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
+ _mm_store_ps(qBufferPtr, qFloatValue);
+
+ qBufferPtr += 4;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+ *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+ }
+
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer,
+ const lv_8sc_t* complexVector,
+ const float scalar, unsigned int num_points)
+{
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 cplxValue1, cplxValue2, iValue, qValue;
+
+ __m128 invScalar = _mm_set_ps1(1.0/scalar);
+ int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+ __VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
+
+ for(;number < quarterPoints; number++){
+ floatBuffer[0] = (float)(complexVectorPtr[0]);
+ floatBuffer[1] = (float)(complexVectorPtr[1]);
+ floatBuffer[2] = (float)(complexVectorPtr[2]);
+ floatBuffer[3] = (float)(complexVectorPtr[3]);
+
+ floatBuffer[4] = (float)(complexVectorPtr[4]);
+ floatBuffer[5] = (float)(complexVectorPtr[5]);
+ floatBuffer[6] = (float)(complexVectorPtr[6]);
+ floatBuffer[7] = (float)(complexVectorPtr[7]);
+
+ cplxValue1 = _mm_load_ps(&floatBuffer[0]);
+ cplxValue2 = _mm_load_ps(&floatBuffer[4]);
+
+ complexVectorPtr += 8;
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+ _mm_store_ps(iBufferPtr, iValue);
+ _mm_store_ps(qBufferPtr, qValue);
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ complexVectorPtr = (int8_t*)&complexVector[number];
+ for(; number < num_points; number++){
+ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_8ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer,
+ const lv_8sc_t* complexVector,
+ const float scalar, unsigned int num_points)
+{
+ const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+ unsigned int number;
+ const float invScalar = 1.0 / scalar;
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = (float)(*complexVectorPtr++)*invScalar;
+ *qBufferPtr++ = (float)(*complexVectorPtr++)*invScalar;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_8ic_s32f_deinterleave_real_32f
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex 8-bit char vector into just the real (I)
+ * vector, converts the samples to floats, and divides the results by
+ * the scalar factor.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_8ic_s32f_deinterleave_real_32f(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li scalar: The scalar value used to divide the floating point results.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li iBuffer: The I buffer output data.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * lv_8sc_t* x = (lv_8sc_t*)volk_malloc(sizeof(lv_8sc_t)*N, volk_get_alignment());
+ * float* re = (float*)volk_malloc(sizeof(float)*N, volk_get_alignment());
+ *
+ * // ... fill x, then extract the real part scaled by 1/128 ...
+ * volk_8ic_s32f_deinterleave_real_32f(re, x, 128.f, N);
+ *
+ * volk_free(x); volk_free(re);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H
+#define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_8ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_8sc_t* complexVector,
+ const float scalar, unsigned int num_points)
+{
+ float* iBufferPtr = iBuffer;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+ __m128 iFloatValue;
+
+ const float iScalar= 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ __m128i complexVal, iIntVal;
+ int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+ __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+
+ for(;number < eighthPoints; number++){
+ complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+ complexVal = _mm_shuffle_epi8(complexVal, moveMask);
+
+ iIntVal = _mm_cvtepi8_epi32(complexVal);
+ iFloatValue = _mm_cvtepi32_ps(iIntVal);
+
+ iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+
+ _mm_store_ps(iBufferPtr, iFloatValue);
+
+ iBufferPtr += 4;
+
+ complexVal = _mm_srli_si128(complexVal, 4);
+ iIntVal = _mm_cvtepi8_epi32(complexVal);
+ iFloatValue = _mm_cvtepi32_ps(iIntVal);
+
+ iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+
+ _mm_store_ps(iBufferPtr, iFloatValue);
+
+ iBufferPtr += 4;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+ complexVectorPtr++;
+ }
+
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, const lv_8sc_t* complexVector,
+ const float scalar, unsigned int num_points)
+{
+ float* iBufferPtr = iBuffer;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 iValue;
+
+ const float iScalar= 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+
+ for(;number < quarterPoints; number++){
+ floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+ floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+ floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+ floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+
+ iValue = _mm_load_ps(floatBuffer);
+
+ iValue = _mm_mul_ps(iValue, invScalar);
+
+ _mm_store_ps(iBufferPtr, iValue);
+
+ iBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+ complexVectorPtr++;
+ }
+
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_8ic_s32f_deinterleave_real_32f_generic(float* iBuffer, const lv_8sc_t* complexVector,
+ const float scalar, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+ float* iBufferPtr = iBuffer;
+ const float invScalar = 1.0 / scalar;
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+#endif /* INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H
+#define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Multiplies one complex vector by the complex conjugate of a second complex vector and stores the results in the third vector
+ \param cVector The complex vector where the results will be stored
+ \param aVector One of the complex vectors to be multiplied
+ \param bVector The complex vector which will be converted to complex conjugate and multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128i x, y, realz, imagz;
+ lv_16sc_t* c = cVector;
+ const lv_8sc_t* a = aVector;
+ const lv_8sc_t* b = bVector;
+ __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
+
+ for(;number < quarterPoints; number++){
+    // Sign-extend the 8 bit values into 16 bit values
+    x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
+    y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
+
+    // Calculate the real part: ar*br + ai*bi (i.e. ar*br - ai*(-bi))
+    realz = _mm_madd_epi16(x,y);
+
+    // Negate the imaginary components to form the complex conjugate of b
+    y = _mm_sign_epi16(y, conjugateSign);
+
+    // Swap the real and imaginary components of each conjugated value
+    y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1));
+
+    // Calculate the imaginary part: ai*br - ar*bi
+    imagz = _mm_madd_epi16(x,y);
+
+ _mm_store_si128((__m128i*)c, _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), _mm_unpackhi_epi32(realz, imagz)));
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+ number = quarterPoints * 4;
+ int16_t* c16Ptr = (int16_t*)&cVector[number];
+ int8_t* a8Ptr = (int8_t*)&aVector[number];
+ int8_t* b8Ptr = (int8_t*)&bVector[number];
+ for(; number < num_points; number++){
+ float aReal = (float)*a8Ptr++;
+ float aImag = (float)*a8Ptr++;
+ lv_32fc_t aVal = lv_cmake(aReal, aImag );
+ float bReal = (float)*b8Ptr++;
+ float bImag = (float)*b8Ptr++;
+ lv_32fc_t bVal = lv_cmake( bReal, -bImag );
+ lv_32fc_t temp = aVal * bVal;
+
+ *c16Ptr++ = (int16_t)lv_creal(temp);
+ *c16Ptr++ = (int16_t)lv_cimag(temp);
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Multiplies one complex vector by the complex conjugate of a second complex vector and stores the results in the third vector
+ \param cVector The complex vector where the results will be stored
+ \param aVector One of the complex vectors to be multiplied
+ \param bVector The complex vector which will be converted to complex conjugate and multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_8ic_x2_multiply_conjugate_16ic_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+ unsigned int number = 0;
+ int16_t* c16Ptr = (int16_t*)cVector;
+ int8_t* a8Ptr = (int8_t*)aVector;
+ int8_t* b8Ptr = (int8_t*)bVector;
+ for(number =0; number < num_points; number++){
+ float aReal = (float)*a8Ptr++;
+ float aImag = (float)*a8Ptr++;
+ lv_32fc_t aVal = lv_cmake(aReal, aImag );
+ float bReal = (float)*b8Ptr++;
+ float bImag = (float)*b8Ptr++;
+ lv_32fc_t bVal = lv_cmake( bReal, -bImag );
+ lv_32fc_t temp = aVal * bVal;
+
+ *c16Ptr++ = (int16_t)lv_creal(temp);
+ *c16Ptr++ = (int16_t)lv_cimag(temp);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_8ic_x2_s32f_multiply_conjugate_32fc
+ *
+ * \b Overview
+ *
+ * Multiplies one complex vector by the complex conjugate of a second
+ * complex vector, scales the result by 1/scalar, and stores it in the
+ * third vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_8ic_x2_s32f_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: One of the complex vectors to be multiplied.
+ * \li bVector: The complex vector which will be converted to complex conjugate and multiplied.
+ * \li scalar: each output value is scaled by 1/scalar.
+ * \li num_points: The number of complex values in aVector and bVector to be multiplied together and stored into cVector.
+ *
+ * \b Outputs
+ * \li cVector: The complex vector where the results will be stored.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * lv_8sc_t* a = (lv_8sc_t*)volk_malloc(sizeof(lv_8sc_t)*N, volk_get_alignment());
+ * lv_8sc_t* b = (lv_8sc_t*)volk_malloc(sizeof(lv_8sc_t)*N, volk_get_alignment());
+ * lv_32fc_t* c = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, volk_get_alignment());
+ *
+ * // ... fill a and b, then compute a * conj(b), scaled by 1/128 ...
+ * volk_8ic_x2_s32f_multiply_conjugate_32fc(c, a, b, 128.f, N);
+ *
+ * volk_free(a); volk_free(b); volk_free(c);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
+#define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector,
+ const lv_8sc_t* bVector, const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128i x, y, realz, imagz;
+ __m128 ret;
+ lv_32fc_t* c = cVector;
+ const lv_8sc_t* a = aVector;
+ const lv_8sc_t* b = bVector;
+ __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
+
+ __m128 invScalar = _mm_set_ps1(1.0/scalar);
+
+ for(;number < quarterPoints; number++){
+    // Sign-extend the 8 bit values into 16 bit values
+    x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
+    y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
+
+    // Calculate the real part: ar*br + ai*bi (i.e. ar*br - ai*(-bi))
+    realz = _mm_madd_epi16(x,y);
+
+    // Negate the imaginary components to form the complex conjugate of b
+    y = _mm_sign_epi16(y, conjugateSign);
+
+    // Swap the real and imaginary components of each conjugated value
+    y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1));
+
+    // Calculate the imaginary part: ai*br - ar*bi
+    imagz = _mm_madd_epi16(x,y);
+
+ // Interleave real and imaginary and then convert to float values
+ ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz));
+
+ // Normalize the floating point values
+ ret = _mm_mul_ps(ret, invScalar);
+
+ // Store the floating point values
+ _mm_store_ps((float*)c, ret);
+ c += 2;
+
+ // Interleave real and imaginary and then convert to float values
+ ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz));
+
+ // Normalize the floating point values
+ ret = _mm_mul_ps(ret, invScalar);
+
+ // Store the floating point values
+ _mm_store_ps((float*)c, ret);
+ c += 2;
+
+ a += 4;
+ b += 4;
+ }
+
+ number = quarterPoints * 4;
+ float* cFloatPtr = (float*)&cVector[number];
+ int8_t* a8Ptr = (int8_t*)&aVector[number];
+ int8_t* b8Ptr = (int8_t*)&bVector[number];
+ for(; number < num_points; number++){
+ float aReal = (float)*a8Ptr++;
+ float aImag = (float)*a8Ptr++;
+ lv_32fc_t aVal = lv_cmake(aReal, aImag );
+ float bReal = (float)*b8Ptr++;
+ float bImag = (float)*b8Ptr++;
+ lv_32fc_t bVal = lv_cmake( bReal, -bImag );
+ lv_32fc_t temp = aVal * bVal;
+
+ *cFloatPtr++ = lv_creal(temp) / scalar;
+ *cFloatPtr++ = lv_cimag(temp) / scalar;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector,
+ const lv_8sc_t* bVector, const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ float* cPtr = (float*)cVector;
+ const float invScalar = 1.0 / scalar;
+ int8_t* a8Ptr = (int8_t*)aVector;
+ int8_t* b8Ptr = (int8_t*)bVector;
+ for(number = 0; number < num_points; number++){
+ float aReal = (float)*a8Ptr++;
+ float aImag = (float)*a8Ptr++;
+ lv_32fc_t aVal = lv_cmake(aReal, aImag );
+ float bReal = (float)*b8Ptr++;
+ float bImag = (float)*b8Ptr++;
+ lv_32fc_t bVal = lv_cmake( bReal, -bImag );
+ lv_32fc_t temp = aVal * bVal;
+
+ *cPtr++ = (lv_creal(temp) * invScalar);
+ *cPtr++ = (lv_cimag(temp) * invScalar);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef INCLUDED_volk_8u_conv_k7_r2puppet_8u_H
+#define INCLUDED_volk_8u_conv_k7_r2puppet_8u_H
+
+#include <volk/volk.h>
+#include <volk/volk_8u_x4_conv_k7_r2_8u.h>
+#include <string.h>
+
+typedef union {
+ //decision_t is a BIT vector
+ unsigned char* t;
+ unsigned int* w;
+} p_decision_t;
+
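+/* Fold the upper bits of x into its low byte, then look the parity up in Partab. */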
+static inline int parity(int x, unsigned char* Partab)
+{
+ x ^= (x >> 16);
+ x ^= (x >> 8);
+ return Partab[x];
+}
+
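+/* Trace back through the stored decision words, starting from endstate, and
+ * write the recovered bits into data[] (one bit per byte, in reverse order). */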
+static inline int chainback_viterbi(unsigned char* data,
+ unsigned int nbits,
+ unsigned int endstate,
+ unsigned int tailsize,
+ unsigned char* decisions)
+{
+ unsigned char* d;
+ int d_ADDSHIFT = 0;
+ int d_numstates = (1 << 6);
+ int d_decision_t_size = d_numstates/8;
+ unsigned int d_k = 7;
+ int d_framebits = nbits;
+ /* ADDSHIFT and SUBSHIFT make sure that the thing returned is a byte. */
+ d = decisions;
+ /* Make room beyond the end of the encoder register so we can
+ * accumulate a full byte of decoded data
+ */
+
+ endstate = (endstate%d_numstates) << d_ADDSHIFT;
+
+ /* The store into data[] only needs to be done every 8 bits.
+ * But this avoids a conditional branch, and the writes will
+ * combine in the cache anyway
+ */
+
+ d += tailsize * d_decision_t_size ; /* Look past tail */
+ int retval;
+ int dif = tailsize - (d_k - 1);
+ //printf("break, %d, %d\n", dif, (nbits+dif)%d_framebits);
+ p_decision_t dec;
+ while(nbits-- > d_framebits - (d_k - 1)) {
+ int k;
+ dec.t = &d[nbits * d_decision_t_size];
+ k = (dec.w[(endstate>>d_ADDSHIFT)/32] >> ((endstate>>d_ADDSHIFT)%32)) & 1;
+
+ endstate = (endstate >> 1) | (k << (d_k-2+d_ADDSHIFT));
+ //data[((nbits+dif)%nbits)>>3] = endstate>>d_SUBSHIFT;
+ //printf("%d, %d\n", k, (nbits+dif)%d_framebits);
+ data[((nbits+dif)%d_framebits)] = k;
+
+ retval = endstate;
+ }
+ nbits += 1;
+
+ while(nbits-- != 0) {
+ int k;
+
+ dec.t = &d[nbits * d_decision_t_size];
+
+ k = (dec.w[(endstate>>d_ADDSHIFT)/32] >> ((endstate>>d_ADDSHIFT)%32)) & 1;
+
+ endstate = (endstate >> 1) | (k << (d_k-2+d_ADDSHIFT));
+ data[((nbits+dif)%d_framebits)] = k;
+ }
+ //printf("%d, %d, %d, %d, %d, %d, %d, %d\n", data[4095],data[4094],data[4093],data[4092],data[4091],data[4090],data[4089],data[4088]);
+
+
+ return retval >> d_ADDSHIFT;
+}
+
+
+#if LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+#include <emmintrin.h>
+#include <xmmintrin.h>
+#include <mmintrin.h>
+#include <stdio.h>
+
+
+
+
+
+static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* syms, unsigned char* dec, unsigned int framebits) {
+
+
+ static int once = 1;
+ int d_numstates = (1 << 6);
+ int rate = 2;
+ static unsigned char* D;
+ static unsigned char* Y;
+ static unsigned char* X;
+ static unsigned int excess = 6;
+ static unsigned char* Branchtab;
+ static unsigned char Partab[256];
+
+ int d_polys[2] = {79, 109};
+
+
+ if(once) {
+
+ X = (unsigned char*)volk_malloc(2*d_numstates, volk_get_alignment());
+ Y = X + d_numstates;
+ Branchtab = (unsigned char*)volk_malloc(d_numstates/2*rate, volk_get_alignment());
+ D = (unsigned char*)volk_malloc((d_numstates/8) * (framebits + 6), volk_get_alignment());
+ int state, i;
+ int cnt,ti;
+
+ /* Initialize parity lookup table */
+ for(i=0;i<256;i++){
+ cnt = 0;
+ ti = i;
+ while(ti){
+ if(ti & 1)
+ cnt++;
+ ti >>= 1;
+ }
+ Partab[i] = cnt & 1;
+ }
+ /* Initialize the branch table */
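+    /* Each entry is the expected soft symbol (0 or 255) for one polynomial and
+       encoder state; it serves as the reference when computing branch metrics. */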
+ for(state=0;state < d_numstates/2;state++){
+ for(i=0; i<rate; i++){
+ Branchtab[i*d_numstates/2+state] = (d_polys[i] < 0) ^ parity((2*state) & abs(d_polys[i]), Partab) ? 255 : 0;
+ }
+ }
+
+ once = 0;
+ }
+
+ //unbias the old_metrics
+ memset(X, 31, d_numstates);
+
+ volk_8u_x4_conv_k7_r2_8u_spiral(Y, X, syms, D, framebits/2 - excess, excess, Branchtab);
+
+ unsigned int min = X[0];
+ int i = 0, state = 0;
+ for(i = 0; i < (d_numstates); ++i) {
+ if(X[i] < min) {
+ min = X[i];
+ state = i;
+ }
+ }
+
+ chainback_viterbi(dec, framebits/2 -excess, state, excess, D);
+
+ return;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+
+
+
+
+#if LV_HAVE_GENERIC
+
+
+static inline void volk_8u_conv_k7_r2puppet_8u_generic(unsigned char* syms, unsigned char* dec, unsigned int framebits) {
+
+
+
+ static int once = 1;
+ int d_numstates = (1 << 6);
+ int rate = 2;
+ static unsigned char* Y;
+ static unsigned char* X;
+ static unsigned char* D;
+ static unsigned int excess = 6;
+ static unsigned char* Branchtab;
+ static unsigned char Partab[256];
+
+ int d_polys[2] = {79, 109};
+
+
+ if(once) {
+
+ X = (unsigned char*)volk_malloc(2*d_numstates, volk_get_alignment());
+ Y = X + d_numstates;
+ Branchtab = (unsigned char*)volk_malloc(d_numstates/2*rate, volk_get_alignment());
+ D = (unsigned char*)volk_malloc((d_numstates/8) * (framebits + 6), volk_get_alignment());
+
+ int state, i;
+ int cnt,ti;
+
+ /* Initialize parity lookup table */
+ for(i=0;i<256;i++){
+ cnt = 0;
+ ti = i;
+ while(ti){
+ if(ti & 1)
+ cnt++;
+ ti >>= 1;
+ }
+ Partab[i] = cnt & 1;
+ }
+ /* Initialize the branch table */
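+    /* Each entry is the expected soft symbol (0 or 255) for one polynomial and
+       encoder state; it serves as the reference when computing branch metrics. */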
+ for(state=0;state < d_numstates/2;state++){
+ for(i=0; i<rate; i++){
+ Branchtab[i*d_numstates/2+state] = (d_polys[i] < 0) ^ parity((2*state) & abs(d_polys[i]), Partab) ? 255 : 0;
+ }
+ }
+
+ once = 0;
+ }
+
+
+
+
+ //unbias the old_metrics
+ memset(X, 31, d_numstates);
+
+ volk_8u_x4_conv_k7_r2_8u_generic(Y, X, syms, D, framebits/2 - excess, excess, Branchtab);
+
+ unsigned int min = X[0];
+ int i = 0, state = 0;
+ for(i = 0; i < (d_numstates); ++i) {
+ if(X[i] < min) {
+ min = X[i];
+ state = i;
+ }
+ }
+
+ chainback_viterbi(dec, framebits/2 -excess, state, excess, D);
+
+ return;
+
+
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+#endif /*INCLUDED_volk_8u_conv_k7_r2puppet_8u_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2015 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*
+ * for documentation see 'volk_8u_x3_encodepolar_8u_x2.h'
+ */
+
+#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
+#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
+#include <string.h>
+
+static inline unsigned int
+log2_of_power_of_2(unsigned int val){
+ // algorithm from: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog
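+  // For a power-of-two input this returns its exponent, e.g. log2_of_power_of_2(1024) == 10.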
+ static const unsigned int b[] = {0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0,
+ 0xFF00FF00, 0xFFFF0000};
+
+ unsigned int res = (val & b[0]) != 0;
+ res |= ((val & b[4]) != 0) << 4;
+ res |= ((val & b[3]) != 0) << 3;
+ res |= ((val & b[2]) != 0) << 2;
+ res |= ((val & b[1]) != 0) << 1;
+ return res;
+}
+
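+/* One butterfly stage of the polar encoder: for every input pair (u0, u1) read
+ * from temp_ptr, the first half of a branch receives u0 ^ u1 and the second
+ * half receives u1. */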
+static inline void
+encodepolar_single_stage(unsigned char* frame_ptr, const unsigned char* temp_ptr,
+ const unsigned int num_branches, const unsigned int frame_half)
+{
+ unsigned int branch, bit;
+ for(branch = 0; branch < num_branches; ++branch){
+ for(bit = 0; bit < frame_half; ++bit){
+ *frame_ptr = *temp_ptr ^ *(temp_ptr + 1);
+ *(frame_ptr + frame_half) = *(temp_ptr + 1);
+ ++frame_ptr;
+ temp_ptr += 2;
+ }
+ frame_ptr += frame_half;
+ }
+}
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame, unsigned char* temp,
+ unsigned int frame_size)
+{
+ unsigned int stage = log2_of_power_of_2(frame_size);
+ unsigned int frame_half = frame_size >> 1;
+ unsigned int num_branches = 1;
+
+ while(stage){
+ // encode stage
+ encodepolar_single_stage(frame, temp, num_branches, frame_half);
+ memcpy(temp, frame, sizeof(unsigned char) * frame_size);
+
+ // update all the parameters.
+ num_branches = num_branches << 1;
+ frame_half = frame_half >> 1;
+ --stage;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_SSSE3
+#include <tmmintrin.h>
+
+static inline void
+volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame, unsigned char* temp,
+ unsigned int frame_size)
+{
+ const unsigned int po2 = log2_of_power_of_2(frame_size);
+
+ unsigned int stage = po2;
+ unsigned char* frame_ptr = frame;
+ unsigned char* temp_ptr = temp;
+
+ unsigned int frame_half = frame_size >> 1;
+ unsigned int num_branches = 1;
+ unsigned int branch;
+ unsigned int bit;
+
+ // prepare constants
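+  // mask_stage1 keeps every even byte position; after the 1-byte right shift those
+  // positions hold the second element of each input pair.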
+ const __m128i mask_stage1 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
+
+ // get some SIMD registers to play with.
+ __m128i r_frame0, r_temp0, shifted;
+
+ {
+ __m128i r_frame1, r_temp1;
+ const __m128i shuffle_separate = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+
+ while(stage > 4){
+ frame_ptr = frame;
+ temp_ptr = temp;
+
+      // At stage 5 a branch has 32 elements, so higher stages are even larger.
+ for(branch = 0; branch < num_branches; ++branch){
+ for(bit = 0; bit < frame_half; bit += 16){
+ r_temp0 = _mm_loadu_si128((__m128i *) temp_ptr);
+ temp_ptr += 16;
+ r_temp1 = _mm_loadu_si128((__m128i *) temp_ptr);
+ temp_ptr += 16;
+
+ shifted = _mm_srli_si128(r_temp0, 1);
+ shifted = _mm_and_si128(shifted, mask_stage1);
+ r_temp0 = _mm_xor_si128(shifted, r_temp0);
+ r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
+
+ shifted = _mm_srli_si128(r_temp1, 1);
+ shifted = _mm_and_si128(shifted, mask_stage1);
+ r_temp1 = _mm_xor_si128(shifted, r_temp1);
+ r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
+
+ r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
+ _mm_storeu_si128((__m128i*) frame_ptr, r_frame0);
+
+ r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
+ _mm_storeu_si128((__m128i*) (frame_ptr + frame_half), r_frame1);
+ frame_ptr += 16;
+ }
+
+ frame_ptr += frame_half;
+ }
+ memcpy(temp, frame, sizeof(unsigned char) * frame_size);
+
+ num_branches = num_branches << 1;
+ frame_half = frame_half >> 1;
+ stage--;
+ }
+ }
+
+  // This last part requires frames of at least 16 elements.
+  // Smaller frames gain nothing from SIMD anyway; just choose GENERIC!
+
+ // reset pointers to correct positions.
+ frame_ptr = frame;
+ temp_ptr = temp;
+
+ // load first chunk.
+ // Tests show a 1-2% gain compared to loading a new chunk and using it right after.
+ r_temp0 = _mm_loadu_si128((__m128i*) temp_ptr);
+ temp_ptr += 16;
+
+ const __m128i shuffle_stage4 = _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
+ const __m128i mask_stage4 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ const __m128i mask_stage3 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
+ const __m128i mask_stage2 = _mm_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);
+
+ for(branch = 0; branch < num_branches; ++branch){
+ // shuffle once for bit-reversal.
+ r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
+
+ shifted = _mm_srli_si128(r_temp0, 8);
+ shifted = _mm_and_si128(shifted, mask_stage4);
+ r_frame0 = _mm_xor_si128(shifted, r_temp0);
+
+ // start loading next chunk.
+ r_temp0 = _mm_loadu_si128((__m128i*) temp_ptr);
+ temp_ptr += 16;
+
+ shifted = _mm_srli_si128(r_frame0, 4);
+ shifted = _mm_and_si128(shifted, mask_stage3);
+ r_frame0 = _mm_xor_si128(shifted, r_frame0);
+
+ shifted = _mm_srli_si128(r_frame0, 2);
+ shifted = _mm_and_si128(shifted, mask_stage2);
+ r_frame0 = _mm_xor_si128(shifted, r_frame0);
+
+ shifted = _mm_srli_si128(r_frame0, 1);
+ shifted = _mm_and_si128(shifted, mask_stage1);
+ r_frame0 = _mm_xor_si128(shifted, r_frame0);
+
+ // store result of chunk.
+ _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
+ frame_ptr += 16;
+ }
+}
+
+#endif /* LV_HAVE_SSSE3 */
+
+#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_ */
+
+#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
+#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
+
+#ifdef LV_HAVE_SSSE3
+#include <tmmintrin.h>
+
+static inline void
+volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame, unsigned char* temp,
+ unsigned int frame_size)
+{
+ const unsigned int po2 = log2_of_power_of_2(frame_size);
+
+ unsigned int stage = po2;
+ unsigned char* frame_ptr = frame;
+ unsigned char* temp_ptr = temp;
+
+ unsigned int frame_half = frame_size >> 1;
+ unsigned int num_branches = 1;
+ unsigned int branch;
+ unsigned int bit;
+
+ // prepare constants
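+  // mask_stage1 keeps every even byte position; after the 1-byte right shift those
+  // positions hold the second element of each input pair.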
+ const __m128i mask_stage1 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
+
+ // get some SIMD registers to play with.
+ __m128i r_frame0, r_temp0, shifted;
+
+ {
+ __m128i r_frame1, r_temp1;
+ const __m128i shuffle_separate = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+
+ while(stage > 4){
+ frame_ptr = frame;
+ temp_ptr = temp;
+
+      // At stage 5 a branch has 32 elements, so higher stages are even larger.
+ for(branch = 0; branch < num_branches; ++branch){
+ for(bit = 0; bit < frame_half; bit += 16){
+ r_temp0 = _mm_load_si128((__m128i *) temp_ptr);
+ temp_ptr += 16;
+ r_temp1 = _mm_load_si128((__m128i *) temp_ptr);
+ temp_ptr += 16;
+
+ shifted = _mm_srli_si128(r_temp0, 1);
+ shifted = _mm_and_si128(shifted, mask_stage1);
+ r_temp0 = _mm_xor_si128(shifted, r_temp0);
+ r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
+
+ shifted = _mm_srli_si128(r_temp1, 1);
+ shifted = _mm_and_si128(shifted, mask_stage1);
+ r_temp1 = _mm_xor_si128(shifted, r_temp1);
+ r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
+
+ r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
+ _mm_store_si128((__m128i*) frame_ptr, r_frame0);
+
+ r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
+ _mm_store_si128((__m128i*) (frame_ptr + frame_half), r_frame1);
+ frame_ptr += 16;
+ }
+
+ frame_ptr += frame_half;
+ }
+ memcpy(temp, frame, sizeof(unsigned char) * frame_size);
+
+ num_branches = num_branches << 1;
+ frame_half = frame_half >> 1;
+ stage--;
+ }
+ }
+
+  // This last part requires frames of at least 16 elements.
+  // Smaller frames gain nothing from SIMD anyway; just choose GENERIC!
+
+ // reset pointers to correct positions.
+ frame_ptr = frame;
+ temp_ptr = temp;
+
+ // load first chunk.
+ // Tests show a 1-2% gain compared to loading a new chunk and using it right after.
+ r_temp0 = _mm_load_si128((__m128i*) temp_ptr);
+ temp_ptr += 16;
+
+ const __m128i shuffle_stage4 = _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
+ const __m128i mask_stage4 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ const __m128i mask_stage3 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
+ const __m128i mask_stage2 = _mm_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);
+
+ for(branch = 0; branch < num_branches; ++branch){
+ // shuffle once for bit-reversal.
+ r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
+
+ shifted = _mm_srli_si128(r_temp0, 8);
+ shifted = _mm_and_si128(shifted, mask_stage4);
+ r_frame0 = _mm_xor_si128(shifted, r_temp0);
+
+ // start loading next chunk.
+ r_temp0 = _mm_load_si128((__m128i*) temp_ptr);
+ temp_ptr += 16;
+
+ shifted = _mm_srli_si128(r_frame0, 4);
+ shifted = _mm_and_si128(shifted, mask_stage3);
+ r_frame0 = _mm_xor_si128(shifted, r_frame0);
+
+ shifted = _mm_srli_si128(r_frame0, 2);
+ shifted = _mm_and_si128(shifted, mask_stage2);
+ r_frame0 = _mm_xor_si128(shifted, r_frame0);
+
+ shifted = _mm_srli_si128(r_frame0, 1);
+ shifted = _mm_and_si128(shifted, mask_stage1);
+ r_frame0 = _mm_xor_si128(shifted, r_frame0);
+
+ // store result of chunk.
+ _mm_store_si128((__m128i*)frame_ptr, r_frame0);
+ frame_ptr += 16;
+ }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */
+
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2015 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_8u_x3_encodepolar_8u
+ *
+ * \b Overview
+ *
+ * Encodes the given data with a polar code.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_8u_x3_encodepolar_8u(unsigned char* frame, const unsigned char* frozen_bit_mask, const unsigned char* frozen_bits,
+ * const unsigned char* info_bits, unsigned int frame_size, unsigned int info_bit_size)
+ * \endcode
+ *
+ * \b Inputs
+ * \li frame: buffer for encoded frame
+ * \li frozen_bit_mask: bytes with 0xFF for frozen bit positions or 0x00 otherwise.
+ * \li frozen_bits: values of frozen bits, 1 bit per byte
+ * \li info_bits: info bit values, 1 bit per byte
+ * \li frame_size: power of 2 value for frame size.
+ * \li info_bit_size: number of info bits in a frame
+ *
+ * \b Outputs
+ * \li frame: polar encoded frame.
+ *
+ * \b Example
+ * \code
+ * int frame_exp = 10;
+ * int frame_size = 0x01 << frame_exp;
+ * int num_info_bits = frame_size;
+ * int num_frozen_bits = frame_size - num_info_bits;
+ *
+ * // The function sets frozen bit positions to 0xff and all others to 0x00.
+ * unsigned char* frozen_bit_mask = get_frozen_bit_mask(frame_size, num_frozen_bits);
+ *
+ * // set elements to desired values. Typically all zero.
+ * unsigned char* frozen_bits = (unsigned char*) volk_malloc(sizeof(unsigned char) * num_frozen_bits, volk_get_alignment());
+ *
+ * unsigned char* frame = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment());
+ * unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment());
+ *
+ * unsigned char* info_bits = get_info_bits_to_encode(num_info_bits);
+ *
+ * volk_8u_x3_encodepolar_8u_x2_generic(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ *
+ * volk_free(frozen_bit_mask);
+ * volk_free(frozen_bits);
+ * volk_free(frame);
+ * volk_free(temp);
+ * volk_free(info_bits);
+ * \endcode
+ */
+
+#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X3_ENCODEPOLAR_8U_X2_U_H_
+#define VOLK_KERNELS_VOLK_VOLK_8U_X3_ENCODEPOLAR_8U_X2_U_H_
+#include <stdio.h>
+#include <volk/volk_8u_x2_encodeframepolar_8u.h>
+
+static inline void
+interleave_frozen_and_info_bits(unsigned char* target, const unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits, const unsigned char* info_bits,
+ const unsigned int frame_size)
+{
+ unsigned int bit;
+ for(bit = 0; bit < frame_size; ++bit){
+ *target++ = *frozen_bit_mask++ ? *frozen_bits++ : *info_bits++;
+ }
+}
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_8u_x3_encodepolar_8u_x2_generic(unsigned char* frame, unsigned char* temp, const unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits, const unsigned char* info_bits,
+ unsigned int frame_size)
+{
+ // interleave
+ interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_SSSE3
+#include <tmmintrin.h>
+
+static inline void
+volk_8u_x3_encodepolar_8u_x2_u_ssse3(unsigned char* frame, unsigned char* temp,
+ const unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits, const unsigned char* info_bits,
+ unsigned int frame_size)
+{
+ // interleave
+ interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ volk_8u_x2_encodeframepolar_8u_u_ssse3(frame, temp, frame_size);
+}
+
+#endif /* LV_HAVE_SSSE3 */
+
+#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X3_ENCODEPOLAR_8U_X2_U_H_ */
+
+#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X3_ENCODEPOLAR_8U_X2_A_H_
+#define VOLK_KERNELS_VOLK_VOLK_8U_X3_ENCODEPOLAR_8U_X2_A_H_
+
+#ifdef LV_HAVE_SSSE3
+#include <tmmintrin.h>
+static inline void
+volk_8u_x3_encodepolar_8u_x2_a_ssse3(unsigned char* frame, unsigned char* temp,
+ const unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits, const unsigned char* info_bits,
+ unsigned int frame_size)
+{
+ interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ volk_8u_x2_encodeframepolar_8u_a_ssse3(frame, temp, frame_size);
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X3_ENCODEPOLAR_8U_X2_A_H_ */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2015 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/* For documentation see 'kernels/volk/volk_8u_x3_encodepolar_8u_x2.h'
+ * This file exists for test purposes only. Should not be used directly.
+ */
+
+#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X3_ENCODEPOLARPUPPET_8U_H_
+#define VOLK_KERNELS_VOLK_VOLK_8U_X3_ENCODEPOLARPUPPET_8U_H_
+#include <volk/volk.h>
+#include <volk/volk_8u_x3_encodepolar_8u_x2.h>
+
+static inline unsigned int
+next_lower_power_of_two(const unsigned int val)
+{
+ // algorithm found and adopted from: http://acius2.blogspot.de/2007/11/calculating-next-power-of-2.html
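+  // e.g. next_lower_power_of_two(1000) == 512; a value that is already a power of two is returned unchanged.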
+ unsigned int res = val;
+ res = (res >> 1) | res;
+ res = (res >> 2) | res;
+ res = (res >> 4) | res;
+ res = (res >> 8) | res;
+ res = (res >> 16) | res;
+ res += 1;
+ return res >> 1;
+}
+
+static inline void
+adjust_frozen_mask(unsigned char* mask, const unsigned int frame_size)
+{
+ // just like the rest of the puppet this function exists for test purposes only.
+ unsigned int i;
+ for(i = 0; i < frame_size; ++i){
+ *mask = (*mask & 0x80) ? 0xFF : 0x00;
+ mask++;
+ }
+}
+
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_8u_x3_encodepolarpuppet_8u_generic(unsigned char* frame, unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits, const unsigned char* info_bits,
+ unsigned int frame_size)
+{
+ frame_size = next_lower_power_of_two(frame_size);
+ unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment());
+ adjust_frozen_mask(frozen_bit_mask, frame_size);
+ volk_8u_x3_encodepolar_8u_x2_generic(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ volk_free(temp);
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_SSSE3
+static inline void
+volk_8u_x3_encodepolarpuppet_8u_u_ssse3(unsigned char* frame, unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits, const unsigned char* info_bits,
+ unsigned int frame_size)
+{
+ frame_size = next_lower_power_of_two(frame_size);
+ unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment());
+ adjust_frozen_mask(frozen_bit_mask, frame_size);
+ volk_8u_x3_encodepolar_8u_x2_u_ssse3(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ volk_free(temp);
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X3_ENCODEPOLARPUPPET_8U_H_ */
+
+#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X3_ENCODEPOLARPUPPET_8U_A_H_
+#define VOLK_KERNELS_VOLK_VOLK_8U_X3_ENCODEPOLARPUPPET_8U_A_H_
+
+#ifdef LV_HAVE_SSSE3
+static inline void
+volk_8u_x3_encodepolarpuppet_8u_a_ssse3(unsigned char* frame, unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits, const unsigned char* info_bits,
+ unsigned int frame_size)
+{
+ frame_size = next_lower_power_of_two(frame_size);
+ unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment());
+ adjust_frozen_mask(frozen_bit_mask, frame_size);
+ volk_8u_x3_encodepolar_8u_x2_a_ssse3(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ volk_free(temp);
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X3_ENCODEPOLARPUPPET_8U_A_H_ */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_8u_x4_conv_k7_r2_8u
+ *
+ * \b Overview
+ *
+ * Performs convolutional decoding for a K=7, rate 1/2 convolutional
+ * code. The polynomials are user-defined.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_8u_x4_conv_k7_r2_8u(unsigned char* Y, unsigned char* X, unsigned char* syms, unsigned char* dec, unsigned int framebits, unsigned int excess, unsigned char* Branchtab)
+ * \endcode
+ *
+ * \b Inputs
+ * \li X: <FIXME>
+ * \li syms: <FIXME>
+ * \li dec: <FIXME>
+ * \li framebits: size of the frame to decode in bits.
+ * \li excess: <FIXME>
+ * \li Branchtab: <FIXME>
+ *
+ * \b Outputs
+ * \li Y: The decoded output bits.
+ *
+ * \b Example
+ * \code
+ * // Minimal usage sketch; the buffer sizes below are assumptions for the
+ * // 64-state (K=7), rate 1/2 code and are not taken from the original docs.
+ * unsigned int framebits = 100, excess = 6;
+ * size_t al = volk_get_alignment();
+ * unsigned char* Y = (unsigned char*)volk_malloc(64, al);
+ * unsigned char* X = (unsigned char*)volk_malloc(64, al);
+ * unsigned char* syms = (unsigned char*)volk_malloc(2 * (framebits + excess), al);
+ * unsigned char* dec = (unsigned char*)volk_malloc(8 * (framebits + excess), al);
+ * unsigned char* Branchtab = (unsigned char*)volk_malloc(64, al);
+ * // ... fill X (initial path metrics), syms and Branchtab ...
+ * volk_8u_x4_conv_k7_r2_8u(Y, X, syms, dec, framebits, excess, Branchtab);
+ * volk_free(Y); volk_free(X); volk_free(syms); volk_free(dec); volk_free(Branchtab);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
+#define INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
+
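+/* Packs the 64 per-state survivor decisions of one trellis step into 8 bytes,
+ * with overlapping byte/short/word views of the same storage. */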
+typedef union {
+ unsigned char/*DECISIONTYPE*/ t[64/*NUMSTATES*//8/*DECISIONTYPE_BITSIZE*/];
+ unsigned int w[64/*NUMSTATES*//32];
+ unsigned short s[64/*NUMSTATES*//16];
+ unsigned char c[64/*NUMSTATES*//8];
+#ifdef _MSC_VER
+} decision_t;
+#else
+} decision_t __attribute__ ((aligned (16)));
+#endif
+
+
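+/* Subtract the minimum path metric from all 64 state metrics so the 8-bit
+ * accumulators do not overflow; the threshold check is currently commented out. */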
+static inline void
+renormalize(unsigned char* X, unsigned char threshold)
+{
+ int NUMSTATES = 64;
+ int i;
+
+ unsigned char min=X[0];
+ //if(min > threshold) {
+ for(i=0;i<NUMSTATES;i++)
+ if (min>X[i])
+ min=X[i];
+ for(i=0;i<NUMSTATES;i++)
+ X[i]-=min;
+ //}
+}
+
+
+//helper BFLY for GENERIC version
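+// One Viterbi add-compare-select butterfly: it forms the two candidate metrics for
+// the pair of successor states, keeps the smaller of each, and records the decision
+// bits in the packed decision_t array.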
+static inline void
+BFLY(int i, int s, unsigned char * syms, unsigned char *Y,
+ unsigned char *X, decision_t * d, unsigned char* Branchtab)
+{
+ int j, decision0, decision1;
+ unsigned char metric,m0,m1,m2,m3;
+
+ int NUMSTATES = 64;
+ int RATE = 2;
+ int METRICSHIFT = 1;
+ int PRECISIONSHIFT = 2;
+
+ metric =0;
+ for(j=0;j<RATE;j++)
+ metric += (Branchtab[i+j*NUMSTATES/2] ^ syms[s*RATE+j])>>METRICSHIFT;
+ metric=metric>>PRECISIONSHIFT;
+
+ unsigned char max = ((RATE*((256 -1)>>METRICSHIFT))>>PRECISIONSHIFT);
+
+ m0 = X[i] + metric;
+ m1 = X[i+NUMSTATES/2] + (max - metric);
+ m2 = X[i] + (max - metric);
+ m3 = X[i+NUMSTATES/2] + metric;
+
+ decision0 = (signed int)(m0-m1) > 0;
+ decision1 = (signed int)(m2-m3) > 0;
+
+ Y[2*i] = decision0 ? m1 : m0;
+ Y[2*i+1] = decision1 ? m3 : m2;
+
+ d->w[i/(sizeof(unsigned int)*8/2)+s*(sizeof(decision_t)/sizeof(unsigned int))] |=
+ (decision0|decision1<<1) << ((2*i)&(sizeof(unsigned int)*8-1));
+}
+
+
+#if LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+#include <emmintrin.h>
+#include <xmmintrin.h>
+#include <mmintrin.h>
+#include <stdio.h>
+
+static inline void
+volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y, unsigned char* X,
+ unsigned char* syms, unsigned char* dec,
+ unsigned int framebits, unsigned int excess,
+ unsigned char* Branchtab)
+{
+ unsigned int i9;
+ for(i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
+ unsigned char a75, a81;
+ int a73, a92;
+ short int s20, s21, s26, s27;
+ unsigned char *a74, *a80, *b6;
+ short int *a110, *a111, *a91, *a93, *a94;
+ __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83
+ , *a95, *a96, *a97, *a98, *a99;
+ __m128i a105, a106, a86, a87;
+ __m128i a100, a101, a103, a104, a107, a108, a109
+ , a76, a78, a79, a82, a84, a85, a88, a89
+ , a90, d10, d11, d12, d9, m23, m24, m25
+ , m26, m27, m28, m29, m30, s18, s19, s22
+ , s23, s24, s25, s28, s29, t13, t14, t15
+ , t16, t17, t18;
+ a71 = ((__m128i *) X);
+ s18 = *(a71);
+ a72 = (a71 + 2);
+ s19 = *(a72);
+ a73 = (4 * i9);
+ a74 = (syms + a73);
+ a75 = *(a74);
+ a76 = _mm_set1_epi8(a75);
+ a77 = ((__m128i *) Branchtab);
+ a78 = *(a77);
+ a79 = _mm_xor_si128(a76, a78);
+ b6 = (a73 + syms);
+ a80 = (b6 + 1);
+ a81 = *(a80);
+ a82 = _mm_set1_epi8(a81);
+ a83 = (a77 + 2);
+ a84 = *(a83);
+ a85 = _mm_xor_si128(a82, a84);
+ t13 = _mm_avg_epu8(a79,a85);
+ a86 = ((__m128i ) t13);
+ a87 = _mm_srli_epi16(a86, 2);
+ a88 = ((__m128i ) a87);
+ t14 = _mm_and_si128(a88, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63
+ , 63, 63, 63, 63, 63, 63, 63, 63
+ , 63));
+ t15 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63
+ , 63, 63, 63, 63, 63, 63, 63, 63
+ , 63), t14);
+ m23 = _mm_adds_epu8(s18, t14);
+ m24 = _mm_adds_epu8(s19, t15);
+ m25 = _mm_adds_epu8(s18, t15);
+ m26 = _mm_adds_epu8(s19, t14);
+ a89 = _mm_min_epu8(m24, m23);
+ d9 = _mm_cmpeq_epi8(a89, m24);
+ a90 = _mm_min_epu8(m26, m25);
+ d10 = _mm_cmpeq_epi8(a90, m26);
+ s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9,d10));
+ a91 = ((short int *) dec);
+ a92 = (8 * i9);
+ a93 = (a91 + a92);
+ *(a93) = s20;
+ s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9,d10));
+ a94 = (a93 + 1);
+ *(a94) = s21;
+ s22 = _mm_unpacklo_epi8(a89, a90);
+ s23 = _mm_unpackhi_epi8(a89, a90);
+ a95 = ((__m128i *) Y);
+ *(a95) = s22;
+ a96 = (a95 + 1);
+ *(a96) = s23;
+ a97 = (a71 + 1);
+ s24 = *(a97);
+ a98 = (a71 + 3);
+ s25 = *(a98);
+ a99 = (a77 + 1);
+ a100 = *(a99);
+ a101 = _mm_xor_si128(a76, a100);
+ a102 = (a77 + 3);
+ a103 = *(a102);
+ a104 = _mm_xor_si128(a82, a103);
+ t16 = _mm_avg_epu8(a101,a104);
+ a105 = ((__m128i ) t16);
+ a106 = _mm_srli_epi16(a105, 2);
+ a107 = ((__m128i ) a106);
+ t17 = _mm_and_si128(a107, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63
+ , 63, 63, 63, 63, 63, 63, 63, 63
+ , 63));
+ t18 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63
+ , 63, 63, 63, 63, 63, 63, 63, 63
+ , 63), t17);
+ m27 = _mm_adds_epu8(s24, t17);
+ m28 = _mm_adds_epu8(s25, t18);
+ m29 = _mm_adds_epu8(s24, t18);
+ m30 = _mm_adds_epu8(s25, t17);
+ a108 = _mm_min_epu8(m28, m27);
+ d11 = _mm_cmpeq_epi8(a108, m28);
+ a109 = _mm_min_epu8(m30, m29);
+ d12 = _mm_cmpeq_epi8(a109, m30);
+ s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11,d12));
+ a110 = (a93 + 2);
+ *(a110) = s26;
+ s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11,d12));
+ a111 = (a93 + 3);
+ *(a111) = s27;
+ s28 = _mm_unpacklo_epi8(a108, a109);
+ s29 = _mm_unpackhi_epi8(a108, a109);
+ a112 = (a95 + 2);
+ *(a112) = s28;
+ a113 = (a95 + 3);
+ *(a113) = s29;
+ if ((((unsigned char *) Y)[0]>210)) {
+ __m128i m5, m6;
+ m5 = ((__m128i *) Y)[0];
+ m5 = _mm_min_epu8(m5, ((__m128i *) Y)[1]);
+ m5 = _mm_min_epu8(m5, ((__m128i *) Y)[2]);
+ m5 = _mm_min_epu8(m5, ((__m128i *) Y)[3]);
+ __m128i m7;
+ m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
+ m7 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 32)), ((__m128i ) m7)));
+ m7 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 16)), ((__m128i ) m7)));
+ m7 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 8)), ((__m128i ) m7)));
+ m7 = _mm_unpacklo_epi8(m7, m7);
+ m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
+ m6 = _mm_unpacklo_epi64(m7, m7);
+ ((__m128i *) Y)[0] = _mm_subs_epu8(((__m128i *) Y)[0], m6);
+ ((__m128i *) Y)[1] = _mm_subs_epu8(((__m128i *) Y)[1], m6);
+ ((__m128i *) Y)[2] = _mm_subs_epu8(((__m128i *) Y)[2], m6);
+ ((__m128i *) Y)[3] = _mm_subs_epu8(((__m128i *) Y)[3], m6);
+ }
+ unsigned char a188, a194;
+ int a186, a205;
+ short int s48, s49, s54, s55;
+ unsigned char *a187, *a193, *b15;
+ short int *a204, *a206, *a207, *a223, *a224, *b16;
+ __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210
+ , *a211, *a212, *a215, *a225, *a226;
+ __m128i a199, a200, a218, a219;
+ __m128i a189, a191, a192, a195, a197, a198, a201
+ , a202, a203, a213, a214, a216, a217, a220, a221
+ , a222, d17, d18, d19, d20, m39, m40, m41
+ , m42, m43, m44, m45, m46, s46, s47, s50
+ , s51, s52, s53, s56, s57, t25, t26, t27
+ , t28, t29, t30;
+ a184 = ((__m128i *) Y);
+ s46 = *(a184);
+ a185 = (a184 + 2);
+ s47 = *(a185);
+ a186 = (4 * i9);
+ b15 = (a186 + syms);
+ a187 = (b15 + 2);
+ a188 = *(a187);
+ a189 = _mm_set1_epi8(a188);
+ a190 = ((__m128i *) Branchtab);
+ a191 = *(a190);
+ a192 = _mm_xor_si128(a189, a191);
+ a193 = (b15 + 3);
+ a194 = *(a193);
+ a195 = _mm_set1_epi8(a194);
+ a196 = (a190 + 2);
+ a197 = *(a196);
+ a198 = _mm_xor_si128(a195, a197);
+ t25 = _mm_avg_epu8(a192,a198);
+ a199 = ((__m128i ) t25);
+ a200 = _mm_srli_epi16(a199, 2);
+ a201 = ((__m128i ) a200);
+ t26 = _mm_and_si128(a201, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63
+ , 63, 63, 63, 63, 63, 63, 63, 63
+ , 63));
+ t27 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63
+ , 63, 63, 63, 63, 63, 63, 63, 63
+ , 63), t26);
+ m39 = _mm_adds_epu8(s46, t26);
+ m40 = _mm_adds_epu8(s47, t27);
+ m41 = _mm_adds_epu8(s46, t27);
+ m42 = _mm_adds_epu8(s47, t26);
+ a202 = _mm_min_epu8(m40, m39);
+ d17 = _mm_cmpeq_epi8(a202, m40);
+ a203 = _mm_min_epu8(m42, m41);
+ d18 = _mm_cmpeq_epi8(a203, m42);
+ s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17,d18));
+ a204 = ((short int *) dec);
+ a205 = (8 * i9);
+ b16 = (a204 + a205);
+ a206 = (b16 + 4);
+ *(a206) = s48;
+ s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17,d18));
+ a207 = (b16 + 5);
+ *(a207) = s49;
+ s50 = _mm_unpacklo_epi8(a202, a203);
+ s51 = _mm_unpackhi_epi8(a202, a203);
+ a208 = ((__m128i *) X);
+ *(a208) = s50;
+ a209 = (a208 + 1);
+ *(a209) = s51;
+ a210 = (a184 + 1);
+ s52 = *(a210);
+ a211 = (a184 + 3);
+ s53 = *(a211);
+ a212 = (a190 + 1);
+ a213 = *(a212);
+ a214 = _mm_xor_si128(a189, a213);
+ a215 = (a190 + 3);
+ a216 = *(a215);
+ a217 = _mm_xor_si128(a195, a216);
+ t28 = _mm_avg_epu8(a214,a217);
+ a218 = ((__m128i ) t28);
+ a219 = _mm_srli_epi16(a218, 2);
+ a220 = ((__m128i ) a219);
+ t29 = _mm_and_si128(a220, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63
+ , 63, 63, 63, 63, 63, 63, 63, 63
+ , 63));
+ t30 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63
+ , 63, 63, 63, 63, 63, 63, 63, 63
+ , 63), t29);
+ m43 = _mm_adds_epu8(s52, t29);
+ m44 = _mm_adds_epu8(s53, t30);
+ m45 = _mm_adds_epu8(s52, t30);
+ m46 = _mm_adds_epu8(s53, t29);
+ a221 = _mm_min_epu8(m44, m43);
+ d19 = _mm_cmpeq_epi8(a221, m44);
+ a222 = _mm_min_epu8(m46, m45);
+ d20 = _mm_cmpeq_epi8(a222, m46);
+ s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19,d20));
+ a223 = (b16 + 6);
+ *(a223) = s54;
+ s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19,d20));
+ a224 = (b16 + 7);
+ *(a224) = s55;
+ s56 = _mm_unpacklo_epi8(a221, a222);
+ s57 = _mm_unpackhi_epi8(a221, a222);
+ a225 = (a208 + 2);
+ *(a225) = s56;
+ a226 = (a208 + 3);
+ *(a226) = s57;
+ if ((((unsigned char *) X)[0]>210)) {
+ __m128i m12, m13;
+ m12 = ((__m128i *) X)[0];
+ m12 = _mm_min_epu8(m12, ((__m128i *) X)[1]);
+ m12 = _mm_min_epu8(m12, ((__m128i *) X)[2]);
+ m12 = _mm_min_epu8(m12, ((__m128i *) X)[3]);
+ __m128i m14;
+ m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
+ m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 32)), ((__m128i ) m14)));
+ m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 16)), ((__m128i ) m14)));
+ m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 8)), ((__m128i ) m14)));
+ m14 = _mm_unpacklo_epi8(m14, m14);
+ m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
+ m13 = _mm_unpacklo_epi64(m14, m14);
+ ((__m128i *) X)[0] = _mm_subs_epu8(((__m128i *) X)[0], m13);
+ ((__m128i *) X)[1] = _mm_subs_epu8(((__m128i *) X)[1], m13);
+ ((__m128i *) X)[2] = _mm_subs_epu8(((__m128i *) X)[2], m13);
+ ((__m128i *) X)[3] = _mm_subs_epu8(((__m128i *) X)[3], m13);
+ }
+ }
+
+ renormalize(X, 210);
+
+ /*int ch;
+ for(ch = 0; ch < 64; ch++) {
+ printf("%d,", X[ch]);
+ }
+ printf("\n");*/
+
+ unsigned int j;
+ for(j=0; j < (framebits + excess) % 2; ++j) {
+ int i;
+ for(i=0;i<64/2;i++){
+ BFLY(i, (((framebits+excess) >> 1) << 1) + j , syms, Y, X, (decision_t *)dec, Branchtab);
+ }
+
+
+ renormalize(Y, 210);
+
+ /*printf("\n");
+ for(ch = 0; ch < 64; ch++) {
+ printf("%d,", Y[ch]);
+ }
+ printf("\n");*/
+
+ }
+ /*skip*/
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+
+#if LV_HAVE_GENERIC
+
+static inline void
+volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y, unsigned char* X,
+ unsigned char* syms, unsigned char* dec,
+ unsigned int framebits, unsigned int excess,
+ unsigned char* Branchtab)
+{
+ int nbits = framebits + excess;
+ int NUMSTATES = 64;
+ int RENORMALIZE_THRESHOLD = 210;
+
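+ // One trellis step per input bit: run all NUMSTATES/2 butterflies, renormalize
+ // the metrics, then swap the old and new metric buffers.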
+ int s,i;
+ for (s=0;s<nbits;s++){
+ void *tmp;
+ for(i=0;i<NUMSTATES/2;i++){
+ BFLY(i, s, syms, Y, X, (decision_t *)dec, Branchtab);
+ }
+
+ renormalize(Y, RENORMALIZE_THRESHOLD);
+
+ /// Swap pointers to old and new metrics
+ tmp = (void *)X;
+ X = Y;
+ Y = (unsigned char*)tmp;
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+#endif /*INCLUDED_volk_8u_x4_conv_k7_r2_8u_H*/
--- /dev/null
+#
+# Copyright 2011-2012,2014 Free Software Foundation, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+########################################################################
+# header file detection
+########################################################################
+include(CheckIncludeFile)
+CHECK_INCLUDE_FILE(cpuid.h HAVE_CPUID_H)
+if(HAVE_CPUID_H)
+ add_definitions(-DHAVE_CPUID_H)
+endif()
+
+CHECK_INCLUDE_FILE(intrin.h HAVE_INTRIN_H)
+if(HAVE_INTRIN_H)
+ add_definitions(-DHAVE_INTRIN_H)
+endif()
+
+CHECK_INCLUDE_FILE(fenv.h HAVE_FENV_H)
+if(HAVE_FENV_H)
+ add_definitions(-DHAVE_FENV_H)
+endif()
+
+CHECK_INCLUDE_FILE(dlfcn.h HAVE_DLFCN_H)
+if(HAVE_DLFCN_H)
+ add_definitions(-DHAVE_DLFCN_H)
+ list(APPEND volk_libraries ${CMAKE_DL_LIBS})
+endif()
+
+########################################################################
+# Setup the compiler name
+########################################################################
+set(COMPILER_NAME ${CMAKE_C_COMPILER_ID})
+if(MSVC) #it's not set otherwise
+ set(COMPILER_NAME MSVC)
+endif()
+
+message(STATUS "Compiler name: ${COMPILER_NAME}")
+
+if(NOT DEFINED COMPILER_NAME)
+ message(FATAL_ERROR "COMPILER_NAME undefined. Volk build may not support this compiler.")
+endif()
+
+########################################################################
+# Special clang flag so flag checks can fail
+########################################################################
+if(COMPILER_NAME MATCHES "GNU")
+ include(CheckCXXCompilerFlag)
+ CHECK_CXX_COMPILER_FLAG("-Werror=unused-command-line-argument" HAVE_WERROR_UNUSED_CMD_LINE_ARG)
+ if(HAVE_WERROR_UNUSED_CMD_LINE_ARG)
+ set(VOLK_FLAG_CHECK_FLAGS "-Werror=unused-command-line-argument")
+ endif()
+endif()
+
+########################################################################
+# check for posix_memalign, since some OSs do not internally define
+# _XOPEN_SOURCE or _POSIX_C_SOURCE; they leave this to the user.
+########################################################################
+
+include(CheckSymbolExists)
+CHECK_SYMBOL_EXISTS(posix_memalign stdlib.h HAVE_POSIX_MEMALIGN)
+
+if(HAVE_POSIX_MEMALIGN)
+ add_definitions(-DHAVE_POSIX_MEMALIGN)
+endif(HAVE_POSIX_MEMALIGN)
+
+########################################################################
+# detect x86 flavor of CPU
+########################################################################
+if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(i.86|x86|x86_64|amd64|AMD64)$")
+ message(STATUS "x86* CPU detected")
+ set(CPU_IS_x86 TRUE)
+endif()
+
+########################################################################
+# determine passing architectures based on compile flag tests
+########################################################################
+execute_process(
+ COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B}
+ ${PROJECT_SOURCE_DIR}/gen/volk_compile_utils.py
+ --mode "arch_flags" --compiler "${COMPILER_NAME}"
+ OUTPUT_VARIABLE arch_flag_lines OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
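+# check_arch(<arch_name> [compiler flags...]): the arch is appended to
+# available_archs only if the compiler accepts every one of its flags.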
+macro(check_arch arch_name)
+ set(flags ${ARGN})
+ set(have_${arch_name} TRUE)
+ foreach(flag ${flags})
+ if (MSVC AND (${flag} STREQUAL "/arch:SSE2" OR ${flag} STREQUAL "/arch:SSE" ))
+ # SSE/SSE2 is supported in MSVC since VS 2005 but flag not available when compiling 64-bit so do not check
+ else()
+ include(CheckCXXCompilerFlag)
+ set(have_flag have${flag})
+ #make the have_flag have nice alphanum chars (just for looks/not necessary)
+ execute_process(
+ COMMAND ${PYTHON_EXECUTABLE} -c "import re; print(re.sub('\\W', '_', '${have_flag}'))"
+ OUTPUT_VARIABLE have_flag OUTPUT_STRIP_TRAILING_WHITESPACE
+ )
+ if(VOLK_FLAG_CHECK_FLAGS)
+ set(CMAKE_REQUIRED_FLAGS ${VOLK_FLAG_CHECK_FLAGS})
+ endif()
+ CHECK_CXX_COMPILER_FLAG(${flag} ${have_flag})
+ unset(CMAKE_REQUIRED_FLAGS)
+ if (NOT ${have_flag})
+ set(have_${arch_name} FALSE)
+ endif()
+ endif()
+ endforeach()
+ if (have_${arch_name})
+ list(APPEND available_archs ${arch_name})
+ endif()
+endmacro(check_arch)
+
+foreach(line ${arch_flag_lines})
+ string(REGEX REPLACE "," ";" arch_flags ${line})
+ check_arch(${arch_flags})
+endforeach(line)
+
+macro(OVERRULE_ARCH arch reason)
+ message(STATUS "${reason}, Overruled arch ${arch}")
+ list(REMOVE_ITEM available_archs ${arch})
+endmacro(OVERRULE_ARCH)
+
+########################################################################
+# eliminate AVX if not on x86, or if the compiler does not accept
+# the xgetbv instruction, or (when not cross-compiling) if the xgetbv
+# test executable does not run correctly.
+########################################################################
+set(HAVE_XGETBV 0)
+set(HAVE_AVX_CVTPI32_PS 0)
+if(CPU_IS_x86)
+ # check to see if the compiler/linker works with the xgetbv instruction
+ if (NOT MSVC)
+ file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_xgetbv.c "unsigned long long _xgetbv(unsigned int index) { unsigned int eax, edx; __asm__ __volatile__(\"xgetbv\" : \"=a\"(eax), \"=d\"(edx) : \"c\"(index)); return ((unsigned long long)edx << 32) | eax; } int main (void) { (void) _xgetbv(0); return (0); }")
+ else (NOT MSVC)
+ #MSVC defines an intrinsic
+ file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_xgetbv.c "#include <stdio.h> \n #include <intrin.h> \n int main() { int avxSupported = 0; \n#if (_MSC_FULL_VER >= 160040219) \nint cpuInfo[4]; __cpuid(cpuInfo, 1);\nif ((cpuInfo[2] & (1 << 27) || 0) && (cpuInfo[2] & (1 << 28) || 0)) \n{\nunsigned long long xcrFeatureMask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);\n avxSupported = (xcrFeatureMask & 0x6) == 6;}\n#endif \n return 1- avxSupported; }")
+ endif(NOT MSVC)
+ execute_process(COMMAND ${CMAKE_C_COMPILER} -o
+ ${CMAKE_CURRENT_BINARY_DIR}/test_xgetbv
+ ${CMAKE_CURRENT_BINARY_DIR}/test_xgetbv.c
+ OUTPUT_QUIET ERROR_QUIET
+ RESULT_VARIABLE avx_compile_result)
+ if(NOT ${avx_compile_result} EQUAL 0)
+ OVERRULE_ARCH(avx "Compiler or linker missing xgetbv instruction")
+ elseif(NOT CROSSCOMPILE_MULTILIB)
+ execute_process(COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_xgetbv
+ OUTPUT_QUIET ERROR_QUIET
+ RESULT_VARIABLE avx_exe_result)
+ if(NOT ${avx_exe_result} EQUAL 0)
+ OVERRULE_ARCH(avx "CPU missing xgetbv")
+ else()
+ set(HAVE_XGETBV 1)
+ endif()
+ else()
+ # cross compiling and compiler/linker seems to work; assume working
+ set(HAVE_XGETBV 1)
+ endif()
+ file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/test_xgetbv
+ ${CMAKE_CURRENT_BINARY_DIR}/test_xgetbv.c)
+
+ #########################################################################
+ # eliminate AVX if cvtpi32_ps intrinsic fails like some versions of clang
+ #########################################################################
+
+ # check to see if the compiler/linker works with the cvtpi32_ps intrinsic when using AVX
+ if (CMAKE_SIZEOF_VOID_P EQUAL 4)
+ file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps.c "#include <immintrin.h>\nint main (void) {__m128 __a; __m64 __b; __m128 foo = _mm_cvtpi32_ps(__a, __b); return (0); }")
+ execute_process(COMMAND ${CMAKE_C_COMPILER} -mavx -o
+ ${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps
+ ${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps.c
+ OUTPUT_QUIET ERROR_QUIET
+ RESULT_VARIABLE avx_compile_result)
+ if(NOT ${avx_compile_result} EQUAL 0)
+ OVERRULE_ARCH(avx "Compiler missing cvtpi32_ps instrinsic")
+ elseif(NOT CROSSCOMPILE_MULTILIB)
+ execute_process(COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps
+ OUTPUT_QUIET ERROR_QUIET
+ RESULT_VARIABLE avx_exe_result)
+ if(NOT ${avx_exe_result} EQUAL 0)
+ OVERRULE_ARCH(avx "CPU missing cvtpi32_ps")
+ else()
+ set(HAVE_AVX_CVTPI32_PS 1)
+ endif()
+ else()
+ set(HAVE_AVX_CVTPI32_PS 1)
+ endif()
+ file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps
+ ${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps.c)
+ else(CMAKE_SIZEOF_VOID_P EQUAL 4)
+ # 64-bit compilations do not use this intrinsic, so do not overrule AVX
+ set(HAVE_AVX_CVTPI32_PS 0)
+ endif(CMAKE_SIZEOF_VOID_P EQUAL 4)
+
+ # Disable SSE4a if Clang is less than version 3.2
+ if("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang")
+ # Figure out the version of Clang
+ if(CMAKE_VERSION VERSION_LESS "2.8.10")
+ # Extract the Clang version from the --version string.
+ # In cmake 2.8.10, we can just use CMAKE_C_COMPILER_VERSION
+ # without having to go through these string manipulations
+ execute_process(COMMAND ${CMAKE_C_COMPILER} --version
+ OUTPUT_VARIABLE clang_version)
+ string(REGEX MATCH "[0-9].[0-9]" CMAKE_C_COMPILER_VERSION ${clang_version})
+ endif(CMAKE_VERSION VERSION_LESS "2.8.10")
+
+ if(CMAKE_C_COMPILER_VERSION VERSION_LESS "3.2")
+ OVERRULE_ARCH(sse4_a "Clang >= 3.2 required for SSE4a")
+ endif(CMAKE_C_COMPILER_VERSION VERSION_LESS "3.2")
+ endif("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang")
+
+endif(CPU_IS_x86)
+
+if(${HAVE_XGETBV})
+ add_definitions(-DHAVE_XGETBV)
+endif()
+
+if(${HAVE_AVX_CVTPI32_PS})
+ add_definitions(-DHAVE_AVX_CVTPI32_PS)
+endif()
+
+########################################################################
+# if the CPU is not x86, eliminate all Intel SIMD
+########################################################################
+
+if(NOT CPU_IS_x86)
+ OVERRULE_ARCH(3dnow "Architecture is not x86 or x86_64")
+ OVERRULE_ARCH(mmx "Architecture is not x86 or x86_64")
+ OVERRULE_ARCH(sse "Architecture is not x86 or x86_64")
+ OVERRULE_ARCH(sse2 "Architecture is not x86 or x86_64")
+ OVERRULE_ARCH(sse3 "Architecture is not x86 or x86_64")
+ OVERRULE_ARCH(ssse3 "Architecture is not x86 or x86_64")
+ OVERRULE_ARCH(sse4_a "Architecture is not x86 or x86_64")
+ OVERRULE_ARCH(sse4_1 "Architecture is not x86 or x86_64")
+ OVERRULE_ARCH(sse4_2 "Architecture is not x86 or x86_64")
+ OVERRULE_ARCH(avx "Architecture is not x86 or x86_64")
+endif(NOT CPU_IS_x86)
+
+########################################################################
+# implement overruling in the ORC case,
+# since ORC always passes flag detection
+########################################################################
+if(NOT ORC_FOUND)
+ OVERRULE_ARCH(orc "ORC support not found")
+endif()
+
+########################################################################
+# implement overruling in the non-multilib case
+# this makes things work when both -m32 and -m64 pass
+########################################################################
+if(NOT CROSSCOMPILE_MULTILIB AND CPU_IS_x86)
+ include(CheckTypeSize)
+ check_type_size("void*[8]" SIZEOF_CPU BUILTIN_TYPES_ONLY)
+ if (${SIZEOF_CPU} EQUAL 64)
+ OVERRULE_ARCH(32 "CPU width is 64 bits")
+ endif()
+ if (${SIZEOF_CPU} EQUAL 32)
+ OVERRULE_ARCH(64 "CPU width is 32 bits")
+ endif()
+
+ #MSVC 64 bit does not have MMX, overrule it
+ if (${SIZEOF_CPU} EQUAL 64 AND MSVC)
+ OVERRULE_ARCH(mmx "No MMX for Win64")
+ endif()
+
+endif()
+
+########################################################################
+# done overrules! print the result
+########################################################################
+message(STATUS "Available architectures: ${available_archs}")
+
+########################################################################
+# determine available machines given the available architectures
+########################################################################
+execute_process(
+ COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B}
+ ${PROJECT_SOURCE_DIR}/gen/volk_compile_utils.py
+ --mode "machines" --archs "${available_archs}"
+ OUTPUT_VARIABLE available_machines OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+########################################################################
+# Implement machine overruling for redundant machines:
+# A machine is redundant when expansion rules occur,
+# and the arch superset passes configuration checks.
+# When this occurs, eliminate the redundant machines
+# to avoid unnecessary compilation of subset machines.
+########################################################################
+foreach(arch mmx orc 64 32)
+ foreach(machine_name ${available_machines})
+ string(REPLACE "_${arch}" "" machine_name_no_arch ${machine_name})
+ if (${machine_name} STREQUAL ${machine_name_no_arch})
+ else()
+ list(REMOVE_ITEM available_machines ${machine_name_no_arch})
+ endif()
+ endforeach(machine_name)
+endforeach(arch)
+
+########################################################################
+# done overrules! print the result
+########################################################################
+message(STATUS "Available machines: ${available_machines}")
+
+########################################################################
+# Create rules to run the volk generator
+########################################################################
+
+#dependencies are all python, xml, and header implementation files
+file(GLOB xml_files ${PROJECT_SOURCE_DIR}/gen/*.xml)
+file(GLOB py_files ${PROJECT_SOURCE_DIR}/gen/*.py)
+file(GLOB h_files ${PROJECT_SOURCE_DIR}/kernels/volk/*.h)
+
+macro(gen_template tmpl output)
+ list(APPEND volk_gen_sources ${output})
+ add_custom_command(
+ OUTPUT ${output}
+ DEPENDS ${xml_files} ${py_files} ${h_files} ${tmpl}
+ COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B}
+ ${PROJECT_SOURCE_DIR}/gen/volk_tmpl_utils.py
+ --input ${tmpl} --output ${output} ${ARGN}
+ )
+endmacro(gen_template)
+
+make_directory(${PROJECT_BINARY_DIR}/include/volk)
+
+gen_template(${PROJECT_SOURCE_DIR}/tmpl/volk.tmpl.h ${PROJECT_BINARY_DIR}/include/volk/volk.h)
+gen_template(${PROJECT_SOURCE_DIR}/tmpl/volk.tmpl.c ${PROJECT_BINARY_DIR}/lib/volk.c)
+gen_template(${PROJECT_SOURCE_DIR}/tmpl/volk_typedefs.tmpl.h ${PROJECT_BINARY_DIR}/include/volk/volk_typedefs.h)
+gen_template(${PROJECT_SOURCE_DIR}/tmpl/volk_cpu.tmpl.h ${PROJECT_BINARY_DIR}/include/volk/volk_cpu.h)
+gen_template(${PROJECT_SOURCE_DIR}/tmpl/volk_cpu.tmpl.c ${PROJECT_BINARY_DIR}/lib/volk_cpu.c)
+gen_template(${PROJECT_SOURCE_DIR}/tmpl/volk_config_fixed.tmpl.h ${PROJECT_BINARY_DIR}/include/volk/volk_config_fixed.h)
+gen_template(${PROJECT_SOURCE_DIR}/tmpl/volk_machines.tmpl.h ${PROJECT_BINARY_DIR}/lib/volk_machines.h)
+gen_template(${PROJECT_SOURCE_DIR}/tmpl/volk_machines.tmpl.c ${PROJECT_BINARY_DIR}/lib/volk_machines.c)
+
+set(BASE_CFLAGS NONE)
+string(TOUPPER ${CMAKE_BUILD_TYPE} CBTU)
+message(STATUS "BUILD TYPE = ${CBTU}")
+message(STATUS "Base cflags = ${CMAKE_C_FLAGS_${CBTU}} ${CMAKE_C_FLAGS}")
+set(COMPILER_INFO "")
+if(MSVC)
+ if(MSVC90) #Visual Studio 9
+ set(cmake_c_compiler_version "Microsoft Visual Studio 9.0")
+ elseif(MSVC10) #Visual Studio 10
+ set(cmake_c_compiler_version "Microsoft Visual Studio 10.0")
+ elseif(MSVC11) #Visual Studio 11
+ set(cmake_c_compiler_version "Microsoft Visual Studio 11.0")
+ elseif(MSVC12) #Visual Studio 12
+ SET(cmake_c_compiler_version "Microsoft Visual Studio 12.0")
+ elseif(MSVC14) #Visual Studio 14
+ SET(cmake_c_compiler_version "Microsoft Visual Studio 14.0")
+ endif()
+else()
+ execute_process(COMMAND ${CMAKE_C_COMPILER} --version
+ OUTPUT_VARIABLE cmake_c_compiler_version)
+endif(MSVC)
+set(COMPILER_INFO "${CMAKE_C_COMPILER}:::${CMAKE_C_FLAGS_${CBTU}} ${CMAKE_C_FLAGS}\n${CMAKE_CXX_COMPILER}:::${CMAKE_CXX_FLAGS_${CBTU}} ${CMAKE_CXX_FLAGS}\n" )
+
+foreach(machine_name ${available_machines})
+ #generate machine source
+ set(machine_source ${CMAKE_CURRENT_BINARY_DIR}/volk_machine_${machine_name}.c)
+ gen_template(${PROJECT_SOURCE_DIR}/tmpl/volk_machine_xxx.tmpl.c ${machine_source} ${machine_name})
+
+ #determine machine flags
+ execute_process(
+ COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B}
+ ${PROJECT_SOURCE_DIR}/gen/volk_compile_utils.py
+ --mode "machine_flags" --machine "${machine_name}" --compiler "${COMPILER_NAME}"
+ OUTPUT_VARIABLE ${machine_name}_flags OUTPUT_STRIP_TRAILING_WHITESPACE
+ )
+ MESSAGE(STATUS "BUILD INFO ::: ${machine_name} ::: ${COMPILER_NAME} ::: ${CMAKE_C_FLAGS_${CBTU}} ${CMAKE_C_FLAGS} ${${machine_name}_flags}")
+ set(COMPILER_INFO "${COMPILER_INFO}${machine_name}:::${COMPILER_NAME}:::${CMAKE_C_FLAGS_${CBTU}} ${CMAKE_C_FLAGS} ${${machine_name}_flags}\n" )
+ if(${machine_name}_flags AND NOT MSVC)
+ set_source_files_properties(${machine_source} PROPERTIES COMPILE_FLAGS "${${machine_name}_flags}")
+ endif()
+
+ #add to available machine defs
+ string(TOUPPER LV_MACHINE_${machine_name} machine_def)
+ list(APPEND machine_defs ${machine_def})
+endforeach(machine_name)
+
+# Convert to a C string to compile and display properly
+string(STRIP "${cmake_c_compiler_version}" cmake_c_compiler_version)
+string(STRIP ${COMPILER_INFO} COMPILER_INFO)
+MESSAGE(STATUS "Compiler Version: ${cmake_c_compiler_version}")
+string(REPLACE "\n" " \\n" cmake_c_compiler_version ${cmake_c_compiler_version})
+string(REPLACE "\n" " \\n" COMPILER_INFO ${COMPILER_INFO})
+
+########################################################################
+# Set local include directories first
+########################################################################
+include_directories(
+ ${PROJECT_BINARY_DIR}/include
+ ${PROJECT_SOURCE_DIR}/include
+ ${PROJECT_SOURCE_DIR}/kernels
+ ${CMAKE_CURRENT_BINARY_DIR}
+ ${CMAKE_CURRENT_SOURCE_DIR}
+)
+
+########################################################################
+# Handle ASM support
+# on by default, but let users turn it off
+########################################################################
+if(${CMAKE_VERSION} VERSION_GREATER "2.8.9")
+ set(ASM_ARCHS_AVAILABLE "neon")
+
+ set(FULL_C_FLAGS "${CMAKE_C_FLAGS}" "${CMAKE_CXX_COMPILER_ARG1}")
+
+ # sort through a list of all architectures we have ASM for
+ # if we find one that matches our current system architecture
+ # set up the assembler flags and include the source files
+ foreach(ARCH ${ASM_ARCHS_AVAILABLE})
+ string(REGEX MATCH "${ARCH}" ASM_ARCH "${available_archs}")
+ if( ASM_ARCH STREQUAL "neon" )
+ message(STATUS "---- Adding ASM files") # we always use ATT syntax
+ message(STATUS "-- Detected neon architecture; enabling ASM")
+ # setup architecture specific assembler flags
+ set(ARCH_ASM_FLAGS "-mfpu=neon -g")
+ # then add the files
+ include_directories(${PROJECT_SOURCE_DIR}/kernels/volk/asm/neon)
+ file(GLOB asm_files ${PROJECT_SOURCE_DIR}/kernels/volk/asm/neon/*.s)
+ foreach(asm_file ${asm_files})
+ list(APPEND volk_sources ${asm_file})
+ message(STATUS "Adding source file: ${asm_file}")
+ endforeach(asm_file)
+ endif()
+ enable_language(ASM)
+ set(CMAKE_ASM_FLAGS ${ARCH_ASM_FLAGS})
+ message(STATUS "c flags: ${FULL_C_FLAGS}")
+ message(STATUS "asm flags: ${CMAKE_ASM_FLAGS}")
+ endforeach(ARCH)
+
+else(${CMAKE_VERSION} VERSION_GREATER "2.8.9")
+ message(STATUS "Not enabling ASM support. CMake >= 2.8.10 required.")
+ foreach(machine_name ${available_machines})
+ string(REGEX MATCH "neon" NEON_MACHINE ${machine_name})
+ if( NEON_MACHINE STREQUAL "neon")
+ message(FATAL_ERROR "CMake >= 2.8.10 is required for ARM NEON support")
+ endif()
+ endforeach()
+endif(${CMAKE_VERSION} VERSION_GREATER "2.8.9")
+
+########################################################################
+# Handle orc support
+########################################################################
+if(ORC_FOUND)
+ #setup orc library usage
+ include_directories(${ORC_INCLUDE_DIRS})
+ link_directories(${ORC_LIBRARY_DIRS})
+ list(APPEND volk_libraries ${ORC_LIBRARIES})
+
+ #setup orc functions
+ file(GLOB orc_files ${PROJECT_SOURCE_DIR}/kernels/volk/asm/orc/*.orc)
+ foreach(orc_file ${orc_files})
+
+ #extract the name for the generated c source from the orc file
+ get_filename_component(orc_file_name_we ${orc_file} NAME_WE)
+ set(orcc_gen ${CMAKE_CURRENT_BINARY_DIR}/${orc_file_name_we}.c)
+
+ #create a rule to generate the source and add to the list of sources
+ add_custom_command(
+ COMMAND ${ORCC_EXECUTABLE} --include math.h --implementation -o ${orcc_gen} ${orc_file}
+ DEPENDS ${orc_file} OUTPUT ${orcc_gen}
+ )
+ list(APPEND volk_sources ${orcc_gen})
+
+ endforeach(orc_file)
+else()
+ message(STATUS "Did not find liborc and orcc, disabling orc support...")
+endif()
+
+########################################################################
+# Handle the generated constants
+########################################################################
+
+message(STATUS "Loading version ${VERSION} into constants...")
+
+#double escape for windows backslash path separators
+string(REPLACE "\\" "\\\\" prefix "${prefix}")
+
+configure_file(
+ ${CMAKE_CURRENT_SOURCE_DIR}/constants.c.in
+ ${CMAKE_CURRENT_BINARY_DIR}/constants.c
+@ONLY)
+
+list(APPEND volk_sources ${CMAKE_CURRENT_BINARY_DIR}/constants.c)
+
+########################################################################
+# Setup the volk sources list and library
+########################################################################
+if(NOT WIN32)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden")
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden")
+endif()
+
+list(APPEND volk_sources
+ ${CMAKE_CURRENT_SOURCE_DIR}/volk_prefs.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/volk_rank_archs.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/volk_malloc.c
+ ${volk_gen_sources}
+)
+
+#set the machine definitions where applicable
+set_source_files_properties(
+ ${CMAKE_CURRENT_BINARY_DIR}/volk.c
+ ${CMAKE_CURRENT_BINARY_DIR}/volk_machines.c
+PROPERTIES COMPILE_DEFINITIONS "${machine_defs}")
+
+if(MSVC)
+ #add compatibility includes for stdint types
+ include_directories(${PROJECT_SOURCE_DIR}/cmake/msvc)
+ add_definitions(-DHAVE_CONFIG_H)
+ #compile the sources as C++ due to the lack of complex.h under MSVC
+ set_source_files_properties(${volk_sources} PROPERTIES LANGUAGE CXX)
+endif()
+
+#Use object library for faster overall build in newer versions of cmake
+if(CMAKE_VERSION VERSION_GREATER "2.8.7")
+ #Create a volk object library (requires cmake >= 2.8.8)
+ add_library(volk_obj OBJECT ${volk_sources})
+
+ #Add dynamic library
+ add_library(volk SHARED $<TARGET_OBJECTS:volk_obj>)
+ target_link_libraries(volk ${volk_libraries})
+
+ #Configure target properties
+ set_target_properties(volk_obj PROPERTIES COMPILE_FLAGS "-fPIC")
+ set_target_properties(volk PROPERTIES SOVERSION ${LIBVER})
+ set_target_properties(volk PROPERTIES DEFINE_SYMBOL "volk_EXPORTS")
+
+ #Install locations
+ install(TARGETS volk
+ LIBRARY DESTINATION lib${LIB_SUFFIX} COMPONENT "volk_runtime" # .so file
+ ARCHIVE DESTINATION lib${LIB_SUFFIX} COMPONENT "volk_devel" # .lib file
+ RUNTIME DESTINATION bin COMPONENT "volk_runtime" # .dll file
+ )
+
+ #Configure static library
+ if(ENABLE_STATIC_LIBS)
+ add_library(volk_static STATIC $<TARGET_OBJECTS:volk_obj>)
+
+ set_target_properties(volk_static PROPERTIES OUTPUT_NAME volk)
+
+ install(TARGETS volk_static
+ ARCHIVE DESTINATION lib${LIB_SUFFIX} COMPONENT "volk_devel"
+ )
+ endif(ENABLE_STATIC_LIBS)
+
+#Older cmake versions (slower to build when building dynamic/static libs)
+else()
+ #create the volk runtime library
+ add_library(volk SHARED ${volk_sources})
+ target_link_libraries(volk ${volk_libraries})
+ set_target_properties(volk PROPERTIES SOVERSION ${LIBVER})
+ set_target_properties(volk PROPERTIES DEFINE_SYMBOL "volk_EXPORTS")
+
+ install(TARGETS volk
+ LIBRARY DESTINATION lib${LIB_SUFFIX} COMPONENT "volk_runtime" # .so file
+ ARCHIVE DESTINATION lib${LIB_SUFFIX} COMPONENT "volk_devel" # .lib file
+ RUNTIME DESTINATION bin COMPONENT "volk_runtime" # .dll file
+ )
+
+ if(ENABLE_STATIC_LIBS)
+ add_library(volk_static STATIC ${volk_sources})
+
+ if(NOT WIN32)
+ set_target_properties(volk_static
+ PROPERTIES OUTPUT_NAME volk)
+ endif(NOT WIN32)
+
+ install(TARGETS volk_static
+ ARCHIVE DESTINATION lib${LIB_SUFFIX} COMPONENT "volk_devel" # .lib file
+ )
+ endif(ENABLE_STATIC_LIBS)
+
+endif(CMAKE_VERSION VERSION_GREATER "2.8.7")
+########################################################################
+# Build the QA test application
+########################################################################
+if(ENABLE_TESTING)
+
+ #include Boost headers
+ include_directories(${Boost_INCLUDE_DIRS})
+ make_directory(${CMAKE_CURRENT_BINARY_DIR}/.unittest)
+ set_source_files_properties(
+ ${CMAKE_CURRENT_SOURCE_DIR}/testqa.cc PROPERTIES
+ COMPILE_DEFINITIONS "BOOST_TEST_DYN_LINK;BOOST_TEST_MAIN"
+ )
+
+ include(VolkAddTest)
+ VOLK_ADD_TEST(test_all
+ SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/testqa.cc
+ ${CMAKE_CURRENT_SOURCE_DIR}/qa_utils.cc
+ TARGET_DEPS volk
+ )
+
+endif(ENABLE_TESTING)
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2013, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#if HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <volk/constants.h>
+
+char*
+volk_prefix()
+{
+ const char *prefix = getenv("VOLK_PREFIX");
+ if (prefix != NULL) return (char *)prefix;
+ return "@prefix@";
+}
+
+char*
+volk_version()
+{
+ return "@VERSION@";
+}
+
+char*
+volk_c_compiler()
+{
+ return "@cmake_c_compiler_version@";
+}
+
+char*
+volk_compiler_flags()
+{
+ return "@COMPILER_INFO@";
+}
+
+char*
+volk_available_machines()
+{
+ return "@available_machines@";
+}
--- /dev/null
+#include "qa_utils.h"
+
+#include <volk/volk.h>
+
+#include <boost/assign/list_of.hpp>
+#include <vector>
+
+// macros for initializing volk_test_case_t. Macros are needed to generate
+// function names of the pattern kernel_name_*
+
+// for puppets we need to get all the func_variants for the puppet and just
+// keep track of the actual function name to write to results
+#define VOLK_INIT_PUPP(func, puppet_master_func, test_params)\
+ volk_test_case_t(func##_get_func_desc(), (void(*)())func##_manual, std::string(#func),\
+ std::string(#puppet_master_func), test_params)
+
+#define VOLK_INIT_TEST(func, test_params)\
+ volk_test_case_t(func##_get_func_desc(), (void(*)())func##_manual, std::string(#func),\
+ test_params)
+
+std::vector<volk_test_case_t> init_test_list(volk_test_params_t test_params)
+{
+
+ // Some kernels need a looser error tolerance
+ volk_test_params_t test_params_inacc = volk_test_params_t(1e-2, test_params.scalar(),
+ test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex());
+ volk_test_params_t test_params_inacc_tenth = volk_test_params_t(1e-1, test_params.scalar(),
+ test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex());
+ volk_test_params_t test_params_int1 = volk_test_params_t(1, test_params.scalar(),
+ test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex());
+
+ std::vector<volk_test_case_t> test_cases = boost::assign::list_of
+ (VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params))
+
+ (VOLK_INIT_PUPP(volk_16u_byteswappuppet_16u, volk_16u_byteswap, test_params))
+ (VOLK_INIT_PUPP(volk_32u_byteswappuppet_32u, volk_32u_byteswap, test_params))
+ (VOLK_INIT_PUPP(volk_32u_popcntpuppet_32u, volk_32u_popcnt_32u, test_params))
+ (VOLK_INIT_PUPP(volk_64u_byteswappuppet_64u, volk_64u_byteswap, test_params))
+ (VOLK_INIT_PUPP(volk_32fc_s32fc_rotatorpuppet_32fc, volk_32fc_s32fc_x2_rotator_32fc, test_params))
+ (VOLK_INIT_PUPP(volk_8u_conv_k7_r2puppet_8u, volk_8u_x4_conv_k7_r2_8u, volk_test_params_t(0, test_params.scalar(), test_params.vlen(), test_params.iter()/10, test_params.benchmark_mode(), test_params.kernel_regex())))
+ (VOLK_INIT_PUPP(volk_32f_x2_fm_detectpuppet_32f, volk_32f_s32f_32f_fm_detect_32f, test_params))
+ (VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_real_32f, test_params))
+ (VOLK_INIT_TEST(volk_16ic_deinterleave_real_8i, test_params))
+ (VOLK_INIT_TEST(volk_16ic_deinterleave_16i_x2, test_params))
+ (VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_32f_x2, test_params))
+ (VOLK_INIT_TEST(volk_16ic_deinterleave_real_16i, test_params))
+ (VOLK_INIT_TEST(volk_16ic_magnitude_16i, test_params_int1))
+ (VOLK_INIT_TEST(volk_16ic_s32f_magnitude_32f, test_params))
+ (VOLK_INIT_TEST(volk_16ic_convert_32fc, test_params))
+ (VOLK_INIT_TEST(volk_16ic_x2_multiply_16ic, test_params))
+ (VOLK_INIT_TEST(volk_16ic_x2_dot_prod_16ic, test_params))
+ (VOLK_INIT_TEST(volk_16i_s32f_convert_32f, test_params))
+ (VOLK_INIT_TEST(volk_16i_convert_8i, test_params))
+ (VOLK_INIT_TEST(volk_16i_32fc_dot_prod_32fc, test_params_inacc))
+ (VOLK_INIT_TEST(volk_32f_accumulator_s32f, test_params_inacc))
+ (VOLK_INIT_TEST(volk_32f_x2_add_32f, test_params))
+ (VOLK_INIT_TEST(volk_32f_index_max_16u, test_params))
+ (VOLK_INIT_TEST(volk_32f_index_max_32u, test_params))
+ (VOLK_INIT_TEST(volk_32fc_32f_multiply_32fc, test_params))
+ (VOLK_INIT_TEST(volk_32f_log2_32f, volk_test_params_t(3, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
+ (VOLK_INIT_TEST(volk_32f_expfast_32f, volk_test_params_t(1e-1, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
+ (VOLK_INIT_TEST(volk_32f_x2_pow_32f, volk_test_params_t(1e-2, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
+ (VOLK_INIT_TEST(volk_32f_sin_32f, test_params_inacc))
+ (VOLK_INIT_TEST(volk_32f_cos_32f, test_params_inacc))
+ (VOLK_INIT_TEST(volk_32f_tan_32f, test_params_inacc))
+ (VOLK_INIT_TEST(volk_32f_atan_32f, test_params_inacc))
+ (VOLK_INIT_TEST(volk_32f_asin_32f, test_params_inacc))
+ (VOLK_INIT_TEST(volk_32f_acos_32f, test_params_inacc))
+ (VOLK_INIT_TEST(volk_32fc_s32f_power_32fc, test_params))
+ (VOLK_INIT_TEST(volk_32f_s32f_calc_spectral_noise_floor_32f, test_params_inacc))
+ (VOLK_INIT_TEST(volk_32fc_s32f_atan2_32f, test_params))
+ (VOLK_INIT_TEST(volk_32fc_x2_conjugate_dot_prod_32fc, test_params_inacc))
+ (VOLK_INIT_TEST(volk_32fc_deinterleave_32f_x2, test_params))
+ (VOLK_INIT_TEST(volk_32fc_deinterleave_64f_x2, test_params))
+ (VOLK_INIT_TEST(volk_32fc_s32f_deinterleave_real_16i, test_params))
+ (VOLK_INIT_TEST(volk_32fc_deinterleave_imag_32f, test_params))
+ (VOLK_INIT_TEST(volk_32fc_deinterleave_real_32f, test_params))
+ (VOLK_INIT_TEST(volk_32fc_deinterleave_real_64f, test_params))
+ (VOLK_INIT_TEST(volk_32fc_x2_dot_prod_32fc, test_params_inacc))
+ (VOLK_INIT_TEST(volk_32fc_32f_dot_prod_32fc, test_params_inacc))
+ (VOLK_INIT_TEST(volk_32fc_index_max_16u, volk_test_params_t(3, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
+ (VOLK_INIT_TEST(volk_32fc_index_max_32u, volk_test_params_t(3, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
+ (VOLK_INIT_TEST(volk_32fc_s32f_magnitude_16i, test_params_int1))
+ (VOLK_INIT_TEST(volk_32fc_magnitude_32f, test_params_inacc_tenth))
+ (VOLK_INIT_TEST(volk_32fc_magnitude_squared_32f, test_params))
+ (VOLK_INIT_TEST(volk_32fc_x2_multiply_32fc, test_params))
+ (VOLK_INIT_TEST(volk_32fc_x2_multiply_conjugate_32fc, test_params))
+ (VOLK_INIT_TEST(volk_32fc_x2_divide_32fc, test_params))
+ (VOLK_INIT_TEST(volk_32fc_conjugate_32fc, test_params))
+ (VOLK_INIT_TEST(volk_32f_s32f_convert_16i, test_params))
+ (VOLK_INIT_TEST(volk_32f_s32f_convert_32i, volk_test_params_t(1, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
+ (VOLK_INIT_TEST(volk_32f_convert_64f, test_params))
+ (VOLK_INIT_TEST(volk_32f_s32f_convert_8i, volk_test_params_t(1, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
+ (VOLK_INIT_TEST(volk_32fc_convert_16ic, test_params))
+ (VOLK_INIT_TEST(volk_32fc_s32f_power_spectrum_32f, test_params))
+ (VOLK_INIT_TEST(volk_32fc_x2_square_dist_32f, test_params))
+ (VOLK_INIT_TEST(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, test_params))
+ (VOLK_INIT_TEST(volk_32f_x2_divide_32f, test_params))
+ (VOLK_INIT_TEST(volk_32f_x2_dot_prod_32f, test_params_inacc))
+ (VOLK_INIT_TEST(volk_32f_x2_s32f_interleave_16ic, volk_test_params_t(1, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
+ (VOLK_INIT_TEST(volk_32f_x2_interleave_32fc, test_params))
+ (VOLK_INIT_TEST(volk_32f_x2_max_32f, test_params))
+ (VOLK_INIT_TEST(volk_32f_x2_min_32f, test_params))
+ (VOLK_INIT_TEST(volk_32f_x2_multiply_32f, test_params))
+ (VOLK_INIT_TEST(volk_32f_s32f_normalize, test_params))
+ (VOLK_INIT_TEST(volk_32f_s32f_power_32f, test_params))
+ (VOLK_INIT_TEST(volk_32f_sqrt_32f, test_params_inacc))
+ (VOLK_INIT_TEST(volk_32f_s32f_stddev_32f, test_params_inacc))
+ (VOLK_INIT_TEST(volk_32f_stddev_and_mean_32f_x2, test_params_inacc))
+ (VOLK_INIT_TEST(volk_32f_x2_subtract_32f, test_params))
+ (VOLK_INIT_TEST(volk_32f_x3_sum_of_poly_32f, test_params_inacc))
+ (VOLK_INIT_TEST(volk_32i_x2_and_32i, test_params))
+ (VOLK_INIT_TEST(volk_32i_s32f_convert_32f, test_params))
+ (VOLK_INIT_TEST(volk_32i_x2_or_32i, test_params))
+ (VOLK_INIT_TEST(volk_32f_x2_dot_prod_16i, test_params))
+ (VOLK_INIT_TEST(volk_64f_convert_32f, test_params))
+ (VOLK_INIT_TEST(volk_64f_x2_max_64f, test_params))
+ (VOLK_INIT_TEST(volk_64f_x2_min_64f, test_params))
+ (VOLK_INIT_TEST(volk_8ic_deinterleave_16i_x2, test_params))
+ (VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_32f_x2, test_params))
+ (VOLK_INIT_TEST(volk_8ic_deinterleave_real_16i, test_params))
+ (VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_real_32f, test_params))
+ (VOLK_INIT_TEST(volk_8ic_deinterleave_real_8i, test_params))
+ (VOLK_INIT_TEST(volk_8ic_x2_multiply_conjugate_16ic, test_params))
+ (VOLK_INIT_TEST(volk_8ic_x2_s32f_multiply_conjugate_32fc, test_params))
+ (VOLK_INIT_TEST(volk_8i_convert_16i, test_params))
+ (VOLK_INIT_TEST(volk_8i_s32f_convert_32f, test_params))
+ (VOLK_INIT_TEST(volk_32fc_s32fc_multiply_32fc, test_params))
+ (VOLK_INIT_TEST(volk_32f_s32f_multiply_32f, test_params))
+ (VOLK_INIT_TEST(volk_32f_binary_slicer_32i, test_params))
+ (VOLK_INIT_TEST(volk_32f_binary_slicer_8i, test_params))
+ (VOLK_INIT_TEST(volk_32f_tanh_32f, test_params_inacc))
+ (VOLK_INIT_PUPP(volk_8u_x3_encodepolarpuppet_8u, volk_8u_x3_encodepolar_8u_x2, test_params))
+ (VOLK_INIT_PUPP(volk_32f_8u_polarbutterflypuppet_32f, volk_32f_8u_polarbutterfly_32f, test_params))
+ // no one uses these, so don't test them
+ //VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex);
+ //VOLK_PROFILE(volk_16i_branch_4_state_8, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex);
+ //VOLK_PROFILE(volk_16i_max_star_16i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
+ //VOLK_PROFILE(volk_16i_max_star_horizontal_16i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
+ //VOLK_PROFILE(volk_16i_permute_and_scalar_add, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
+ //VOLK_PROFILE(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
+ // we need a puppet for this one
+ //(VOLK_INIT_TEST(volk_32fc_s32f_x2_power_spectral_density_32f, test_params))
+
+ ;
+
+
+ return test_cases;
+}
--- /dev/null
+#include "qa_utils.h"
+
+#include <boost/foreach.hpp>
+#include <boost/tokenizer.hpp>
+#include <boost/lexical_cast.hpp>
+#include <boost/typeof/typeof.hpp>
+#include <boost/type_traits.hpp>
+
+#include <iostream>
+#include <cstring>
+#include <fstream>
+#include <vector>
+#include <map>
+#include <list>
+#include <ctime>
+#include <cmath>
+#include <limits>
+
+#include <volk/volk.h>
+#include <volk/volk_cpu.h>
+#include <volk/volk_common.h>
+#include <volk/volk_malloc.h>
+
+float uniform() {
+ return 2.0f * ((float) rand() / RAND_MAX - 0.5f); // uniformly (-1, 1)
+}
+
+template <class t>
+void random_floats (t *buf, unsigned n)
+{
+ for (unsigned i = 0; i < n; i++)
+ buf[i] = uniform ();
+}
+
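+// Fill a buffer with random test data for the given volk_type_t: floats roughly
+// uniform in (-1, 1), integer types scaled towards (roughly) the range of the type.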
+void load_random_data(void *data, volk_type_t type, unsigned int n) {
+ if(type.is_complex) n *= 2;
+ if(type.is_float) {
+ if(type.size == 8) random_floats<double>((double *)data, n);
+ else random_floats<float>((float *)data, n);
+ } else {
+ float int_max = float(uint64_t(2) << (type.size*8));
+ if(type.is_signed) int_max /= 2.0;
+ for(unsigned int i=0; i<n; i++) {
+ float scaled_rand = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * int_max;
+ //man i really don't know how to do this in a more clever way, you have to cast down at some point
+ switch(type.size) {
+ case 8:
+ if(type.is_signed) ((int64_t *)data)[i] = (int64_t) scaled_rand;
+ else ((uint64_t *)data)[i] = (uint64_t) scaled_rand;
+ break;
+ case 4:
+ if(type.is_signed) ((int32_t *)data)[i] = (int32_t) scaled_rand;
+ else ((uint32_t *)data)[i] = (uint32_t) scaled_rand;
+ break;
+ case 2:
+ if(type.is_signed) ((int16_t *)data)[i] = (int16_t)((int16_t) scaled_rand % 8);
+ else ((uint16_t *)data)[i] = (uint16_t) ((int16_t) scaled_rand % 8);
+ break;
+ case 1:
+ if(type.is_signed) ((int8_t *)data)[i] = (int8_t) scaled_rand;
+ else ((uint8_t *)data)[i] = (uint8_t) scaled_rand;
+ break;
+ default:
+ throw "load_random_data: no support for data size > 8 or < 1"; //no shenanigans here
+ }
+ }
+ }
+}
+
+static std::vector<std::string> get_arch_list(volk_func_desc_t desc) {
+ std::vector<std::string> archlist;
+
+ for(size_t i = 0; i < desc.n_impls; i++) {
+ archlist.push_back(std::string(desc.impl_names[i]));
+ }
+
+ return archlist;
+}
+
+volk_type_t volk_type_from_string(std::string name) {
+ volk_type_t type;
+ type.is_float = false;
+ type.is_scalar = false;
+ type.is_complex = false;
+ type.is_signed = false;
+ type.size = 0;
+ type.str = name;
+
+ if(name.size() < 2) {
+ throw std::string("name too short to be a datatype");
+ }
+
+ //is it a scalar?
+ if(name[0] == 's') {
+ type.is_scalar = true;
+ name = name.substr(1, name.size()-1);
+ }
+
+ //get the data size
+ size_t last_size_pos = name.find_last_of("0123456789");
+ if(last_size_pos == std::string::npos) {
+ throw std::string("no size spec in type ").append(name);
+ }
+ //will throw if malformed
+ int size = boost::lexical_cast<int>(name.substr(0, last_size_pos+1));
+
+ assert(((size % 8) == 0) && (size <= 64) && (size != 0));
+ type.size = size/8; //in bytes
+
+ for(size_t i=last_size_pos+1; i < name.size(); i++) {
+ switch (name[i]) {
+ case 'f':
+ type.is_float = true;
+ break;
+ case 'i':
+ type.is_signed = true;
+ break;
+ case 'c':
+ type.is_complex = true;
+ break;
+ case 'u':
+ type.is_signed = false;
+ break;
+ default:
+ throw;
+ }
+ }
+
+ return type;
+}
+
+static void get_signatures_from_name(std::vector<volk_type_t> &inputsig,
+ std::vector<volk_type_t> &outputsig,
+ std::string name) {
+ boost::char_separator<char> sep("_");
+ boost::tokenizer<boost::char_separator<char> > tok(name, sep);
+ std::vector<std::string> toked;
+ tok.assign(name);
+ toked.assign(tok.begin(), tok.end());
+
+ assert(toked[0] == "volk");
+ toked.erase(toked.begin());
+
+ //ok. we're assuming a string in the form
+ //(sig)_(multiplier-opt)_..._(name)_(sig)_(multiplier-opt)_..._(alignment)
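+ //e.g. "volk_32fc_x2_dot_prod_32fc" parses as two 32fc inputs (the "x2" multiplier
+ //duplicates the preceding type), the function name "dot_prod", and one 32fc output.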
+
+ enum { SIDE_INPUT, SIDE_NAME, SIDE_OUTPUT } side = SIDE_INPUT;
+ std::string fn_name;
+ volk_type_t type;
+ BOOST_FOREACH(std::string token, toked) {
+ try {
+ type = volk_type_from_string(token);
+ if(side == SIDE_NAME) side = SIDE_OUTPUT; //if this is the first one after the name...
+
+ if(side == SIDE_INPUT) inputsig.push_back(type);
+ else outputsig.push_back(type);
+ } catch (...){
+ if(token[0] == 'x' && (token.size() > 1) && (token[1] >= '0' && token[1] <= '9')) { //it's a multiplier
+ if(side == SIDE_INPUT) assert(inputsig.size() > 0);
+ else assert(outputsig.size() > 0);
+ int multiplier = boost::lexical_cast<int>(token.substr(1, token.size()-1)); //will throw if invalid
+ for(int i=1; i<multiplier; i++) {
+ if(side == SIDE_INPUT) inputsig.push_back(inputsig.back());
+ else outputsig.push_back(outputsig.back());
+ }
+ }
+ else if(side == SIDE_INPUT) { //it's the function name, at least it better be
+ side = SIDE_NAME;
+ fn_name.append("_");
+ fn_name.append(token);
+ }
+ else if(side == SIDE_OUTPUT) {
+ if(token != toked.back()) throw; //the last token in the name is the alignment
+ }
+ }
+ }
+ //we don't need an output signature (some fn's operate on the input data, "in place"), but we do need at least one input!
+ assert(inputsig.size() != 0);
+
+}
+
+inline void run_cast_test1(volk_fn_1arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
+ while(iter--) func(buffs[0], vlen, arch.c_str());
+}
+
+inline void run_cast_test2(volk_fn_2arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
+ while(iter--) func(buffs[0], buffs[1], vlen, arch.c_str());
+}
+
+inline void run_cast_test3(volk_fn_3arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
+ while(iter--) func(buffs[0], buffs[1], buffs[2], vlen, arch.c_str());
+}
+
+inline void run_cast_test4(volk_fn_4arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
+ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], vlen, arch.c_str());
+}
+
+inline void run_cast_test1_s32f(volk_fn_1arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+ while(iter--) func(buffs[0], scalar, vlen, arch.c_str());
+}
+
+inline void run_cast_test2_s32f(volk_fn_2arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+ while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
+}
+
+inline void run_cast_test3_s32f(volk_fn_3arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+ while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
+}
+
+inline void run_cast_test1_s32fc(volk_fn_1arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+ while(iter--) func(buffs[0], scalar, vlen, arch.c_str());
+}
+
+inline void run_cast_test2_s32fc(volk_fn_2arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+ while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
+}
+
+inline void run_cast_test3_s32fc(volk_fn_3arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+ while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
+}
+
+template <class t>
+bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) {
+ bool fail = false;
+ int print_max_errs = 10;
+ for(unsigned int i=0; i<vlen; i++) {
+ // for very small numbers we'll see round off errors due to limited
+ // precision. So a special test case...
+ if(fabs(((t *)(in1))[i]) < 1e-30) {
+ if( fabs( ((t *)(in2))[i] ) > tol )
+ {
+ fail=true;
+ if(print_max_errs-- > 0) {
+ std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]);
+ std::cout << " tolerance was: " << tol << std::endl;
+ }
+ }
+ }
+ // the primary test is the percent different greater than given tol
+ else if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/fabs(((t *)in1)[i]) > tol) {
+ fail=true;
+ if(print_max_errs-- > 0) {
+ std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]);
+ std::cout << " tolerance was: " << tol << std::endl;
+ }
+ }
+ }
+
+ return fail;
+}
+
+template <class t>
+bool ccompare(t *in1, t *in2, unsigned int vlen, float tol) {
+ bool fail = false;
+ int print_max_errs = 10;
+ for(unsigned int i=0; i<2*vlen; i+=2) {
+ t diff[2] = { in1[i] - in2[i], in1[i+1] - in2[i+1] };
+ t err = std::sqrt(diff[0] * diff[0] + diff[1] * diff[1]);
+ t norm = std::sqrt(in1[i] * in1[i] + in1[i+1] * in1[i+1]);
+
+ // for very small numbers we'll see round off errors due to limited
+ // precision. So a special test case...
+ if (norm < 1e-30) {
+ if (err > tol)
+ {
+ fail=true;
+ if(print_max_errs-- > 0) {
+ std::cout << "offset " << i/2 << " in1: " << in1[i] << " + " << in1[i+1] << "j in2: " << in2[i] << " + " << in2[i+1] << "j";
+ std::cout << " tolerance was: " << tol << std::endl;
+ }
+ }
+ }
+ // the primary test is the percent different greater than given tol
+ else if((err / norm) > tol) {
+ fail=true;
+ if(print_max_errs-- > 0) {
+ std::cout << "offset " << i/2 << " in1: " << in1[i] << " + " << in1[i+1] << "j in2: " << in2[i] << " + " << in2[i+1] << "j";
+ std::cout << " tolerance was: " << tol << std::endl;
+ }
+ }
+ }
+
+ return fail;
+}
+
+template <class t>
+bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol) {
+ bool fail = false;
+ int print_max_errs = 10;
+ for(unsigned int i=0; i<vlen; i++) {
+ if(((unsigned int)abs(int(((t *)(in1))[i]) - int(((t *)(in2))[i]))) > tol) {
+ fail=true;
+ if(print_max_errs-- > 0) {
+ std::cout << "offset " << i << " in1: " << static_cast<int>(t(((t *)(in1))[i])) << " in2: " << static_cast<int>(t(((t *)(in2))[i]));
+ std::cout << " tolerance was: " << tol << std::endl;
+ }
+ }
+ }
+
+ return fail;
+}
+
+class volk_qa_aligned_mem_pool{
+public:
+ void *get_new(size_t size){
+ size_t alignment = volk_get_alignment();
+ void* ptr = volk_malloc(size, alignment);
+ memset(ptr, 0x00, size);
+ _mems.push_back(ptr);
+ return ptr;
+ }
+ ~volk_qa_aligned_mem_pool() {
+ for(unsigned int ii = 0; ii < _mems.size(); ++ii) {
+ volk_free(_mems[ii]);
+ }
+ }
+private: std::vector<void * > _mems;
+};
+
+bool run_volk_tests(volk_func_desc_t desc,
+ void (*manual_func)(),
+ std::string name,
+ volk_test_params_t test_params,
+ std::vector<volk_test_results_t> *results,
+ std::string puppet_master_name
+)
+{
+ return run_volk_tests(desc, manual_func, name, test_params.tol(), test_params.scalar(),
+ test_params.vlen(), test_params.iter(), results, puppet_master_name,
+ test_params.benchmark_mode());
+}
+
+bool run_volk_tests(volk_func_desc_t desc,
+ void (*manual_func)(),
+ std::string name,
+ float tol,
+ lv_32fc_t scalar,
+ unsigned int vlen,
+ unsigned int iter,
+ std::vector<volk_test_results_t> *results,
+ std::string puppet_master_name,
+ bool benchmark_mode
+) {
+ // Initialize this entry in results vector
+ results->push_back(volk_test_results_t());
+ results->back().name = name;
+ results->back().vlen = vlen;
+ results->back().iter = iter;
+ std::cout << "RUN_VOLK_TESTS: " << name << "(" << vlen << "," << iter << ")" << std::endl;
+
+ // vlen_twiddle will increase vlen for the malloc and data generation,
+ // but the kernels will still be called with the user-provided vlen.
+ // This is useful for catching kernels that read or write beyond vlen.
+ const unsigned int vlen_twiddle = 5;
+ vlen = vlen + vlen_twiddle;
+
+ const float tol_f = tol;
+ const unsigned int tol_i = static_cast<unsigned int>(tol);
+
+ //first let's get a list of available architectures for the test
+ std::vector<std::string> arch_list = get_arch_list(desc);
+
+ if((!benchmark_mode) && (arch_list.size() < 2)) {
+ std::cout << "no architectures to test" << std::endl;
+ return false;
+ }
+
+ //something that can hang onto memory and cleanup when this function exits
+ volk_qa_aligned_mem_pool mem_pool;
+
+ //now we have to get a function signature by parsing the name
+ std::vector<volk_type_t> inputsig, outputsig;
+ try {
+ get_signatures_from_name(inputsig, outputsig, name);
+ }
+ catch (boost::bad_lexical_cast& error) {
+ std::cerr << "Error: unable to get function signature from kernel name" << std::endl;
+ std::cerr << " - " << name << std::endl;
+ return false;
+ }
+
+ //pull the input scalars into their own vector
+ std::vector<volk_type_t> inputsc;
+ for(size_t i=0; i<inputsig.size(); i++) {
+ if(inputsig[i].is_scalar) {
+ inputsc.push_back(inputsig[i]);
+ inputsig.erase(inputsig.begin() + i);
+ i -= 1;
+ }
+ }
+ std::vector<void *> inbuffs;
+ BOOST_FOREACH(volk_type_t sig, inputsig) {
+ if(!sig.is_scalar) //we don't make buffers for scalars
+ inbuffs.push_back(mem_pool.get_new(vlen*sig.size*(sig.is_complex ? 2 : 1)));
+ }
+ for(size_t i=0; i<inbuffs.size(); i++) {
+ load_random_data(inbuffs[i], inputsig[i], vlen);
+ }
+
+ //ok let's make a vector of vector of void buffers, which holds the input/output vectors for each arch
+ std::vector<std::vector<void *> > test_data;
+ for(size_t i=0; i<arch_list.size(); i++) {
+ std::vector<void *> arch_buffs;
+ for(size_t j=0; j<outputsig.size(); j++) {
+ arch_buffs.push_back(mem_pool.get_new(vlen*outputsig[j].size*(outputsig[j].is_complex ? 2 : 1)));
+ }
+ for(size_t j=0; j<inputsig.size(); j++) {
+ void *arch_inbuff = mem_pool.get_new(vlen*inputsig[j].size*(inputsig[j].is_complex ? 2 : 1));
+ memcpy(arch_inbuff, inbuffs[j], vlen * inputsig[j].size * (inputsig[j].is_complex ? 2 : 1));
+ arch_buffs.push_back(arch_inbuff);
+ }
+ test_data.push_back(arch_buffs);
+ }
+
+ std::vector<volk_type_t> both_sigs;
+ both_sigs.insert(both_sigs.end(), outputsig.begin(), outputsig.end());
+ both_sigs.insert(both_sigs.end(), inputsig.begin(), inputsig.end());
+
+ //now run the test
+ vlen = vlen - vlen_twiddle;
+ clock_t start, end;
+ std::vector<double> profile_times;
+ for(size_t i = 0; i < arch_list.size(); i++) {
+ start = clock();
+
+ switch(both_sigs.size()) {
+ case 1:
+ if(inputsc.size() == 0) {
+ run_cast_test1((volk_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+ } else if(inputsc.size() == 1 && inputsc[0].is_float) {
+ if(inputsc[0].is_complex) {
+ run_cast_test1_s32fc((volk_fn_1arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+ } else {
+ run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+ }
+ } else throw "unsupported 1 arg function >1 scalars";
+ break;
+ case 2:
+ if(inputsc.size() == 0) {
+ run_cast_test2((volk_fn_2arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+ } else if(inputsc.size() == 1 && inputsc[0].is_float) {
+ if(inputsc[0].is_complex) {
+ run_cast_test2_s32fc((volk_fn_2arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+ } else {
+ run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+ }
+ } else throw "unsupported 2 arg function >1 scalars";
+ break;
+ case 3:
+ if(inputsc.size() == 0) {
+ run_cast_test3((volk_fn_3arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+ } else if(inputsc.size() == 1 && inputsc[0].is_float) {
+ if(inputsc[0].is_complex) {
+ run_cast_test3_s32fc((volk_fn_3arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+ } else {
+ run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+ }
+ } else throw "unsupported 3 arg function >1 scalars";
+ break;
+ case 4:
+ run_cast_test4((volk_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+ break;
+ default:
+ throw "no function handler for this signature";
+ break;
+ }
+
+ end = clock();
+ double arch_time = 1000.0 * (double)(end-start)/(double)CLOCKS_PER_SEC;
+ std::cout << arch_list[i] << " completed in " << arch_time << "ms" << std::endl;
+ volk_test_time_t result;
+ result.name = arch_list[i];
+ result.time = arch_time;
+ result.units = "ms";
+ result.pass = true;
+ results->back().results[result.name] = result;
+
+ profile_times.push_back(arch_time);
+ }
+
+ //and now compare each output to the generic output
+ //first we have to know which output is the generic one, they aren't in order...
+ size_t generic_offset=0;
+ for(size_t i=0; i<arch_list.size(); i++) {
+ if (arch_list[i] == "generic") {
+ generic_offset = i;
+ }
+ }
+
+ // Just in case a kernel wrote to OOB memory, use the twiddled vlen
+ vlen = vlen + vlen_twiddle;
+ bool fail;
+ bool fail_global = false;
+ std::vector<bool> arch_results;
+ for(size_t i=0; i<arch_list.size(); i++) {
+ fail = false;
+ if(i != generic_offset) {
+ for(size_t j=0; j<both_sigs.size(); j++) {
+ if(both_sigs[j].is_float) {
+ if(both_sigs[j].size == 8) {
+ if (both_sigs[j].is_complex) {
+ fail = ccompare((double *) test_data[generic_offset][j], (double *) test_data[i][j], vlen, tol_f);
+ } else {
+ fail = fcompare((double *) test_data[generic_offset][j], (double *) test_data[i][j], vlen, tol_f);
+ }
+ } else {
+ if (both_sigs[j].is_complex) {
+ fail = ccompare((float *) test_data[generic_offset][j], (float *) test_data[i][j], vlen, tol_f);
+ } else {
+ fail = fcompare((float *) test_data[generic_offset][j], (float *) test_data[i][j], vlen, tol_f);
+ }
+ }
+ } else {
+ //i could replace this whole switch statement with a memcmp if i wasn't interested in printing the outputs where they differ
+ switch(both_sigs[j].size) {
+ case 8:
+ if(both_sigs[j].is_signed) {
+ fail = icompare((int64_t *) test_data[generic_offset][j], (int64_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i);
+ } else {
+ fail = icompare((uint64_t *) test_data[generic_offset][j], (uint64_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i);
+ }
+ break;
+ case 4:
+ if(both_sigs[j].is_complex) {
+ if(both_sigs[j].is_signed) {
+ fail = icompare((int16_t *) test_data[generic_offset][j], (int16_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i);
+ } else {
+ fail = icompare((uint16_t *) test_data[generic_offset][j], (uint16_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i);
+ }
+ }
+ else {
+ if (both_sigs[j].is_signed) {
+ fail = icompare((int32_t *) test_data[generic_offset][j], (int32_t *) test_data[i][j],
+ vlen * (both_sigs[j].is_complex ? 2 : 1), tol_i);
+ } else {
+ fail = icompare((uint32_t *) test_data[generic_offset][j], (uint32_t *) test_data[i][j],
+ vlen * (both_sigs[j].is_complex ? 2 : 1), tol_i);
+ }
+ }
+ break;
+ case 2:
+ if(both_sigs[j].is_signed) {
+ fail = icompare((int16_t *) test_data[generic_offset][j], (int16_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i);
+ } else {
+ fail = icompare((uint16_t *) test_data[generic_offset][j], (uint16_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i);
+ }
+ break;
+ case 1:
+ if(both_sigs[j].is_signed) {
+ fail = icompare((int8_t *) test_data[generic_offset][j], (int8_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i);
+ } else {
+ fail = icompare((uint8_t *) test_data[generic_offset][j], (uint8_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i);
+ }
+ break;
+ default:
+ fail = true;
+ }
+ }
+ if(fail) {
+ volk_test_time_t *result = &results->back().results[arch_list[i]];
+ result->pass = !fail;
+ fail_global = true;
+ std::cout << name << ": fail on arch " << arch_list[i] << std::endl;
+ }
+ }
+ }
+ arch_results.push_back(!fail);
+ }
+
+ double best_time_a = std::numeric_limits<double>::max();
+ double best_time_u = std::numeric_limits<double>::max();
+ std::string best_arch_a = "generic";
+ std::string best_arch_u = "generic";
+ for(size_t i=0; i < arch_list.size(); i++)
+ {
+ if((profile_times[i] < best_time_u) && arch_results[i] && desc.impl_alignment[i] == 0)
+ {
+ best_time_u = profile_times[i];
+ best_arch_u = arch_list[i];
+ }
+ if((profile_times[i] < best_time_a) && arch_results[i])
+ {
+ best_time_a = profile_times[i];
+ best_arch_a = arch_list[i];
+ }
+ }
+
+ std::cout << "Best aligned arch: " << best_arch_a << std::endl;
+ std::cout << "Best unaligned arch: " << best_arch_u << std::endl;
+
+ if(puppet_master_name == "NULL") {
+ results->back().config_name = name;
+ } else {
+ results->back().config_name = puppet_master_name;
+ }
+ results->back().best_arch_a = best_arch_a;
+ results->back().best_arch_u = best_arch_u;
+
+ return fail_global;
+}
--- /dev/null
+#ifndef VOLK_QA_UTILS_H
+#define VOLK_QA_UTILS_H
+
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <map>
+#include <volk/volk.h>
+#include <volk/volk_common.h>
+
+/************************************************
+ * VOLK QA type definitions *
+ ************************************************/
+struct volk_type_t {
+ bool is_float;
+ bool is_scalar;
+ bool is_signed;
+ bool is_complex;
+ int size;
+ std::string str;
+};
+
+class volk_test_time_t {
+ public:
+ std::string name;
+ double time;
+ std::string units;
+ bool pass;
+};
+
+class volk_test_results_t {
+ public:
+ std::string name;
+ std::string config_name;
+ unsigned int vlen;
+ unsigned int iter;
+ std::map<std::string, volk_test_time_t> results;
+ std::string best_arch_a;
+ std::string best_arch_u;
+};
+
+class volk_test_params_t {
+ private:
+ float _tol;
+ lv_32fc_t _scalar;
+ unsigned int _vlen;
+ unsigned int _iter;
+ bool _benchmark_mode;
+ std::string _kernel_regex;
+ public:
+ // ctor
+ volk_test_params_t(float tol, lv_32fc_t scalar, unsigned int vlen, unsigned int iter,
+ bool benchmark_mode, std::string kernel_regex) :
+ _tol(tol), _scalar(scalar), _vlen(vlen), _iter(iter),
+ _benchmark_mode(benchmark_mode), _kernel_regex(kernel_regex) {};
+ // getters
+ float tol() {return _tol;};
+ lv_32fc_t scalar() {return _scalar;};
+ unsigned int vlen() {return _vlen;};
+ unsigned int iter() {return _iter;};
+ bool benchmark_mode() {return _benchmark_mode;};
+ std::string kernel_regex() {return _kernel_regex;};
+};
+
+class volk_test_case_t {
+ private:
+ volk_func_desc_t _desc;
+ void(*_kernel_ptr)();
+ std::string _name;
+ volk_test_params_t _test_parameters;
+ std::string _puppet_master_name;
+ public:
+ volk_func_desc_t desc() {return _desc;};
+ void (*kernel_ptr()) () {return _kernel_ptr;};
+ std::string name() {return _name;};
+ std::string puppet_master_name() {return _puppet_master_name;};
+ volk_test_params_t test_parameters() {return _test_parameters;};
+ // normal ctor
+ volk_test_case_t(volk_func_desc_t desc, void(*kernel_ptr)(), std::string name,
+ volk_test_params_t test_parameters) :
+ _desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters),
+ _puppet_master_name("NULL")
+ {};
+ // ctor for puppets
+ volk_test_case_t(volk_func_desc_t desc, void(*kernel_ptr)(), std::string name,
+ std::string puppet_master_name, volk_test_params_t test_parameters) :
+ _desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters),
+ _puppet_master_name(puppet_master_name)
+ {};
+};
+
+/************************************************
+ * VOLK QA functions *
+ ************************************************/
+volk_type_t volk_type_from_string(std::string);
+
+float uniform(void);
+void random_floats(float *buf, unsigned n);
+
+bool run_volk_tests(
+ volk_func_desc_t,
+ void(*)(),
+ std::string,
+ volk_test_params_t,
+ std::vector<volk_test_results_t> *results = NULL,
+ std::string puppet_master_name = "NULL"
+ );
+
+bool run_volk_tests(
+ volk_func_desc_t,
+ void(*)(),
+ std::string,
+ float,
+ lv_32fc_t,
+ unsigned int,
+ unsigned int,
+ std::vector<volk_test_results_t> *results = NULL,
+ std::string puppet_master_name = "NULL",
+ bool benchmark_mode = false
+);
+
+
+#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) \
+ BOOST_AUTO_TEST_CASE(func##_test) { \
+ BOOST_CHECK_EQUAL(run_volk_tests( \
+ func##_get_func_desc(), (void (*)())func##_manual, \
+ std::string(#func), tol, scalar, len, iter, 0, "NULL"), \
+ 0); \
+ }
+#define VOLK_PROFILE(func, test_params, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), test_params, results, "NULL")
+#define VOLK_PUPPET_PROFILE(func, puppet_master_func, test_params, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), test_params, results, std::string(#puppet_master_func))
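+// Illustrative usage (kernel name is an example): with a
+// std::vector<volk_test_results_t> *results in scope,
+//   VOLK_PROFILE(volk_32f_x2_add_32f, test_params, results);
+// expands to a run_volk_tests() call on that kernel's manual dispatcher.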
+typedef void (*volk_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place
+typedef void (*volk_fn_2arg)(void *, void *, unsigned int, const char*);
+typedef void (*volk_fn_3arg)(void *, void *, void *, unsigned int, const char*);
+typedef void (*volk_fn_4arg)(void *, void *, void *, void *, unsigned int, const char*);
+typedef void (*volk_fn_1arg_s32f)(void *, float, unsigned int, const char*); //one input vector, one scalar float input
+typedef void (*volk_fn_2arg_s32f)(void *, void *, float, unsigned int, const char*);
+typedef void (*volk_fn_3arg_s32f)(void *, void *, void *, float, unsigned int, const char*);
+typedef void (*volk_fn_1arg_s32fc)(void *, lv_32fc_t, unsigned int, const char*); //one input vector, one scalar complex float (lv_32fc_t) input
+typedef void (*volk_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char*);
+typedef void (*volk_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char*);
+
+#endif //VOLK_QA_UTILS_H
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012-2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#include "qa_utils.h"
+#include "kernel_tests.h"
+
+#include <volk/volk.h>
+
+#include <vector>
+#include <utility>
+#include <iostream>
+#include <fstream>
+
+void print_qa_xml(std::vector<volk_test_results_t> results, unsigned int nfails);
+
+int main()
+{
+ int qa_ret_val = 0;
+
+ float def_tol = 1e-6;
+ lv_32fc_t def_scalar = 327.0;
+ int def_iter = 1;
+ int def_vlen = 131071;
+ bool def_benchmark_mode = true;
+ std::string def_kernel_regex = "";
+
+ volk_test_params_t test_params(def_tol, def_scalar, def_vlen, def_iter,
+ def_benchmark_mode, def_kernel_regex);
+ std::vector<volk_test_case_t> test_cases = init_test_list(test_params);
+
+ std::vector<std::string> qa_failures;
+ std::vector<volk_test_results_t> results;
+ // Test every kernel reporting failures when they occur
+ for(unsigned int ii = 0; ii < test_cases.size(); ++ii) {
+ bool qa_result = false;
+ volk_test_case_t test_case = test_cases[ii];
+ try {
+ qa_result = run_volk_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(),
+ test_case.test_parameters(), &results, test_case.puppet_master_name());
+ }
+ catch(...) {
+ // TODO: what exceptions might we need to catch and how do we handle them?
+ std::cerr << "Exception found on kernel: " << test_case.name() << std::endl;
+ qa_result = false;
+ }
+
+ if(qa_result) {
+ std::cerr << "Failure on " << test_case.name() << std::endl;
+ qa_failures.push_back(test_case.name());
+ }
+ }
+
+ // Generate XML results
+ print_qa_xml(results, qa_failures.size());
+
+ // Summarize QA results
+ std::cerr << "Kernel QA finished: " << qa_failures.size() << " failures out of "
+ << test_cases.size() << " tests." << std::endl;
+ if(qa_failures.size() > 0) {
+ std::cerr << "The following kernels failed QA:" << std::endl;
+ for(unsigned int ii = 0; ii < qa_failures.size(); ++ii) {
+ std::cerr << " " << qa_failures[ii] << std::endl;
+ }
+ qa_ret_val = 1;
+ }
+
+ return qa_ret_val;
+}
+
+/*
+ * This function prints qa results as XML output similar to output
+ * from Junit. For reference output see http://llg.cubic.org/docs/junit/
+ */
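+/*
+ * Illustrative fragment of the generated file (kernel and arch names are
+ * examples only):
+ *
+ *   <testsuites name="kernels" tests="1" failures="0" id="1">
+ *     <testsuite name="volk_32f_x2_add_32f">
+ *       <testcase name="generic" classname="volk_32f_x2_add_32f" time="1.52">
+ *       </testcase>
+ *     </testsuite>
+ *   </testsuites>
+ */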
+void print_qa_xml(std::vector<volk_test_results_t> results, unsigned int nfails)
+{
+ std::ofstream qa_file;
+ qa_file.open(".unittest/kernels.xml");
+
+ qa_file << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" << std::endl;
+ qa_file << "<testsuites name=\"kernels\" " <<
+ "tests=\"" << results.size() << "\" " <<
+ "failures=\"" << nfails << "\" id=\"1\">" << std::endl;
+
+ // Results are in a vector by kernel. Each element has a result
+ // map containing time and arch name with test result
+ for(unsigned int ii=0; ii < results.size(); ++ii) {
+ volk_test_results_t result = results[ii];
+ qa_file << " <testsuite name=\"" << result.name << "\">" << std::endl;
+
+ std::map<std::string, volk_test_time_t>::iterator kernel_time_pair;
+ for(kernel_time_pair = result.results.begin(); kernel_time_pair != result.results.end(); ++kernel_time_pair) {
+ volk_test_time_t test_time = kernel_time_pair->second;
+ qa_file << " <testcase name=\"" << test_time.name << "\" " <<
+ "classname=\"" << result.name << "\" " <<
+ "time=\"" << test_time.time << "\">" << std::endl;
+ if(!test_time.pass)
+ qa_file << " <failure " <<
+ "message=\"fail on arch " << test_time.name << "\">" <<
+ "</failure>" << std::endl;
+ qa_file << " </testcase>" << std::endl;
+ }
+ qa_file << " </testsuite>" << std::endl;
+ }
+
+
+ qa_file << "</testsuites>" << std::endl;
+ qa_file.close();
+
+}
--- /dev/null
+/* -*- c -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#include <volk/volk_malloc.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * For #defines used to determine support for allocation functions,
+ * see: http://linux.die.net/man/3/aligned_alloc
+*/
+
+// Test whether we are on a POSIX or X/Open system.
+// This only has a restriction that alignment be a power of 2 and a
+// multiple of sizeof(void *).
+#if _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN
+
+void *volk_malloc(size_t size, size_t alignment)
+{
+ void *ptr;
+
+ // quoting posix_memalign() man page:
+ // "alignment must be a power of two and a multiple of sizeof(void *)"
+ // volk_get_alignment() could return 1 for some machines (e.g. generic_orc)
+ if (alignment == 1)
+ return malloc(size);
+
+ int err = posix_memalign(&ptr, alignment, size);
+ if(err == 0) {
+ return ptr;
+ }
+ else {
+ fprintf(stderr,
+ "VOLK: Error allocating memory "
+ "(posix_memalign: error %d: %s)\n", err, strerror(err));
+ return NULL;
+ }
+}
+
+void volk_free(void *ptr)
+{
+ free(ptr);
+}
+
+// _aligned_malloc has no restriction on size,
+// available on Windows since Visual C++ 2005
+#elif _MSC_VER >= 1400
+
+void *volk_malloc(size_t size, size_t alignment)
+{
+ void *ptr = _aligned_malloc(size, alignment);
+ if(ptr == NULL) {
+ fprintf(stderr, "VOLK: Error allocating memory (_aligned_malloc)\n");
+ }
+ return ptr;
+}
+
+void volk_free(void *ptr)
+{
+ _aligned_free(ptr);
+}
+
+// No standard handlers; we'll do it ourselves.
+#else // _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN
+
+struct block_info
+{
+ void *real;
+};
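+/*
+ * Sketch of the memory layout produced by the fallback allocator below:
+ *
+ *   real ---> [ padding ... ][ struct block_info ][ user data (aligned) ]
+ *                             info->real == real   ^-- pointer returned
+ *
+ * volk_free() steps back sizeof(struct block_info) from the user pointer
+ * to recover the original malloc'd pointer.
+ */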
+
+void *
+volk_malloc(size_t size, size_t alignment)
+{
+ void *real, *user;
+ struct block_info *info;
+
+ /* At least align to sizeof our struct */
+ if (alignment < sizeof(struct block_info))
+ alignment = sizeof(struct block_info);
+
+ /* Alloc */
+ real = malloc(size + (2 * alignment - 1));
+
+ /* Get pointer to the various zones */
+ user = (void *)((((uintptr_t) real) + sizeof(struct block_info) + alignment - 1) & ~(alignment - 1));
+ info = (struct block_info *)(((uintptr_t)user) - sizeof(struct block_info));
+
+ /* Store the info for the free */
+ info->real = real;
+
+ /* Return pointer to user */
+ return user;
+}
+
+void
+volk_free(void *ptr)
+{
+ struct block_info *info;
+
+ /* Get the real pointer */
+ info = (struct block_info *)(((uintptr_t)ptr) - sizeof(struct block_info));
+
+ /* Release real pointer */
+ free(info->real);
+}
+
+#endif // _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN
+
+//#endif // _ISOC11_SOURCE
--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <volk/volk_prefs.h>
+
+void volk_get_config_path(char *path)
+{
+ if (!path) return;
+
+ const char *suffix = "/.volk/volk_config";
+ const char *suffix2 = "/volk/volk_config"; //non-hidden
+ char *home = NULL;
+
+ //allows config redirection via env variable
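+ //e.g. (illustrative) VOLK_CONFIGPATH=/etc/xdg resolves to /etc/xdg/volk/volk_config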
+ home = getenv("VOLK_CONFIGPATH");
+ if(home!=NULL){
+ strncpy(path,home,512);
+ strcat(path,suffix2);
+ return;
+ }
+
+ if (home == NULL) home = getenv("HOME");
+ if (home == NULL) home = getenv("APPDATA");
+ if (home == NULL){
+ path[0] = 0;
+ return;
+ }
+
+ strncpy(path, home, 512);
+ strcat(path, suffix);
+}
+
+size_t volk_load_preferences(volk_arch_pref_t **prefs_res)
+{
+ FILE *config_file;
+ char path[512], line[512];
+ size_t n_arch_prefs = 0;
+ volk_arch_pref_t *prefs = NULL;
+
+ //get the config path
+ volk_get_config_path(path);
+ if (!path[0]) return n_arch_prefs; //no prefs found
+ config_file = fopen(path, "r");
+ if(!config_file) return n_arch_prefs; //no prefs found
+
+ //read each config line and append valid entries to the prefs array
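+ //each line is expected to be "<kernel name> <aligned impl> <unaligned impl>",
+ //for example (illustrative): volk_32f_x2_add_32f avx generic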
+ while(fgets(line, sizeof(line), config_file) != NULL)
+ {
+ prefs = (volk_arch_pref_t *) realloc(prefs, (n_arch_prefs+1) * sizeof(*prefs));
+ volk_arch_pref_t *p = prefs + n_arch_prefs;
+ if(sscanf(line, "%s %s %s", p->name, p->impl_a, p->impl_u) == 3 && !strncmp(p->name, "volk_", 5))
+ {
+ n_arch_prefs++;
+ }
+ }
+ fclose(config_file);
+ *prefs_res = prefs;
+ return n_arch_prefs;
+}
--- /dev/null
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#include <volk_rank_archs.h>
+#include <volk/volk_prefs.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+int volk_get_index(
+ const char *impl_names[], //list of implementations by name
+ const size_t n_impls, //number of implementations available
+ const char *impl_name //the implementation name to find
+){
+ unsigned int i;
+ for (i = 0; i < n_impls; i++) {
+ if(!strncmp(impl_names[i], impl_name, 20)) {
+ return i;
+ }
+ }
+ //TODO return -1;
+ //something terrible should happen here
+ fprintf(stderr, "Volk warning: no arch found, returning generic impl\n");
+ return volk_get_index(impl_names, n_impls, "generic"); //but we'll fake it for now
+}
+
+int volk_rank_archs(
+ const char *kern_name, //name of the kernel to rank
+ const char *impl_names[], //list of implementations by name
+ const int* impl_deps, //requirement mask per implementation
+ const bool* alignment, //alignment status of each implementation
+ size_t n_impls, //number of implementations available
+ const bool align //if false, filter aligned implementations
+)
+{
+ size_t i;
+ static volk_arch_pref_t *volk_arch_prefs;
+ static size_t n_arch_prefs = 0;
+ static int prefs_loaded = 0;
+ if(!prefs_loaded) {
+ n_arch_prefs = volk_load_preferences(&volk_arch_prefs);
+ prefs_loaded = 1;
+ }
+
+ // If we've defined VOLK_GENERIC to be anything, always return the
+ // 'generic' kernel. Used in GR's QA code.
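+ // e.g. (illustrative) running an application with VOLK_GENERIC=1 set in the
+ // environment forces every kernel dispatched through here to use "generic".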
+ char *gen_env = getenv("VOLK_GENERIC");
+ if(gen_env) {
+ return volk_get_index(impl_names, n_impls, "generic");
+ }
+
+ //now look for the function name in the prefs list
+ for(i = 0; i < n_arch_prefs; i++)
+ {
+ if(!strncmp(kern_name, volk_arch_prefs[i].name, sizeof(volk_arch_prefs[i].name))) //found it
+ {
+ const char *impl_name = align? volk_arch_prefs[i].impl_a : volk_arch_prefs[i].impl_u;
+ return volk_get_index(impl_names, n_impls, impl_name);
+ }
+ }
+
+ //return the best index with the largest deps
+ size_t best_index_a = 0;
+ size_t best_index_u = 0;
+ int best_value_a = -1;
+ int best_value_u = -1;
+ for(i = 0; i < n_impls; i++)
+ {
+ const signed val = impl_deps[i];
+ if (alignment[i] && val > best_value_a)
+ {
+ best_index_a = i;
+ best_value_a = val;
+ }
+ if (!alignment[i] && val > best_value_u)
+ {
+ best_index_u = i;
+ best_value_u = val;
+ }
+ }
+
+ //when align and we found a best aligned, use it
+ if (align && best_value_a != -1) return best_index_a;
+
+ //otherwise return the best unaligned
+ return best_index_u;
+}
--- /dev/null
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef INCLUDED_VOLK_RANK_ARCHS_H
+#define INCLUDED_VOLK_RANK_ARCHS_H
+
+#include <stdlib.h>
+#include <stdbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int volk_get_index(
+ const char *impl_names[], //list of implementations by name
+ const size_t n_impls, //number of implementations available
+ const char *impl_name //the implementation name to find
+);
+
+int volk_rank_archs(
+ const char *kern_name, //name of the kernel to rank
+ const char *impl_names[], //list of implementations by name
+ const int* impl_deps, //requirement mask per implementation
+ const bool* alignment, //alignment status of each implementation
+ size_t n_impls, //number of implementations available
+ const bool align //if false, filter aligned implementations
+);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /*INCLUDED_VOLK_RANK_ARCHS_H*/
--- /dev/null
+# Copyright 2013, 2014 Free Software Foundation, Inc.
+#
+# This file is part of GNU Radio
+#
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING. If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+
+########################################################################
+# Install python files and apps
+########################################################################
+include(VolkPython)
+
+VOLK_PYTHON_INSTALL(
+ FILES
+ __init__.py
+ cfg.py
+ volk_modtool_generate.py
+ DESTINATION ${VOLK_PYTHON_DIR}/volk_modtool
+ COMPONENT "volk"
+)
+
+VOLK_PYTHON_INSTALL(
+ PROGRAMS
+ volk_modtool
+ DESTINATION ${VOLK_RUNTIME_DIR}
+ COMPONENT "volk"
+)
--- /dev/null
+The volk_modtool tool is installed along with VOLK as a way of helping
+to construct, add to, and interrogate the VOLK library or companion
+libraries.
+
+volk_modtool is installed into $prefix/bin.
+
+VOLK modtool enables creating standalone (out-of-tree) VOLK modules
+and provides a few tools for sharing VOLK kernels between VOLK
+modules. If you need to design or work with VOLK kernels away from
+the canonical VOLK library, this is the tool. If you need to tailor
+your own VOLK library for whatever reason, this is the tool.
+
+The canonical VOLK library installs a volk.h and a libvolk.so. Your
+own library will install volk_$name.h and libvolk_$name.so. Ya Gronk?
+Good.
+
+There isn't a substantial difference between the canonical VOLK
+module and any other VOLK module. They're all peers. Any module
+created via VOLK modtool will come complete with a default
+volk_modtool.cfg file associating the module with the base from which
+it came, its distinctive $name and its destination (or path). These
+values (created from user input if VOLK modtool runs without a
+user-supplied config file or a default config file) serve as default
+values for some VOLK modtool actions. It's more or less intended for
+the user to change directories to the top level of a created VOLK
+module and then run volk_modtool to take advantage of the values
+stored in the default volk_modtool.cfg file.
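+
+An illustrative volk_modtool.cfg (the name and paths below are examples
+only; the real values come from your answers when the module is created):
+
+ [config]
+ name = mymod
+ destination = /home/user/src
+ base = /home/user/src/volk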
+
+Apart from creating new VOLK modules, VOLK modtool allows you to list
+the names of kernels in other modules, list the names of kernels in
+the current module, add kernels from another module into the current
+module, and remove kernels from the current module. When moving
+kernels between modules, VOLK modtool does its best to keep the qa
+and profiling code for those kernels intact. If the base has a test
+or a profiling call for some kernel, those calls will follow the
+kernel when VOLK modtool adds that kernel. If QA or profiling
+requires a puppet kernel, the puppet kernel will follow the original
+kernel when VOLK modtool adds that original kernel. VOLK modtool
+respects puppets.
+
+======================================================================
+
+Installing a new VOLK Library:
+
+Run the command "volk_modtool -i". This will ask you three questions:
+
+ name: // the name to give your VOLK library: volk_<name>
+ destination: // directory the new source tree is built under -- must exist.
+ // It will create <directory>/volk_<name>
+ base: // the directory containing the original VOLK source code
+
+The name provided must be alphanumeric (and cannot start with a
+number). No special characters, including dashes and underscores, are
+allowed.
+
+This will build a new skeleton directory in the destination provided
+with the name volk_<name>. It will contain the necessary structure to
+build:
+
+ mkdir build
+ cd build
+ cmake -DCMAKE_INSTALL_PREFIX=/opt/volk ../
+ make
+ sudo make install
+
+Right now, the library is empty and contains no kernels. Kernels can
+be added from another VOLK library using the '-a' option. Unless a
+different library is specified with the '-b' option, the kernel will
+be extracted from the base VOLK directory.
+
+ volk_modtool -a -n 32fc_x2_conjugate_dot_prod_32fc
+
+This will put the code for the new kernel into
+<destination>/volk_<name>/kernels/volk_<name>/
+
+Other kernels must be added by hand. See the following webpages for
+more information about creating VOLK kernels:
+ http://gnuradio.org/doc/doxygen/volk_guide.html
+ http://gnuradio.org/redmine/projects/gnuradio/wiki/Volk
+
+
+======================================================================
+
+OPTIONS
+
+Options for Adding and Removing Kernels:
+ -a, --add_kernel
+ Add kernel from existing VOLK module. Uses the base VOLK module
+ unless -b is used. Use -n to specify the kernel name.
+ Requires: -n.
+ Optional: -b
+
+ -A, --add_all_kernels
+ Add all kernels from existing VOLK module. Uses the base VOLK
+ module unless -b is used.
+ Optional: -b
+
+ -x, --remove_kernel
+ Remove kernel from module.
+ Required: -n.
+ Optional: -b
+
+Options for Listing Kernels:
+ -l, --list
+ Lists all kernels available in the base VOLK module.
+
+ -k, --kernels
+ Lists all kernels in this VOLK module.
+
+ -r, --remote_list
+ Lists all kernels in another VOLK module that is specified
+ using the -b option.
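+
+Example (paths are illustrative): list the kernels in another module and
+then pull one of them into the current module:
+
+ volk_modtool -r -b /path/to/volk_foo
+ volk_modtool -a -n 32fc_x2_conjugate_dot_prod_32fc -b /path/to/volk_foo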
+
--- /dev/null
+#!/usr/bin/env python
+#
+# Copyright 2013, 2014 Free Software Foundation, Inc.
+#
+# This file is part of GNU Radio
+#
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING. If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+#
+
+from cfg import volk_modtool_config
+from volk_modtool_generate import volk_modtool
--- /dev/null
+#!/usr/bin/env python
+#
+# Copyright 2013, 2014 Free Software Foundation, Inc.
+#
+# This file is part of GNU Radio
+#
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING. If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+#
+
+import ConfigParser
+import sys
+import os
+import exceptions
+import re
+
+
+class volk_modtool_config:
+ def key_val_sub(self, num, stuff, section):
+ return re.sub('\$' + 'k' + str(num), stuff[num][0], (re.sub('\$' + str(num), stuff[num][1], section[1][num])));
+
+ def verify(self):
+ for i in self.verification:
+ self.verify_section(i)
+ def remap(self):
+ for i in self.remapification:
+ self.verify_section(i)
+
+ def verify_section(self, section):
+ stuff = self.cfg.items(section[0])
+ for i in range(len(section[1])):
+ try:
+ val = eval(self.key_val_sub(i, stuff, section))
+ if not val: # re.match returns None (not False) on failure, so test falsiness
+ raise exceptions.ValueError
+ except ValueError:
+ raise exceptions.ValueError('Verification function returns False... key:%s, val:%s'%(stuff[i][0], stuff[i][1]))
+ except:
+ raise exceptions.IOError('bad configuration... key:%s, val:%s'%(stuff[i][0], stuff[i][1]))
+
+
+ def __init__(self, cfg=None):
+ self.config_name = 'config'
+ self.config_defaults = ['name', 'destination', 'base']
+ self.config_defaults_remap = ['1',
+ 'self.cfg.set(self.config_name, \'$k1\', os.path.realpath(os.path.expanduser(\'$1\')))',
+ 'self.cfg.set(self.config_name, \'$k2\', os.path.realpath(os.path.expanduser(\'$2\')))']
+
+ self.config_defaults_verify = ['re.match(\'[a-zA-Z0-9]+$\', \'$0\')',
+ 'os.path.exists(\'$1\')',
+ 'os.path.exists(\'$2\')']
+ self.remapification = [(self.config_name, self.config_defaults_remap)]
+ self.verification = [(self.config_name, self.config_defaults_verify)]
+ default = os.path.join(os.getcwd(), 'volk_modtool.cfg')
+ icfg = ConfigParser.RawConfigParser()
+ if cfg:
+ icfg.read(cfg)
+ elif os.path.exists(default):
+ icfg.read(default)
+ else:
+ print "Initializing config file..."
+ icfg.add_section(self.config_name)
+ for kn in self.config_defaults:
+ rv = raw_input("%s: "%(kn))
+ icfg.set(self.config_name, kn, rv)
+ self.cfg = icfg
+ self.remap()
+ self.verify()
+
+
+
+ def read_map(self, name, inp):
+ if self.cfg.has_section(name):
+ self.cfg.remove_section(name)
+ self.cfg.add_section(name)
+ for i in inp:
+ self.cfg.set(name, i, inp[i])
+
+ def get_map(self, name):
+ retval = {}
+ stuff = self.cfg.items(name)
+ for i in stuff:
+ retval[i[0]] = i[1]
+ return retval
+
--- /dev/null
+#!/usr/bin/env python
+#
+# Copyright 2013, 2014 Free Software Foundation, Inc.
+#
+# This file is part of GNU Radio
+#
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING. If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+#
+
+from volk_modtool import volk_modtool, volk_modtool_config
+from optparse import OptionParser, OptionGroup
+
+import exceptions
+import os
+import sys
+
+if __name__ == '__main__':
+ parser = OptionParser();
+ actions = OptionGroup(parser, 'Actions');
+ actions.add_option('-i', '--install', action='store_true',
+ help='Create a new volk module.')
+ parser.add_option('-b', '--base_path', action='store', default=None,
+ help='Base path for action. By default, volk_modtool.cfg loads this value.')
+ parser.add_option('-n', '--kernel_name', action='store', default=None,
+ help='Kernel name for action. No default')
+ parser.add_option('-c', '--config', action='store', dest='config_file', default=None,
+ help='Config file for volk_modtool. By default, volk_modtool.cfg in the local directory will be used/created.')
+ actions.add_option('-a', '--add_kernel', action='store_true',
+ help='Add kernel from existing volk module. Requires: -n. Optional: -b')
+ actions.add_option('-A', '--add_all_kernels', action='store_true',
+ help='Add all kernels from existing volk module. Optional: -b')
+ actions.add_option('-x', '--remove_kernel', action='store_true',
+ help='Remove kernel from module. Required: -n. Optional: -b')
+ actions.add_option('-l', '--list', action='store_true',
+ help='List all kernels in the base.')
+ actions.add_option('-k', '--kernels', action='store_true',
+ help='List all kernels in the module.')
+ actions.add_option('-r', '--remote_list', action='store_true',
+ help='List all available kernels in remote volk module. Requires: -b.')
+ actions.add_option('-m', '--moo', action='store_true',
+ help='Have you mooed today?')
+ parser.add_option_group(actions)
+
+ (options, args) = parser.parse_args();
+ if len(sys.argv) < 2:
+ parser.print_help()
+
+ elif options.moo:
+ print " (__) "
+ print " (oo) "
+ print " /------\/ "
+ print " / | || "
+ print " * /\---/\ "
+ print " ~~ ~~ "
+
+ else:
+ my_cfg = volk_modtool_config(options.config_file);
+
+ my_modtool = volk_modtool(my_cfg.get_map(my_cfg.config_name));
+
+
+ if options.install:
+ my_modtool.make_module_skeleton();
+ my_modtool.write_default_cfg(my_cfg.cfg);
+
+
+ if options.add_kernel:
+ if not options.kernel_name:
+ raise exceptions.IOError("This action requires the -n option.");
+ else:
+ name = options.kernel_name;
+ if options.base_path:
+ base = options.base_path;
+ else:
+ base = my_cfg.cfg.get(my_cfg.config_name, 'base');
+ my_modtool.import_kernel(name, base);
+
+ if options.remove_kernel:
+ if not options.kernel_name:
+ raise exceptions.IOError("This action requires the -n option.");
+ else:
+ name = options.kernel_name;
+ my_modtool.remove_kernel(name);
+
+ if options.add_all_kernels:
+
+ if options.base_path:
+ base = options.base_path;
+ else:
+ base = my_cfg.cfg.get(my_cfg.config_name, 'base');
+ kernelset = my_modtool.get_current_kernels(base);
+ for i in kernelset:
+ my_modtool.import_kernel(i, base);
+
+ if options.remote_list:
+ if not options.base_path:
+ raise exceptions.IOError("This action requires the -b option. Try -l or -k for listing kernels in the base or the module.")
+ else:
+ base = options.base_path;
+ kernelset = my_modtool.get_current_kernels(base);
+ for i in kernelset:
+ print i;
+
+ if options.list:
+ kernelset = my_modtool.get_current_kernels();
+ for i in kernelset:
+ print i;
+
+ if options.kernels:
+ dest = my_cfg.cfg.get(my_cfg.config_name, 'destination');
+ name = my_cfg.cfg.get(my_cfg.config_name, 'name');
+ base = os.path.join(dest, 'volk_' + name);
+ kernelset = my_modtool.get_current_kernels(base);
+ for i in kernelset:
+ print i;
--- /dev/null
+#
+# Copyright 2013, 2014 Free Software Foundation, Inc.
+#
+# This file is part of GNU Radio
+#
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING. If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+#
+
+import os
+import re
+import glob
+import exceptions
+from sets import Set
+
+
+class volk_modtool:
+ def __init__(self, cfg):
+ self.volk = re.compile('volk')
+ self.remove_after_underscore = re.compile("_.*")
+ self.volk_included = re.compile('INCLUDED_VOLK')
+ self.volk_run_tests = re.compile('^\s*VOLK_RUN_TESTS.*\n', re.MULTILINE)
+ self.volk_profile = re.compile('^\s*(VOLK_PROFILE|VOLK_PUPPET_PROFILE).*\n', re.MULTILINE)
+ self.volk_kernel_tests = re.compile('^\s*\((VOLK_INIT_TEST|VOLK_INIT_PUPP).*\n', re.MULTILINE)
+ self.volk_null_kernel = re.compile('^\s*;\n', re.MULTILINE)
+ self.my_dict = cfg
+ self.lastline = re.compile('\s*char path\[1024\];.*')
+ self.badassert = re.compile('^\s*assert\(toked\[0\] == "volk_.*\n', re.MULTILINE)
+ self.goodassert = ' assert(toked[0] == "volk");\n'
+ self.baderase = re.compile('^\s*toked.erase\(toked.begin\(\)\);.*\n', re.MULTILINE)
+ self.gooderase = ' toked.erase(toked.begin());\n toked.erase(toked.begin());\n'
+
+ def get_basename(self, base=None):
+ if not base:
+ base = self.my_dict['base']
+ candidate = base.split('/')[-1]
+ if len(candidate.split('_')) == 1:
+ return ''
+ else:
+ return candidate.split('_')[-1]
+
+ def get_current_kernels(self, base=None):
+ if not base:
+ base = self.my_dict['base']
+ name = self.get_basename()
+ else:
+ name = self.get_basename(base)
+ if name == '':
+ hdr_files = glob.glob(os.path.join(base, "kernels/volk/*.h"))
+ begins = re.compile("(?<=volk_).*")
+ else:
+ hdr_files = glob.glob(os.path.join(base, "kernels/volk_" + name + "/*.h"))
+ begins = re.compile("(?<=volk_" + name + "_).*")
+
+ datatypes = []
+ functions = []
+
+ for line in hdr_files:
+
+ subline = re.search(".*\.h.*", os.path.basename(line))
+ if subline:
+ subsubline = begins.search(subline.group(0))
+ if subsubline:
+ dtype = self.remove_after_underscore.sub("", subsubline.group(0))
+ subdtype = re.search("[0-9]+[A-z]+", dtype)
+ if subdtype:
+ datatypes.append(subdtype.group(0))
+
+ datatypes = set(datatypes)
+
+ for line in hdr_files:
+ for dt in datatypes:
+ if dt in line:
+ subline = re.search(begins.pattern[:-2] + dt + ".*(?=\.h)", line)
+ if subline:
+ functions.append(subline.group(0))
+
+ return set(functions)
+
+ def make_module_skeleton(self):
+ dest = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'])
+ if os.path.exists(dest):
+ raise exceptions.IOError("Destination %s already exits!" % dest)
+
+ if not os.path.exists(os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'kernels/volk_' + self.my_dict['name'])):
+ os.makedirs(os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'kernels/volk_' + self.my_dict['name']))
+
+ current_kernel_names = self.get_current_kernels()
+ need_ifdef_updates = ["constant.h", "volk_complex.h", "volk_malloc.h", "volk_prefs.h",
+ "volk_common.h", "volk_cpu.tmpl.h", "volk_config_fixed.tmpl.h",
+ "volk_typedefs.h", "volk.tmpl.h"]
+ for root, dirnames, filenames in os.walk(self.my_dict['base']):
+ for name in filenames:
+ t_table = map(lambda a: re.search(a, name), current_kernel_names)
+ t_table = set(t_table)
+ if (t_table == set([None])) or (name == "volk_32f_null_32f.h"):
+ infile = os.path.join(root, name)
+ instring = open(infile, 'r').read()
+ outstring = re.sub(self.volk, 'volk_' + self.my_dict['name'], instring)
+ # Update the header ifdef guards only where needed
+ if name in need_ifdef_updates:
+ outstring = re.sub(self.volk_included, 'INCLUDED_VOLK_' + self.my_dict['name'].upper(), outstring)
+ newname = re.sub(self.volk, 'volk_' + self.my_dict['name'], name)
+ if name == 'VolkConfig.cmake.in':
+ outstring = re.sub("VOLK", 'VOLK_' + self.my_dict['name'].upper(), outstring)
+ newname = "Volk%sConfig.cmake.in" % self.my_dict['name']
+
+ relpath = os.path.relpath(infile, self.my_dict['base'])
+ newrelpath = re.sub(self.volk, 'volk_' + self.my_dict['name'], relpath)
+ dest = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], os.path.dirname(newrelpath), newname)
+
+ if not os.path.exists(os.path.dirname(dest)):
+ os.makedirs(os.path.dirname(dest))
+ open(dest, 'w+').write(outstring)
+
+ infile = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'lib/kernel_tests.h')
+ instring = open(infile, 'r').read()
+ outstring = re.sub(self.volk_kernel_tests, '', instring)
+ outstring = re.sub(self.volk_null_kernel,' (VOLK_INIT_TEST(volk_' + self.my_dict['name'] + '_32f_null_32f, test_params))\n ;', outstring)
+ open(infile, 'w+').write(outstring)
+
+ infile = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'lib/qa_utils.cc')
+ instring = open(infile, 'r').read()
+ outstring = re.sub(self.badassert, self.goodassert, instring)
+ outstring = re.sub(self.baderase, self.gooderase, outstring)
+ open(infile, 'w+').write(outstring)
+
+ def write_default_cfg(self, cfg):
+ outfile = open(os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'volk_modtool.cfg'), 'wb')
+ cfg.write(outfile)
+ outfile.close()
+
+ def convert_kernel(self, oldvolk, name, base, inpath, top):
+ infile = os.path.join(inpath, 'kernels/' + top[:-1] + '/' + top + name + '.h')
+ instring = open(infile, 'r').read()
+ outstring = re.sub(oldvolk, 'volk_' + self.my_dict['name'], instring)
+ newname = 'volk_' + self.my_dict['name'] + '_' + name + '.h'
+ relpath = os.path.relpath(infile, base)
+ newrelpath = re.sub(oldvolk, 'volk_' + self.my_dict['name'], relpath)
+ dest = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], os.path.dirname(newrelpath), newname)
+
+ if not os.path.exists(os.path.dirname(dest)):
+ os.makedirs(os.path.dirname(dest))
+ open(dest, 'w+').write(outstring)
+
+ # copy orc proto-kernels if they exist
+ for orcfile in glob.glob(inpath + '/kernels/volk/asm/orc/' + top + name + '*.orc'):
+ if os.path.isfile(orcfile):
+ instring = open(orcfile, 'r').read()
+ outstring = re.sub(oldvolk, 'volk_' + self.my_dict['name'], instring)
+ newname = 'volk_' + self.my_dict['name'] + '_' + name + '.orc'
+ relpath = os.path.relpath(orcfile, base)
+ newrelpath = re.sub(oldvolk, 'volk_' + self.my_dict['name'], relpath)
+ dest = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], os.path.dirname(newrelpath), newname)
+ if not os.path.exists(os.path.dirname(dest)):
+ os.makedirs(os.path.dirname(dest))
+ open(dest, 'w+').write(outstring)
+
+ def remove_kernel(self, name):
+ basename = self.my_dict['name']
+ if len(basename) > 0:
+ top = 'volk_' + basename + '_'
+ else:
+ top = 'volk_'
+ base = os.path.join(self.my_dict['destination'], top[:-1])
+
+ if not name in self.get_current_kernels():
+ raise exceptions.IOError("Requested kernel %s is not in module %s" % (name, base))
+
+ inpath = os.path.abspath(base)
+ kernel = re.compile(name)
+ search_kernels = Set([kernel])
+ profile = re.compile('^\s*VOLK_PROFILE')
+ puppet = re.compile('^\s*VOLK_PUPPET')
+ src_dest = os.path.join(inpath, 'apps/', top[:-1] + '_profile.cc')
+ infile = open(src_dest)
+ otherlines = infile.readlines()
+ open(src_dest, 'w+').write('')
+
+ for otherline in otherlines:
+ write_okay = True
+ if kernel.search(otherline):
+ write_okay = False
+ if puppet.match(otherline):
+ args = re.search("(?<=VOLK_PUPPET_PROFILE).*", otherline)
+ m_func = args.group(0).split(',')[0]
+ func = re.search('(?<=' + top + ').*', m_func)
+ search_kernels.add(re.compile(func.group(0)))
+ if write_okay:
+ open(src_dest, 'a').write(otherline)
+
+ src_dest = os.path.join(inpath, 'lib/testqa.cc')
+ infile = open(src_dest)
+ otherlines = infile.readlines()
+ open(src_dest, 'w+').write('')
+
+ for otherline in otherlines:
+ write_okay = True
+
+ for kernel in search_kernels:
+ if kernel.search(otherline):
+ write_okay = False
+
+ if write_okay:
+ open(src_dest, 'a').write(otherline)
+
+ for kernel in search_kernels:
+ infile = os.path.join(inpath, 'kernels/' + top[:-1] + '/' + top + kernel.pattern + '.h')
+ print "Removing kernel %s" % kernel.pattern
+ if os.path.exists(infile):
+ os.remove(infile)
+ # remove the orc proto-kernels if they exist. There are no puppets here
+ # so just need to glob for files matching kernel name
+ for orcfile in glob.glob(inpath + '/kernels/' + top[:-1] + '/asm/orc/' + top + name + '*.orc'):
+ print orcfile
+ if os.path.exists(orcfile):
+ os.remove(orcfile)
+
+ def import_kernel(self, name, base):
+ if not base:
+ base = self.my_dict['base']
+ basename = self.get_basename()
+ else:
+ basename = self.get_basename(base)
+ if not name in self.get_current_kernels(base):
+ raise exceptions.IOError("Requested kernel %s is not in module %s" % (name, base))
+
+ inpath = os.path.abspath(base)
+ if len(basename) > 0:
+ top = 'volk_' + basename + '_'
+ else:
+ top = 'volk_'
+ oldvolk = re.compile(top[:-1])
+
+ self.convert_kernel(oldvolk, name, base, inpath, top)
+
+ kernel = re.compile(name)
+ search_kernels = Set([kernel])
+
+ profile = re.compile('^\s*VOLK_PROFILE')
+ puppet = re.compile('^\s*VOLK_PUPPET')
+ infile = open(os.path.join(inpath, 'apps/', oldvolk.pattern + '_profile.cc'))
+ otherinfile = open(os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'apps/volk_' + self.my_dict['name'] + '_profile.cc'))
+ dest = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'apps/volk_' + self.my_dict['name'] + '_profile.cc')
+ lines = infile.readlines()
+ otherlines = otherinfile.readlines()
+ open(dest, 'w+').write('')
+ insert = False
+ inserted = False
+ for otherline in otherlines:
+
+ if self.lastline.match(otherline):
+ insert = True
+ if insert and not inserted:
+ inserted = True
+ for line in lines:
+ if kernel.search(line):
+ if profile.match(line):
+ outline = re.sub(oldvolk, 'volk_' + self.my_dict['name'], line)
+ open(dest, 'a').write(outline)
+ elif puppet.match(line):
+ outline = re.sub(oldvolk, 'volk_' + self.my_dict['name'], line)
+ open(dest, 'a').write(outline)
+ args = re.search("(?<=VOLK_PUPPET_PROFILE).*", line)
+ m_func = args.group(0).split(',')[0]
+ func = re.search('(?<=' + top + ').*', m_func)
+ search_kernels.add(re.compile(func.group(0)))
+ self.convert_kernel(oldvolk, func.group(0), base, inpath, top)
+ write_okay = True
+ for kernel in search_kernels:
+ if kernel.search(otherline):
+ write_okay = False
+ if write_okay:
+ open(dest, 'a').write(otherline)
+
+ for kernel in search_kernels:
+ print "Adding kernel %s from module %s" % (kernel.pattern, base)
+
+ infile = open(os.path.join(inpath, 'lib/testqa.cc'))
+ otherinfile = open(os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'lib/testqa.cc'))
+ dest = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'lib/testqa.cc')
+ lines = infile.readlines()
+ otherlines = otherinfile.readlines()
+ open(dest, 'w+').write('')
+ inserted = False
+ insert = False
+ for otherline in otherlines:
+ if re.match('\s*', otherline) is None or re.match('\s*#.*', otherline) is None:
+ insert = True
+ if insert and not inserted:
+ inserted = True
+ for line in lines:
+ for kernel in search_kernels:
+ if kernel.search(line):
+ if self.volk_run_tests.match(line):
+ outline = re.sub(oldvolk, 'volk_' + self.my_dict['name'], line)
+ open(dest, 'a').write(outline)
+ write_okay = True
+ for kernel in search_kernels:
+ if kernel.search(otherline):
+ write_okay = False
+ if write_okay:
+ open(dest, 'a').write(otherline)
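+
+ # Usage sketch (illustrative only; `mod` stands for a hypothetical, already
+ # configured instance of this class whose my_dict carries the 'name',
+ # 'destination' and 'base' entries used above):
+ #
+ #   mod.import_kernel('32f_x2_add_32f', '/path/to/source/volk')
+ #   mod.remove_kernel('32f_x2_add_32f')
+ #
+ # import_kernel() copies the kernel (and any puppet kernels the profiler
+ # references) into the destination module and rewrites its prefix, while
+ # remove_kernel() strips it from testqa.cc, the profile app and the kernels
+ # directory again.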
--- /dev/null
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#include <volk/volk_common.h>
+#include "volk_machines.h"
+#include <volk/volk_typedefs.h>
+#include <volk/volk_cpu.h>
+#include "volk_rank_archs.h"
+#include <volk/volk.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+static size_t __alignment = 0;
+static intptr_t __alignment_mask = 0;
+
+struct volk_machine *get_machine(void)
+{
+ extern struct volk_machine *volk_machines[];
+ extern unsigned int n_volk_machines;
+ static struct volk_machine *machine = NULL;
+
+ if(machine != NULL)
+ return machine;
+ else {
+ unsigned int max_score = 0;
+ unsigned int i;
+ struct volk_machine *max_machine = NULL;
+ for(i=0; i<n_volk_machines; i++) {
+ if(!(volk_machines[i]->caps & (~volk_get_lvarch()))) {
+ if(volk_machines[i]->caps > max_score) {
+ max_score = volk_machines[i]->caps;
+ max_machine = volk_machines[i];
+ }
+ }
+ }
+ machine = max_machine;
+ //printf("Using Volk machine: %s\n", machine->name);
+ __alignment = machine->alignment;
+ __alignment_mask = (intptr_t)(__alignment-1);
+ return machine;
+ }
+}
+
+void volk_list_machines(void)
+{
+ extern struct volk_machine *volk_machines[];
+ extern unsigned int n_volk_machines;
+
+ unsigned int i;
+ for(i=0; i<n_volk_machines; i++) {
+ if(!(volk_machines[i]->caps & (~volk_get_lvarch()))) {
+ printf("%s;", volk_machines[i]->name);
+ }
+ }
+ printf("\n");
+}
+
+const char* volk_get_machine(void)
+{
+ extern struct volk_machine *volk_machines[];
+ extern unsigned int n_volk_machines;
+ static struct volk_machine *machine = NULL;
+
+ if(machine != NULL)
+ return machine->name;
+ else {
+ unsigned int max_score = 0;
+ unsigned int i;
+ struct volk_machine *max_machine = NULL;
+ for(i=0; i<n_volk_machines; i++) {
+ if(!(volk_machines[i]->caps & (~volk_get_lvarch()))) {
+ if(volk_machines[i]->caps > max_score) {
+ max_score = volk_machines[i]->caps;
+ max_machine = volk_machines[i];
+ }
+ }
+ }
+ machine = max_machine;
+ return machine->name;
+ }
+}
+
+size_t volk_get_alignment(void)
+{
+ get_machine(); //ensures alignment is set
+ return __alignment;
+}
+
+bool volk_is_aligned(const void *ptr)
+{
+ return ((intptr_t)(ptr) & __alignment_mask) == 0;
+}
+
+#define LV_HAVE_GENERIC
+#define LV_HAVE_DISPATCHER
+
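+/*
+ * Overview of the generated code below (descriptive comment only): for every
+ * kernel the template loop emits
+ *   __<kernel>_d       - checks pointer alignment (or defers to a hand written
+ *                        dispatcher) and forwards to the aligned/unaligned entry,
+ *   __init_<kernel>    - ranks the compiled implementations once via
+ *                        volk_rank_archs() and rebinds <kernel>, <kernel>_a and
+ *                        <kernel>_u to the winning function pointers,
+ *   __<kernel>{,_a,_u} - first-call trampolines that run that init lazily.
+ */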
+#for $kern in $kernels
+
+#if $kern.has_dispatcher
+#include <volk/$(kern.name).h> //pulls in the dispatcher
+#end if
+
+static inline void __$(kern.name)_d($kern.arglist_full)
+{
+ #if $kern.has_dispatcher
+ $(kern.name)_dispatcher($kern.arglist_names);
+ return;
+ #end if
+
+ if (volk_is_aligned(
+ #set $num_open_parens = 0
+ #for $arg_type, $arg_name in $kern.args
+ #if '*' in $arg_type
+ VOLK_OR_PTR($arg_name,
+ #set $num_open_parens += 1
+ #end if
+ #end for
+ 0$(')'*$num_open_parens)
+ )){
+ $(kern.name)_a($kern.arglist_names);
+ }
+ else{
+ $(kern.name)_u($kern.arglist_names);
+ }
+}
+
+static inline void __init_$(kern.name)(void)
+{
+ const char *name = get_machine()->$(kern.name)_name;
+ const char **impl_names = get_machine()->$(kern.name)_impl_names;
+ const int *impl_deps = get_machine()->$(kern.name)_impl_deps;
+ const bool *alignment = get_machine()->$(kern.name)_impl_alignment;
+ const size_t n_impls = get_machine()->$(kern.name)_n_impls;
+ const size_t index_a = volk_rank_archs(name, impl_names, impl_deps, alignment, n_impls, true/*aligned*/);
+ const size_t index_u = volk_rank_archs(name, impl_names, impl_deps, alignment, n_impls, false/*unaligned*/);
+ $(kern.name)_a = get_machine()->$(kern.name)_impls[index_a];
+ $(kern.name)_u = get_machine()->$(kern.name)_impls[index_u];
+
+ assert($(kern.name)_a);
+ assert($(kern.name)_u);
+
+ $(kern.name) = &__$(kern.name)_d;
+}
+
+static inline void __$(kern.name)_a($kern.arglist_full)
+{
+ __init_$(kern.name)();
+ $(kern.name)_a($kern.arglist_names);
+}
+
+static inline void __$(kern.name)_u($kern.arglist_full)
+{
+ __init_$(kern.name)();
+ $(kern.name)_u($kern.arglist_names);
+}
+
+static inline void __$(kern.name)($kern.arglist_full)
+{
+ __init_$(kern.name)();
+ $(kern.name)($kern.arglist_names);
+}
+
+$kern.pname $(kern.name)_a = &__$(kern.name)_a;
+$kern.pname $(kern.name)_u = &__$(kern.name)_u;
+$kern.pname $(kern.name) = &__$(kern.name);
+
+void $(kern.name)_manual($kern.arglist_full, const char* impl_name)
+{
+ const int index = volk_get_index(
+ get_machine()->$(kern.name)_impl_names,
+ get_machine()->$(kern.name)_n_impls,
+ impl_name
+ );
+ get_machine()->$(kern.name)_impls[index](
+ $kern.arglist_names
+ );
+}
+
+volk_func_desc_t $(kern.name)_get_func_desc(void) {
+ const char **impl_names = get_machine()->$(kern.name)_impl_names;
+ const int *impl_deps = get_machine()->$(kern.name)_impl_deps;
+ const bool *alignment = get_machine()->$(kern.name)_impl_alignment;
+ const size_t n_impls = get_machine()->$(kern.name)_n_impls;
+ volk_func_desc_t desc = {
+ impl_names,
+ impl_deps,
+ alignment,
+ n_impls
+ };
+ return desc;
+}
+
+#end for
--- /dev/null
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef INCLUDED_VOLK_RUNTIME
+#define INCLUDED_VOLK_RUNTIME
+
+#include <volk/volk_typedefs.h>
+#include <volk/volk_config_fixed.h>
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
+#include <volk/volk_malloc.h>
+
+#include <stdlib.h>
+#include <stdbool.h>
+
+__VOLK_DECL_BEGIN
+
+typedef struct volk_func_desc
+{
+ const char **impl_names;
+ const int *impl_deps;
+ const bool *impl_alignment;
+ const size_t n_impls;
+} volk_func_desc_t;
+
+//! Prints a list of machines available
+VOLK_API void volk_list_machines(void);
+
+//! Returns the name of the machine this instance will use
+VOLK_API const char* volk_get_machine(void);
+
+//! Get the machine alignment in bytes
+VOLK_API size_t volk_get_alignment(void);
+
+/*!
+ * The VOLK_OR_PTR macro is a convenience macro
+ * for checking the alignment of a set of pointers:
+ * OR-ing the addresses preserves every misaligned low bit,
+ * so the combined value is aligned only if all pointers are.
+ * Example usage:
+ * volk_is_aligned(VOLK_OR_PTR(VOLK_OR_PTR(p0, p1), p2))
+ */
+#define VOLK_OR_PTR(ptr0, ptr1) \
+ (const void *)(((intptr_t)(ptr0)) | ((intptr_t)(ptr1)))
+
+/*!
+ * Is the pointer on a machine alignment boundary?
+ *
+ * Note: for performance reasons this function only reads a cached
+ * alignment value, so it is not usable until some other VOLK API call
+ * (for example volk_get_alignment()) has performed the initialization.
+ *
+ * \param ptr the pointer to some memory buffer
+ * \return true if ptr sits on an alignment boundary, false otherwise
+ */
+VOLK_API bool volk_is_aligned(const void *ptr);
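+
+/*
+ * Minimal usage sketch (illustrative; buffer names and sizes are placeholders):
+ *
+ *   size_t n = 1024;
+ *   size_t al = volk_get_alignment();              // also performs the lazy init
+ *   float *in  = (float*)volk_malloc(n * sizeof(float), al);
+ *   float *out = (float*)volk_malloc(n * sizeof(float), al);
+ *   if (volk_is_aligned(VOLK_OR_PTR(in, out))) {
+ *       // both buffers are aligned, so the _a entry points are safe to call
+ *   }
+ *   volk_free(in); volk_free(out);
+ */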
+
+#for $kern in $kernels
+
+//! A function pointer to the dispatcher implementation
+extern VOLK_API $kern.pname $kern.name;
+
+//! A function pointer to the fastest aligned implementation
+extern VOLK_API $kern.pname $(kern.name)_a;
+
+//! A function pointer to the fastest unaligned implementation
+extern VOLK_API $kern.pname $(kern.name)_u;
+
+//! Call into a specific implementation given by name
+extern VOLK_API void $(kern.name)_manual($kern.arglist_full, const char* impl_name);
+
+//! Get description parameters for this kernel
+extern VOLK_API volk_func_desc_t $(kern.name)_get_func_desc(void);
+#end for
+
+__VOLK_DECL_END
+
+#endif /*INCLUDED_VOLK_RUNTIME*/
--- /dev/null
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef INCLUDED_VOLK_CONFIG_FIXED_H
+#define INCLUDED_VOLK_CONFIG_FIXED_H
+
+#for $i, $arch in enumerate($archs)
+#define LV_$(arch.name.upper()) $i
+#end for
+
+#endif /*INCLUDED_VOLK_CONFIG_FIXED*/
--- /dev/null
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#include <volk/volk_cpu.h>
+#include <volk/volk_config_fixed.h>
+#include <stdlib.h>
+#include <string.h>
+
+struct VOLK_CPU volk_cpu;
+
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+ #define VOLK_CPU_x86
+#endif
+
+#if defined(VOLK_CPU_x86)
+
+//implement get cpuid for gcc compilers using a system or local copy of cpuid.h
+#if defined(__GNUC__)
+ #include <cpuid.h>
+ #define cpuid_x86(op, r) __get_cpuid(op, (unsigned int *)r+0, (unsigned int *)r+1, (unsigned int *)r+2, (unsigned int *)r+3)
+ #define cpuid_x86_count(op, count, regs) __cpuid_count(op, count, *((unsigned int*)regs), *((unsigned int*)regs+1), *((unsigned int*)regs+2), *((unsigned int*)regs+3))
+
+ /* Return the Intel AVX extended CPU capabilities register (XCR0).
+ * The xgetbv instruction faults on CPUs/OSes without OSXSAVE support,
+ * so check the CPUID OSXSAVE/AVX bits before executing this.
+ */
+ #if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3)) && defined(HAVE_XGETBV)
+ static inline unsigned long long _xgetbv(unsigned int index){
+ unsigned int eax, edx;
+ __asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index));
+ return ((unsigned long long)edx << 32) | eax;
+ }
+ #define __xgetbv() _xgetbv(0)
+ #else
+ #define __xgetbv() 0
+ #endif
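+
+ /* Illustrative guard (not part of this file's logic): only execute __xgetbv()
+ * after CPUID has reported OSXSAVE support, e.g.
+ * if (cpuid_x86_bit(2, 1, 27) && ((__xgetbv() & 0x6) == 0x6)) { AVX is usable }
+ * where ECX bit 27 of leaf 1 is the OSXSAVE flag (see cpuid_x86_bit below).
+ */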
+
+//implement get cpuid for MSVC compilers using __cpuid intrinsic
+#elif defined(_MSC_VER) && defined(HAVE_INTRIN_H)
+ #include <intrin.h>
+ #define cpuid_x86(op, r) __cpuid(((int*)r), op)
+ #define cpuid_x86_count(op, count, regs) __cpuidex((int*)regs, op, count)
+
+ #if defined(_XCR_XFEATURE_ENABLED_MASK)
+ #define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK)
+ #else
+ #define __xgetbv() 0
+ #endif
+
+#else
+ #error "A get cpuid for volk is not available on this compiler..."
+#endif //defined(__GNUC__)
+
+#endif //defined(VOLK_CPU_x86)
+
+static inline unsigned int cpuid_count_x86_bit(unsigned int level, unsigned int count, unsigned int reg, unsigned int bit) {
+#if defined(VOLK_CPU_x86)
+ unsigned int regs[4] = {0};
+ cpuid_x86_count(level, count, regs);
+ return regs[reg] >> bit & 0x01;
+#else
+ return 0;
+#endif
+}
+
+static inline unsigned int cpuid_x86_bit(unsigned int reg, unsigned int op, unsigned int bit) {
+#if defined(VOLK_CPU_x86)
+ unsigned int regs[4];
+ memset(regs, 0, sizeof(unsigned int)*4);
+ cpuid_x86(op, regs);
+ return regs[reg] >> bit & 0x01;
+#else
+ return 0;
+#endif
+}
+
+static inline unsigned int check_extended_cpuid(unsigned int val) {
+#if defined(VOLK_CPU_x86)
+ unsigned int regs[4];
+ memset(regs, 0, sizeof(unsigned int)*4);
+ cpuid_x86(0x80000000, regs);
+ return regs[0] >= val;
+#else
+ return 0;
+#endif
+}
+
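+/* The 0x6 mask used by the two helpers below selects XCR0 bits 1 (XMM state)
+ * and 2 (YMM state); the OS must enable both before AVX/AVX2 kernels can run.
+ * AVX2 adds no new XSAVE state, which is why both helpers read the same bits. */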
+static inline unsigned int get_avx_enabled(void) {
+#if defined(VOLK_CPU_x86)
+ return __xgetbv() & 0x6;
+#else
+ return 0;
+#endif
+}
+
+static inline unsigned int get_avx2_enabled(void) {
+#if defined(VOLK_CPU_x86)
+ return __xgetbv() & 0x6;
+#else
+ return 0;
+#endif
+}
+
+//neon detection is linux specific
+#if defined(__arm__) && defined(__linux__)
+ #include <asm/hwcap.h>
+ #include <linux/auxvec.h>
+ #include <stdio.h>
+ #define VOLK_CPU_ARM
+#endif
+
+static int has_neon(void){
+#if defined(VOLK_CPU_ARM)
+ FILE *auxvec_f;
+ unsigned long auxvec[2];
+ unsigned int found_neon = 0;
+ auxvec_f = fopen("/proc/self/auxv", "rb");
+ if(!auxvec_f) return 0;
+
+ size_t r = 1;
+ //each auxv entry is a pair of unsigned longs: a tag (e.g. AT_HWCAP)
+ //followed by its value, so read the file two words at a time
+ while(!found_neon && r) {
+ r = fread(auxvec, sizeof(unsigned long), 2, auxvec_f);
+ if((auxvec[0] == AT_HWCAP) && (auxvec[1] & HWCAP_NEON))
+ found_neon = 1;
+ }
+
+ fclose(auxvec_f);
+ return found_neon;
+#else
+ return 0;
+#endif
+}
+
+#for $arch in $archs
+static int i_can_has_$arch.name (void) {
+ #for $check, $params in $arch.checks
+ if ($(check)($(', '.join($params))) == 0) return 0;
+ #end for
+ return 1;
+}
+
+#end for
+
+#if defined(HAVE_FENV_H)
+ #if defined(FE_TONEAREST)
+ #include <fenv.h>
+ static inline void set_float_rounding(void){
+ fesetround(FE_TONEAREST);
+ }
+ #else
+ static inline void set_float_rounding(void){
+ //do nothing
+ }
+ #endif
+#elif defined(_MSC_VER)
+ #include <float.h>
+ static inline void set_float_rounding(void){
+ unsigned int cwrd;
+ _controlfp_s(&cwrd, 0, 0);
+ _controlfp_s(&cwrd, _RC_NEAR, _MCW_RC);
+ }
+#else
+ static inline void set_float_rounding(void){
+ //do nothing
+ }
+#endif
+
+void volk_cpu_init() {
+ #for $arch in $archs
+ volk_cpu.has_$arch.name = &i_can_has_$arch.name;
+ #end for
+ set_float_rounding();
+}
+
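+// volk_get_lvarch() packs the result of every has_<arch>() probe into a bit
+// mask, one bit per LV_<ARCH> index from volk_config_fixed.h. For example
+// (indices here are hypothetical), with generic at bit 0 and sse at bit 3 both
+// available the mask is (1<<0)|(1<<3) == 9; get_machine() later compares each
+// volk_machine's caps word against this mask to pick the best machine.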
+unsigned int volk_get_lvarch() {
+ unsigned int retval = 0;
+ volk_cpu_init();
+ #for $arch in $archs
+ retval += volk_cpu.has_$(arch.name)() << LV_$(arch.name.upper());
+ #end for
+ return retval;
+}
--- /dev/null
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef INCLUDED_VOLK_CPU_H
+#define INCLUDED_VOLK_CPU_H
+
+#include <volk/volk_common.h>
+
+__VOLK_DECL_BEGIN
+
+struct VOLK_CPU {
+ #for $arch in $archs
+ int (*has_$arch.name) ();
+ #end for
+};
+
+extern struct VOLK_CPU volk_cpu;
+
+void volk_cpu_init ();
+unsigned int volk_get_lvarch ();
+
+__VOLK_DECL_END
+
+#endif /*INCLUDED_VOLK_CPU_H*/
--- /dev/null
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#set $this_machine = $machine_dict[$args[0]]
+#set $arch_names = $this_machine.arch_names
+
+#for $arch in $this_machine.archs
+#define LV_HAVE_$(arch.name.upper()) 1
+#end for
+
+#include <volk/volk_common.h>
+#include "volk_machines.h"
+#include <volk/volk_config_fixed.h>
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#for $kern in $kernels
+#include <volk/$(kern.name).h>
+#end for
+
+########################################################################
+#def make_arch_have_list($archs)
+$(' | '.join(['(1 << LV_%s)'%a.name.upper() for a in $archs]))#slurp
+#end def
+
+########################################################################
+#def make_impl_name_list($impls)
+{$(', '.join(['"%s"'%i.name for i in $impls]))}#slurp
+#end def
+
+########################################################################
+#def make_impl_align_list($impls)
+{$(', '.join(['true' if i.is_aligned else 'false' for i in $impls]))}#slurp
+#end def
+
+########################################################################
+#def make_impl_deps_list($impls)
+{$(', '.join([' | '.join(['(1 << LV_%s)'%d.upper() for d in i.deps]) for i in $impls]))}#slurp
+#end def
+
+########################################################################
+#def make_impl_fcn_list($name, $impls)
+{$(', '.join(['%s_%s'%($name, i.name) for i in $impls]))}#slurp
+#end def
+
+struct volk_machine volk_machine_$(this_machine.name) = {
+ $make_arch_have_list($this_machine.archs),
+ "$this_machine.name",
+ $this_machine.alignment,
+ #for $kern in $kernels
+ #set $impls = $kern.get_impls($arch_names)
+ "$kern.name", ##//kernel name
+ $make_impl_name_list($impls), ##//list of kernel implementations by name
+ $make_impl_deps_list($impls), ##//list of arch dependencies per implementation
+ $make_impl_align_list($impls), ##//alignment required? for each implementation
+ $make_impl_fcn_list($kern.name, $impls), ##//pointer to each implementation
+ $(len($impls)), ##//number of implementations listed here
+ #end for
+};
--- /dev/null
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#include <volk/volk_common.h>
+#include <volk/volk_typedefs.h>
+#include "volk_machines.h"
+
+struct volk_machine *volk_machines[] = {
+#for $machine in $machines
+#ifdef LV_MACHINE_$(machine.name.upper())
+&volk_machine_$(machine.name),
+#endif
+#end for
+};
+
+unsigned int n_volk_machines = sizeof(volk_machines)/sizeof(*volk_machines);
--- /dev/null
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef INCLUDED_LIBVOLK_MACHINES_H
+#define INCLUDED_LIBVOLK_MACHINES_H
+
+#include <volk/volk_common.h>
+#include <volk/volk_typedefs.h>
+
+#include <stdbool.h>
+#include <stdlib.h>
+
+__VOLK_DECL_BEGIN
+
+struct volk_machine {
+ const unsigned int caps; //capabilities (i.e., archs compiled into this machine, in the volk_get_lvarch format)
+ const char *name;
+ const size_t alignment; //the maximum byte alignment required for functions in this library
+ #for $kern in $kernels
+ const char *$(kern.name)_name;
+ const char *$(kern.name)_impl_names[$(len($archs))];
+ const int $(kern.name)_impl_deps[$(len($archs))];
+ const bool $(kern.name)_impl_alignment[$(len($archs))];
+ const $(kern.pname) $(kern.name)_impls[$(len($archs))];
+ const size_t $(kern.name)_n_impls;
+ #end for
+};
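+
+/* For each kernel the members above form parallel arrays: impl_names[i],
+ * impl_deps[i], impl_alignment[i] and impls[i] all describe the same
+ * implementation, and <kernel>_n_impls says how many of the array slots are
+ * actually populated for this machine. */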
+
+#for $machine in $machines
+#ifdef LV_MACHINE_$(machine.name.upper())
+extern struct volk_machine volk_machine_$(machine.name);
+#endif
+#end for
+
+__VOLK_DECL_END
+
+#endif //INCLUDED_LIBVOLK_MACHINES_H
--- /dev/null
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef INCLUDED_VOLK_TYPEDEFS
+#define INCLUDED_VOLK_TYPEDEFS
+
+#include <inttypes.h>
+#include <volk/volk_complex.h>
+
+#for $kern in $kernels
+typedef void (*$(kern.pname))($kern.arglist_types);
+#end for
+
+#endif /*INCLUDED_VOLK_TYPEDEFS*/
--- /dev/null
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+LV_CXXFLAGS=@LV_CXXFLAGS@
+
+
+Name: volk
+Description: VOLK: Vector Optimized Library of Kernels
+Requires:
+Version: @LIBVER@
+Libs: -L${libdir} -lvolk
+Cflags: -I${includedir} ${LV_CXXFLAGS}
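+
+# Typical consumer usage (illustrative):
+#   pkg-config --cflags --libs volk
+# or, from CMake, find_package(PkgConfig) followed by pkg_check_modules(VOLK volk).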